diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh
index 37b70c58062b9..2986925f8820f 100755
--- a/build_tools/azure/install.sh
+++ b/build_tools/azure/install.sh
@@ -64,7 +64,7 @@ elif [[ "$DISTRIB" == "conda-pip-latest" ]]; then
     make_conda "python=$PYTHON_VERSION"
     python -m pip install -U pip
-    python -m pip install pandas matplotlib pyamg scikit-image
+    python -m pip install pandas matplotlib pyamg scikit-image xarray sparse
     # do not install dependencies for lightgbm since it requires scikit-learn
     # and install a version less than 3.0.0 until the issue #18316 is solved.
     python -m pip install "lightgbm<3.0.0" --no-deps
diff --git a/sklearn/_config.py b/sklearn/_config.py
index f77d6582d0d04..af42ac1ef3347 100644
--- a/sklearn/_config.py
+++ b/sklearn/_config.py
@@ -8,6 +8,7 @@
     'working_memory': int(os.environ.get('SKLEARN_WORKING_MEMORY', 1024)),
     'print_changed_only': True,
     'display': 'text',
+    'array_out': 'default',
 }
 
 
@@ -28,7 +29,7 @@ def get_config():
 
 
 def set_config(assume_finite=None, working_memory=None,
-               print_changed_only=None, display=None):
+               print_changed_only=None, display=None, array_out=None):
     """Set global scikit-learn configuration
 
     .. versionadded:: 0.19
@@ -67,6 +68,9 @@ def set_config(assume_finite=None, working_memory=None,
 
         .. versionadded:: 0.23
 
+    array_out : {'default', 'pandas', 'xarray'}, optional
+        Kind of array output for transformers.
+
     See Also
     --------
     config_context: Context manager for global scikit-learn configuration
@@ -80,6 +84,8 @@ def set_config(assume_finite=None, working_memory=None,
         _global_config['print_changed_only'] = print_changed_only
     if display is not None:
         _global_config['display'] = display
+    if array_out is not None:
+        _global_config['array_out'] = array_out
 
 
 @contextmanager
diff --git a/sklearn/base.py b/sklearn/base.py
index f7a3116f221c4..21cca90846635 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -6,6 +6,7 @@
 import copy
 import warnings
 from collections import defaultdict
+from functools import partial
 import platform
 import inspect
 import re
@@ -19,6 +20,9 @@
 from .utils.validation import check_array
 from .utils._estimator_html_repr import estimator_html_repr
 from .utils.validation import _deprecate_positional_args
+from .utils._array_out import _get_feature_names
+from .utils._array_out import _make_array_out
+
 
 _DEFAULT_TAGS = {
     'non_deterministic': False,
@@ -377,6 +381,33 @@ def _check_n_features(self, X, reset):
                 self.n_features_in_)
             )
 
+    def _check_feature_names(self, X, reset=True):
+        """Set the `feature_names_in_` attribute, or check against it.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input samples.
+        reset : bool, default=True
+            If True, the `feature_names_in_` attribute is set to the feature
+            names of `X`.
+            Otherwise, the attribute must already exist and the function
+            checks that it is equal to the feature names of `X`.
+        """
+        feature_names = _get_feature_names(X)
+        if reset:
+            self.feature_names_in_ = feature_names
+            return
+
+        if (not hasattr(self, 'feature_names_in_') or
+                self.feature_names_in_ is None or
+                feature_names is None):
+            return
+
+        if np.any(feature_names != self.feature_names_in_):
+            raise ValueError("The feature names of X do not match the "
+                             "feature_names_in_ attribute")
+
     def _validate_data(self, X, y=None, reset=True,
                        validate_separately=False, **check_params):
         """Validate input data and set or check the `n_features_in_` attribute.
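Review note: the `_config.py` and `base.py` hunks above are the core of this PR, a global `array_out` option plus a `feature_names_in_` attribute that `_validate_data` now records and checks. A minimal sketch of the intended usage, assuming this branch and pandas are installed (the actual wrapping lives in the new `sklearn/utils/_array_out.py` helpers, which are not part of this excerpt):

```python
import pandas as pd
from sklearn import config_context
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})

scaler = StandardScaler()
with config_context(array_out='pandas'):
    out = scaler.fit_transform(df)

# StandardScaler passes 'one_to_one' to _make_array_out, so the output
# should keep the input's column names and index, and fit records the names:
print(out.columns.tolist())      # expected: ['a', 'b']
print(scaler.feature_names_in_)  # expected: array(['a', 'b'], dtype=object)
```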
@@ -407,7 +438,7 @@ def _validate_data(self, X, y=None, reset=True,
         out : {ndarray, sparse matrix} or tuple of these
             The validated input. A tuple is returned if `y` is not None.
         """
-
+        self._check_feature_names(X, reset=reset)
         if y is None:
             if self._get_tags()['requires_y']:
                 raise ValueError(
@@ -462,6 +493,74 @@ def _repr_mimebundle_(self, **kwargs):
             output["text/html"] = estimator_html_repr(self)
         return output
 
+    def _make_array_out(self, X_out, X_orig, get_feature_names_out):
+        """Construct an array container based on the global configuration.
+
+        Parameters
+        ----------
+        X_out : {ndarray, sparse matrix} of shape (n_samples, n_features_out)
+            Output data to be wrapped.
+
+        X_orig : array-like of shape (n_samples, n_features)
+            Original input data. For pandas DataFrames, this is used to get
+            the index. For xarray DataArrays, this is used to get the names
+            of the dims and the coordinates for the first dim.
+
+        get_feature_names_out : callable or {'one_to_one', 'class_name'}
+            Called to get the output feature names. If 'one_to_one', the
+            input feature names are used as the output feature names. If
+            'class_name', the lowercased class name is used as a prefix for
+            generated output feature names.
+
+        Returns
+        -------
+        array_out : {ndarray, sparse matrix, dataframe, dataarray} of shape \
+                (n_samples, n_features_out)
+            Wrapped array with feature names.
+        """
+        array_out = get_config()['array_out']
+        if array_out == 'default':
+            return X_out
+
+        # TODO This can be removed when all estimators use `_validate_data`
+        # in transform to check for feature names
+        self._check_feature_names(X_orig, reset=False)
+
+        if callable(get_feature_names_out):
+            get_feature_names_out_callable = get_feature_names_out
+        elif get_feature_names_out == 'one_to_one':
+            def get_feature_names_out_callable(names):
+                return names
+        else:
+            # get_feature_names_out == 'class_name'
+            class_name = self.__class__.__name__.lower()
+
+            def get_feature_names_out_callable():
+                return np.array([f"{class_name}{i}"
+                                 for i in range(X_out.shape[1])])
+
+        # The feature names callable can take zero or one arguments. For
+        # one argument, it is passed the input feature names.
+        parameters = (inspect.signature(get_feature_names_out_callable)
+                      .parameters)
+        if parameters:
+            if hasattr(self, "feature_names_in_"):
+                feature_names_in = self.feature_names_in_
+            else:
+                # If there is no feature_names_in_ attribute, use the
+                # feature names of the input
+                feature_names_in = _get_feature_names(X_orig)
+
+            # If there are still no feature names at this point, generate
+            # names for the input features
+            if feature_names_in is None:
+                feature_names_in = np.array(
+                    [f'X{i}' for i in range(self.n_features_in_)])
+            get_feature_names_out_callable = partial(
+                get_feature_names_out_callable, feature_names_in)
+
+        return _make_array_out(X_out, X_orig, get_feature_names_out_callable)
+
 
 class ClassifierMixin:
     """Mixin class for all classifiers in scikit-learn."""
diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py
index 98ca69ea3dd33..23e6983849ab0 100644
--- a/sklearn/cluster/_agglomerative.py
+++ b/sklearn/cluster/_agglomerative.py
@@ -1100,8 +1100,10 @@ def fit(self, X, y=None, **params):
         # save n_features_in_ attribute here to reset it after, because it will
         # be overridden in AgglomerativeClustering since we passed it X.T.
n_features_in_ = self.n_features_in_ + feature_names_in_ = self.feature_names_in_ AgglomerativeClustering.fit(self, X.T, **params) self.n_features_in_ = n_features_in_ + self.feature_names_in_ = feature_names_in_ return self @property diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py index b1f046d51c22e..dbc53e2dedbf5 100644 --- a/sklearn/cluster/_birch.py +++ b/sklearn/cluster/_birch.py @@ -614,7 +614,8 @@ def transform(self, X): Transformed data. """ check_is_fitted(self) - return euclidean_distances(X, self.subcluster_centers_) + out = euclidean_distances(X, self.subcluster_centers_) + return self._make_array_out(out, X, 'class_name') def _global_clustering(self, X=None): """ diff --git a/sklearn/cluster/_feature_agglomeration.py b/sklearn/cluster/_feature_agglomeration.py index 1366971466f6a..3dbf2f922aef6 100644 --- a/sklearn/cluster/_feature_agglomeration.py +++ b/sklearn/cluster/_feature_agglomeration.py @@ -37,6 +37,7 @@ def transform(self, X): The pooled values for each feature cluster. """ check_is_fitted(self) + X_orig = X X = check_array(X) if len(self.labels_) != X.shape[1]: @@ -52,7 +53,7 @@ def transform(self, X): nX = [self.pooling_func(X[:, self.labels_ == l], axis=1) for l in np.unique(self.labels_)] nX = np.array(nX).T - return nX + return self._make_array_out(nX, X_orig, 'class_name') def inverse_transform(self, Xred): """ diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py index 5b959e0f048d2..d4748528158bd 100644 --- a/sklearn/cluster/_kmeans.py +++ b/sklearn/cluster/_kmeans.py @@ -1073,7 +1073,8 @@ def fit_transform(self, X, y=None, sample_weight=None): # np.array or CSR format already. # XXX This skips _check_test_data, which may change the dtype; # we should refactor the input validation. - return self.fit(X, sample_weight=sample_weight)._transform(X) + out = self.fit(X, sample_weight=sample_weight)._transform(X) + return self._make_array_out(out, X, 'class_name') def transform(self, X): """Transform X to a cluster-distance space. @@ -1093,9 +1094,9 @@ def transform(self, X): X transformed in the new space. 
""" check_is_fitted(self) - + X_orig = X X = self._check_test_data(X) - return self._transform(X) + return self._make_array_out(self._transform(X), X_orig, 'class_name') def _transform(self, X): """guts of transform method; no input validation""" diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py index 3a731e7f3881a..03ce16da5c64e 100644 --- a/sklearn/compose/_column_transformer.py +++ b/sklearn/compose/_column_transformer.py @@ -16,6 +16,7 @@ from ..base import clone, TransformerMixin from ..utils._estimator_html_repr import _VisualBlock +from ..utils._array_out import _get_feature_names from ..pipeline import _fit_transform_one, _transform_one, _name_estimators from ..preprocessing import FunctionTransformer from ..utils import Bunch @@ -25,6 +26,7 @@ from ..utils.metaestimators import _BaseComposition from ..utils.validation import check_array, check_is_fitted from ..utils.validation import _deprecate_positional_args +from .._config import get_config __all__ = [ @@ -424,7 +426,7 @@ def _validate_features(self, n_features, feature_names): TODO: It should raise an error from v0.24 """ - if ((self._feature_names_in is None or feature_names is None) + if ((self.feature_names_in_ is None or feature_names is None) and self._n_features == n_features): return @@ -439,7 +441,7 @@ def _validate_features(self, n_features, feature_names): "transform have the same number of columns.") if (self._n_features != n_features or - np.any(self._feature_names_in != np.asarray(feature_names))): + np.any(self.feature_names_in_ != np.asarray(feature_names))): warnings.warn("Given feature/column names or counts do not match " "the ones for the data given during fit. This will " "fail from v0.24.", @@ -468,7 +470,8 @@ def _fit_transform(self, X, y, func, fitted=False): y=y, weight=weight, message_clsname='ColumnTransformer', - message=self._log_message(name, idx, len(transformers))) + message=self._log_message(name, idx, len(transformers)), + config=get_config()) for idx, (name, trans, column, weight) in enumerate( self._iter(fitted=fitted, replace_strings=True), 1)) except ValueError as e: @@ -522,11 +525,8 @@ def fit_transform(self, X, y=None): sparse matrices. 
""" - # TODO: this should be `feature_names_in_` when we start having it - if hasattr(X, "columns"): - self._feature_names_in = np.asarray(X.columns) - else: - self._feature_names_in = None + self._check_feature_names(X) + X_orig = X X = _check_X(X) # set n_features_in_ attribute self._check_n_features(X, reset=True) @@ -544,19 +544,49 @@ def fit_transform(self, X, y=None): Xs, transformers = zip(*result) # determine if concatenated output will be sparse or not - if any(sparse.issparse(X) for X in Xs): - nnz = sum(X.nnz if sparse.issparse(X) else X.size for X in Xs) - total = sum(X.shape[0] * X.shape[1] if sparse.issparse(X) - else X.size for X in Xs) - density = nnz / total - self.sparse_output_ = density < self.sparse_threshold - else: - self.sparse_output_ = False - + self._check_sparse_output(Xs) self._update_fitted_transformers(transformers) self._validate_output(Xs) - return self._hstack(list(Xs)) + output = self._hstack(Xs) + return self._make_array_out(output, Xs, X_orig) + + def _check_sparse_output(self, Xs): + def _get_Xtype(X): + # pandas sparse dataframe + if hasattr(X, "iloc") and hasattr(X, "sparse"): + return 'pd' + # xarray sparse + if hasattr(X, 'data') and hasattr(X.data, 'to_scipy_sparse'): + return 'xr' + if sparse.issparse(X): + return 'sp' + return 'dense' + + Xs_types = [(X, _get_Xtype(X)) for X in Xs] + + # all dense + if all(X_type == 'dense' for _, X_type in Xs_types): + self.sparse_output_ = False + return + + nnz = 0.0 + total = 0.0 + for X, X_type in Xs_types: + if X_type == 'pd': + nnz += X.sparse.density * X.size + total += X.size + elif X_type == 'sp': + nnz += X.nnz + total += np.prod(X.shape) + elif X_type == 'xr': + nnz += X.data.nnz + total += X.data.size + else: + nnz += X.size + total += X.size + density = nnz / total + self.sparse_output_ = density < self.sparse_threshold def transform(self, X): """Transform X separately by each transformer, concatenate results. @@ -577,6 +607,7 @@ def transform(self, X): """ check_is_fitted(self) + X_orig = X X = _check_X(X) if hasattr(X, "columns"): X_feature_names = np.asarray(X.columns) @@ -614,7 +645,25 @@ def transform(self, X): # All transformers are None return np.zeros((X.shape[0], 0)) - return self._hstack(list(Xs)) + output = self._hstack(Xs) + return self._make_array_out(output, Xs, X_orig) + + def _make_array_out(self, X_out, Xs, X_orig): + def get_feature_names_out(): + transformer_names = (name for name, _, _, _ in self._iter()) + feature_names = [] + for X, trans_name in zip(Xs, transformer_names): + inner_names = _get_feature_names(X) + inner_names = [f'{trans_name}_{name}' for name in inner_names] + feature_names.append(inner_names) + + feature_names_out = np.concatenate(feature_names) + if feature_names_out.size != X_out.shape[1]: + return None + return feature_names_out + + return super()._make_array_out(X_out, X_orig, + get_feature_names_out) def _hstack(self, Xs): """Stacks Xs horizontally. 
@@ -643,8 +692,16 @@ def _hstack(self, Xs): return sparse.hstack(converted_Xs).tocsr() else: - Xs = [f.toarray() if sparse.issparse(f) else f for f in Xs] - return np.hstack(Xs) + output = [] + for X in Xs: + # xarray sparse + if hasattr(X, 'coords') and hasattr(X.data, "todense"): + output.append(X.data.todense()) + elif sparse.issparse(X): + output.append(X.toarray()) + else: + output.append(X) + return np.hstack(output) def _sk_visual_block_(self): if isinstance(self.remainder, str) and self.remainder == 'drop': diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py index 4e58769e244c7..a362b5d94e6f1 100644 --- a/sklearn/compose/tests/test_column_transformer.py +++ b/sklearn/compose/tests/test_column_transformer.py @@ -16,6 +16,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.base import BaseEstimator +from sklearn import config_context from sklearn.compose import ( ColumnTransformer, make_column_transformer, make_column_selector ) @@ -1433,3 +1434,32 @@ def test_sk_visual_block_remainder_fitted_numpy(remainder): assert visual_block.names == ('scale', 'remainder') assert visual_block.name_details == ([0, 2], [1]) assert visual_block.estimators == (scaler, remainder) + + +@pytest.mark.parametrize("n_jobs", [1, 2]) +def test_column_transformer_array_out_pandas(n_jobs): + # simple check for column transformer and array_out + pd = pytest.importorskip('pandas') + df = pd.DataFrame({"cat": ["a", "b", "c"], + "num": [1, 2, 3], + 'more': [3, 4, 5]}) + ct = ColumnTransformer(transformers=[ + ('scale', StandardScaler(), ['num']), + ('encode', OneHotEncoder(), ['cat']) + ], n_jobs=n_jobs) + with config_context(array_out='pandas'): + df_out = ct.fit_transform(df) + assert_array_equal(df_out.columns, + ['scale_num', 'encode_cat_a', + 'encode_cat_b', 'encode_cat_c']) + + ct = ColumnTransformer(transformers=[ + ('scale', StandardScaler(), ['num']), + ('encode', OneHotEncoder(), ['cat']) + ], n_jobs=n_jobs, remainder='passthrough') + with config_context(array_out='pandas'): + df_out = ct.fit_transform(df) + assert_array_equal(df_out.columns, + ['scale_num', 'encode_cat_a', + 'encode_cat_b', 'encode_cat_c', + 'remainder_more']) diff --git a/sklearn/cross_decomposition/_pls.py b/sklearn/cross_decomposition/_pls.py index 70d50397021ef..eb4aa1e3450c9 100644 --- a/sklearn/cross_decomposition/_pls.py +++ b/sklearn/cross_decomposition/_pls.py @@ -316,12 +316,15 @@ def transform(self, X, Y=None, copy=True): x_scores if Y is not given, (x_scores, y_scores) otherwise. 
""" check_is_fitted(self) + X_orig = X X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) # Normalize X -= self.x_mean_ X /= self.x_std_ # Apply rotation x_scores = np.dot(X, self.x_rotations_) + + x_scores = self._make_array_out(x_scores, X_orig, 'class_name') if Y is not None: Y = check_array(Y, ensure_2d=False, copy=copy, dtype=FLOAT_DTYPES) if Y.ndim == 1: @@ -925,9 +928,11 @@ def transform(self, X, Y=None): (X_transformed, Y_transformed) otherwise """ check_is_fitted(self) + X_orig = X X = check_array(X, dtype=np.float64) Xr = (X - self.x_mean_) / self.x_std_ x_scores = np.dot(Xr, self.x_weights_) + x_scores = self._make_array_out(x_scores, X_orig, 'class_name') if Y is not None: Y = check_array(Y, ensure_2d=False, dtype=np.float64) if Y.ndim == 1: diff --git a/sklearn/decomposition/_base.py b/sklearn/decomposition/_base.py index e89a05051404b..d16bef85eb81a 100644 --- a/sklearn/decomposition/_base.py +++ b/sklearn/decomposition/_base.py @@ -123,6 +123,7 @@ def transform(self, X): >>> ipca.transform(X) # doctest: +SKIP """ check_is_fitted(self) + X_orig = X X = check_array(X) if self.mean_ is not None: @@ -130,7 +131,8 @@ def transform(self, X): X_transformed = np.dot(X, self.components_.T) if self.whiten: X_transformed /= np.sqrt(self.explained_variance_) - return X_transformed + + return self._make_array_out(X_transformed, X_orig, 'class_name') def inverse_transform(self, X): """Transform data back to its original space. diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py index 7c804b10e8ec1..5293e097044ac 100644 --- a/sklearn/decomposition/_dict_learning.py +++ b/sklearn/decomposition/_dict_learning.py @@ -906,6 +906,7 @@ def __init__(self, transform_algorithm, transform_n_nonzero_coefs, def _transform(self, X, dictionary): """Private method allowing to accomodate both DictionaryLearning and SparseCoder.""" + X_orig = X X = check_array(X) code = sparse_encode( @@ -922,7 +923,7 @@ def _transform(self, X, dictionary): split_code[:, n_features:] = -np.minimum(code, 0) code = split_code - return code + return self._make_array_out(code, X_orig, 'class_name') def transform(self, X): """Encode the data as a sparse combination of the dictionary atoms. diff --git a/sklearn/decomposition/_factor_analysis.py b/sklearn/decomposition/_factor_analysis.py index 180a3fbcc858b..287f6b75607f4 100644 --- a/sklearn/decomposition/_factor_analysis.py +++ b/sklearn/decomposition/_factor_analysis.py @@ -278,7 +278,7 @@ def transform(self, X): The latent variables of X. """ check_is_fitted(self) - + X_orig = X X = check_array(X) Ih = np.eye(len(self.components_)) @@ -289,7 +289,7 @@ def transform(self, X): tmp = np.dot(X_transformed, Wpsi.T) X_transformed = np.dot(tmp, cov_z) - return X_transformed + return self._make_array_out(X_transformed, X_orig, 'class_name') def get_covariance(self): """Compute data covariance with the FactorAnalysis model. diff --git a/sklearn/decomposition/_fastica.py b/sklearn/decomposition/_fastica.py index e2a4cd55058e7..c715aec378036 100644 --- a/sklearn/decomposition/_fastica.py +++ b/sklearn/decomposition/_fastica.py @@ -557,7 +557,8 @@ def fit_transform(self, X, y=None): ------- X_new : ndarray of shape (n_samples, n_components) """ - return self._fit(X, compute_sources=True) + out = self._fit(X, compute_sources=True) + return self._make_array_out(out, X, 'class_name') def fit(self, X, y=None): """Fit the model to X. 
@@ -594,12 +595,14 @@ def transform(self, X, copy=True): X_new : ndarray of shape (n_samples, n_components) """ check_is_fitted(self) + X_orig = X X = check_array(X, copy=copy, dtype=FLOAT_DTYPES) if self.whiten: X -= self.mean_ - return np.dot(X, self.components_.T) + return self._make_array_out(np.dot(X, self.components_.T), X_orig, + 'class_name') def inverse_transform(self, X, copy=True): """Transform the sources back to the mixed data (apply mixing matrix). diff --git a/sklearn/decomposition/_kernel_pca.py b/sklearn/decomposition/_kernel_pca.py index debd697ddf6a5..a5374471f856c 100644 --- a/sklearn/decomposition/_kernel_pca.py +++ b/sklearn/decomposition/_kernel_pca.py @@ -312,7 +312,7 @@ def fit_transform(self, X, y=None, **params): if self.fit_inverse_transform: self._fit_inverse_transform(X_transformed, X) - return X_transformed + return self._make_array_out(X_transformed, X, 'class_name') def transform(self, X): """Transform X. @@ -337,7 +337,7 @@ def transform(self, X): / np.sqrt(self.lambdas_[non_zeros])) # Project with a scalar product between K and the scaled eigenvectors - return np.dot(K, scaled_alphas) + return self._make_array_out(np.dot(K, scaled_alphas), X, 'class_name') def inverse_transform(self, X): """Transform X back to original space. diff --git a/sklearn/decomposition/_lda.py b/sklearn/decomposition/_lda.py index d3c6fe2e57cd4..ac019615bf6ec 100644 --- a/sklearn/decomposition/_lda.py +++ b/sklearn/decomposition/_lda.py @@ -632,7 +632,7 @@ def _unnormalized_transform(self, X): # make sure feature size is the same in fitted model and in X X = self._check_non_neg_array( - X, reset_n_features=True, + X, reset_n_features=False, whom="LatentDirichletAllocation.transform") n_samples, n_features = X.shape if n_features != self.components_.shape[1]: @@ -664,7 +664,7 @@ def transform(self, X): """ doc_topic_distr = self._unnormalized_transform(X) doc_topic_distr /= doc_topic_distr.sum(axis=1)[:, np.newaxis] - return doc_topic_distr + return self._make_array_out(doc_topic_distr, X, 'class_name') def _approx_bound(self, X, doc_topic_distr, sub_sampling): """Estimate the variational bound. @@ -757,7 +757,7 @@ def score(self, X, y=None): score : float Use approximate bound as score. """ - X = self._check_non_neg_array(X, reset_n_features=True, + X = self._check_non_neg_array(X, reset_n_features=False, whom="LatentDirichletAllocation.score") doc_topic_distr = self._unnormalized_transform(X) @@ -789,7 +789,7 @@ def _perplexity_precomp_distr(self, X, doc_topic_distr=None, check_is_fitted(self) X = self._check_non_neg_array( - X, reset_n_features=True, + X, reset_n_features=False, whom="LatentDirichletAllocation.perplexity") if doc_topic_distr is None: diff --git a/sklearn/decomposition/_nmf.py b/sklearn/decomposition/_nmf.py index 4eaf9c29e5703..6cf610ffb6136 100644 --- a/sklearn/decomposition/_nmf.py +++ b/sklearn/decomposition/_nmf.py @@ -1296,6 +1296,7 @@ def fit_transform(self, X, y=None, W=None, H=None): W : ndarray of shape (n_samples, n_components) Transformed data. """ + X_orig = X X = self._validate_data(X, accept_sparse=('csr', 'csc'), dtype=[np.float64, np.float32]) @@ -1314,7 +1315,7 @@ def fit_transform(self, X, y=None, W=None, H=None): self.components_ = H self.n_iter_ = n_iter_ - return W + return self._make_array_out(W, X_orig, 'class_name') def fit(self, X, y=None, **params): """Learn a NMF model for the data X. @@ -1347,6 +1348,7 @@ def transform(self, X): Transformed data. 
""" check_is_fitted(self) + X_orig = X W, _, n_iter_ = non_negative_factorization( X=X, W=None, H=self.components_, n_components=self.n_components_, @@ -1357,7 +1359,7 @@ def transform(self, X): random_state=self.random_state, verbose=self.verbose, shuffle=self.shuffle) - return W + return self._make_array_out(W, X_orig, 'class_name') def inverse_transform(self, W): """Transform data back to its original space. diff --git a/sklearn/decomposition/_pca.py b/sklearn/decomposition/_pca.py index 69cc8f159b9f2..d20a3f3731368 100644 --- a/sklearn/decomposition/_pca.py +++ b/sklearn/decomposition/_pca.py @@ -373,6 +373,7 @@ def fit_transform(self, X, y=None): This method returns a Fortran-ordered array. To convert it to a C-ordered array, use 'np.ascontiguousarray'. """ + X_orig = X U, S, Vt = self._fit(X) U = U[:, :self.n_components_] @@ -383,7 +384,7 @@ def fit_transform(self, X, y=None): # X_new = X * V = U * S * Vt * V = U * S U *= S[:self.n_components_] - return U + return self._make_array_out(U, X_orig, 'class_name') def _fit(self, X): """Dispatch to the right submethod depending on the chosen solver.""" diff --git a/sklearn/decomposition/_sparse_pca.py b/sklearn/decomposition/_sparse_pca.py index f477c08a739c8..ed17ad1940701 100644 --- a/sklearn/decomposition/_sparse_pca.py +++ b/sklearn/decomposition/_sparse_pca.py @@ -196,14 +196,14 @@ def transform(self, X): Transformed data. """ check_is_fitted(self) - + X_orig = X X = check_array(X) X = X - self.mean_ U = ridge_regression(self.components_.T, X.T, self.ridge_alpha, solver='cholesky') - return U + return self._make_array_out(U, X_orig, 'class_name') class MiniBatchSparsePCA(SparsePCA): diff --git a/sklearn/decomposition/_truncated_svd.py b/sklearn/decomposition/_truncated_svd.py index 5ff6f6ec010da..a91ee368e9dec 100644 --- a/sklearn/decomposition/_truncated_svd.py +++ b/sklearn/decomposition/_truncated_svd.py @@ -160,6 +160,7 @@ def fit_transform(self, X, y=None): X_new : ndarray of shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ + X_orig = X X = self._validate_data(X, accept_sparse=['csr', 'csc'], ensure_min_features=2) random_state = check_random_state(self.random_state) @@ -196,7 +197,7 @@ def fit_transform(self, X, y=None): self.explained_variance_ratio_ = exp_var / full_var self.singular_values_ = Sigma # Store the singular values. - return X_transformed + return self._make_array_out(X_transformed, X_orig, 'class_name') def transform(self, X): """Perform dimensionality reduction on X. @@ -211,9 +212,11 @@ def transform(self, X): X_new : ndarray of shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ + X_orig = X X = check_array(X, accept_sparse=['csr', 'csc']) check_is_fitted(self) - return safe_sparse_dot(X, self.components_.T) + return self._make_array_out(safe_sparse_dot(X, self.components_.T), + X_orig, 'class_name') def inverse_transform(self, X): """Transform X back to its original space. 
diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py index b4935306fa0fa..84cef0767a2ae 100644 --- a/sklearn/discriminant_analysis.py +++ b/sklearn/discriminant_analysis.py @@ -495,13 +495,15 @@ def transform(self, X): "solver (use 'svd' or 'eigen').") check_is_fitted(self) + X_orig = X X = check_array(X) if self.solver == 'svd': X_new = np.dot(X - self.xbar_, self.scalings_) elif self.solver == 'eigen': X_new = np.dot(X, self.scalings_) - return X_new[:, :self._max_components] + return self._make_array_out(X_new[:, :self._max_components], X_orig, + 'class_name') def predict_proba(self, X): """Estimate probability. diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 4e0efcee13b27..6948a830af1a7 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1185,6 +1185,7 @@ def fit_transform(self, raw_documents, y=None): self._validate_params() self._validate_vocabulary() + max_df = self.max_df min_df = self.min_df max_features = self.max_features @@ -1216,7 +1217,7 @@ def fit_transform(self, raw_documents, y=None): X = self._sort_features(X, vocabulary) self.vocabulary_ = vocabulary - return X + return self._make_array_out(X, raw_documents, self.get_feature_names) def transform(self, raw_documents): """Transform documents to document-term matrix. @@ -1239,12 +1240,13 @@ def transform(self, raw_documents): "Iterable over raw text documents expected, " "string object received.") self._check_vocabulary() - # use the same matrix-building strategy as fit_transform _, X = self._count_vocab(raw_documents, fixed_vocab=True) + if self.binary: X.data.fill(1) - return X + + return self._make_array_out(X, raw_documents, self.get_feature_names) def inverse_transform(self, X): """Return terms per document with nonzero entries in X. @@ -1423,7 +1425,7 @@ def fit(self, X, y=None): X : sparse matrix of shape n_samples, n_features) A matrix of term/token counts. """ - X = check_array(X, accept_sparse=('csr', 'csc')) + X = self._validate_data(X, accept_sparse=('csr', 'csc')) if not sp.issparse(X): X = sp.csr_matrix(X) dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64 @@ -1463,6 +1465,7 @@ def transform(self, X, copy=True): ------- vectors : sparse matrix of shape (n_samples, n_features) """ + X_orig = X X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy) if not sp.issparse(X): X = sp.csr_matrix(X, dtype=np.float64) @@ -1491,7 +1494,7 @@ def transform(self, X, copy=True): if self.norm: X = normalize(X, norm=self.norm, copy=False) - return X + return self._make_array_out(X, X_orig, 'one_to_one') @property def idf_(self): diff --git a/sklearn/feature_selection/_base.py b/sklearn/feature_selection/_base.py index a5d752cb3f4b6..9249c168130b4 100644 --- a/sklearn/feature_selection/_base.py +++ b/sklearn/feature_selection/_base.py @@ -74,6 +74,7 @@ def transform(self, X): X_r : array of shape [n_samples, n_selected_features] The input samples with only the selected features. 
""" + X_orig = X tags = self._get_tags() X = check_array(X, dtype=None, accept_sparse='csr', force_all_finite=not tags.get('allow_nan', True)) @@ -82,10 +83,18 @@ def transform(self, X): warn("No features were selected: either the data is" " too noisy or the selection test too strict.", UserWarning) - return np.empty(0).reshape((X.shape[0], 0)) + out = np.empty(0).reshape((X.shape[0], 0)) + return self._make_array_out(out, X_orig, 'class_name') if len(mask) != X.shape[1]: raise ValueError("X has a different shape than during fitting.") - return X[:, safe_mask(X, mask)] + + _safe_mask = safe_mask(X, mask) + + def get_feature_names_out(feature_names_in): + return feature_names_in[_safe_mask] + + out = X[:, _safe_mask] + return self._make_array_out(out, X_orig, get_feature_names_out) def inverse_transform(self, X): """ diff --git a/sklearn/feature_selection/_from_model.py b/sklearn/feature_selection/_from_model.py index 17fb5425e67af..d7ac3f66ce98e 100644 --- a/sklearn/feature_selection/_from_model.py +++ b/sklearn/feature_selection/_from_model.py @@ -229,6 +229,7 @@ def fit(self, X, y=None, **fit_params): if self.prefit: raise NotFittedError( "Since 'prefit=True', call transform directly") + self._check_feature_names(X) self.estimator_ = clone(self.estimator) self.estimator_.fit(X, y, **fit_params) return self @@ -264,7 +265,9 @@ def partial_fit(self, X, y=None, **fit_params): raise NotFittedError( "Since 'prefit=True', call transform directly") if not hasattr(self, "estimator_"): + self._check_feature_names(X) self.estimator_ = clone(self.estimator) + self.estimator_.partial_fit(X, y, **fit_params) return self diff --git a/sklearn/feature_selection/tests/test_base.py b/sklearn/feature_selection/tests/test_base.py index 9515bdc32c600..b198740ae2b98 100644 --- a/sklearn/feature_selection/tests/test_base.py +++ b/sklearn/feature_selection/tests/test_base.py @@ -6,7 +6,6 @@ from sklearn.base import BaseEstimator from sklearn.feature_selection._base import SelectorMixin -from sklearn.utils import check_array class StepSelector(SelectorMixin, BaseEstimator): @@ -15,7 +14,7 @@ def __init__(self, step=2): self.step = step def fit(self, X, y=None): - X = check_array(X, accept_sparse='csc') + X = self._validate_data(X, accept_sparse='csc') self.n_input_feats = X.shape[1] return self diff --git a/sklearn/impute/_base.py b/sklearn/impute/_base.py index e1334d1980fa0..40c1a21d33160 100644 --- a/sklearn/impute/_base.py +++ b/sklearn/impute/_base.py @@ -4,6 +4,7 @@ import numbers import warnings +from functools import partial import numpy as np import numpy.ma as ma @@ -110,6 +111,15 @@ def _concatenate_indicator(self, X_imputed, X_indicator): return hstack((X_imputed, X_indicator)) + def _get_feature_names_out(self, feature_names_in, valid_mask): + imputed_names = feature_names_in[valid_mask] + if self.indicator_ is None: + return imputed_names + + indicator_names = self.indicator_._get_feature_names_out( + feature_names_in) + return np.r_[imputed_names, indicator_names] + def _more_tags(self): return {'allow_nan': is_scalar_nan(self.missing_values)} @@ -424,6 +434,7 @@ def transform(self, X): The input data to complete. 
""" check_is_fitted(self) + X_orig = X X = self._validate_input(X, in_fit=False) statistics = self.statistics_ @@ -486,7 +497,10 @@ def transform(self, X): X_indicator = super()._transform_indicator(missing_mask) - return super()._concatenate_indicator(X, X_indicator) + out = super()._concatenate_indicator(X, X_indicator) + get_feature_names_out = partial(self._get_feature_names_out, + valid_mask=valid_mask) + return self._make_array_out(out, X_orig, get_feature_names_out) def inverse_transform(self, X): """Convert the data back to the original representation. @@ -790,6 +804,7 @@ def transform(self, X): """ check_is_fitted(self) + X_orig = X # Need not validate X again as it would have already been validated # in the Imputer calling MissingIndicator if not self._precomputed: @@ -815,7 +830,8 @@ def transform(self, X): if self.features_.size < self._n_features: imputer_mask = imputer_mask[:, self.features_] - return imputer_mask + return self._make_array_out(imputer_mask, X_orig, + self._get_feature_names_out) def fit_transform(self, X, y=None): """Generate missing values indicator for X. @@ -838,7 +854,19 @@ def fit_transform(self, X, y=None): if self.features_.size < self._n_features: imputer_mask = imputer_mask[:, self.features_] - return imputer_mask + return self._make_array_out(imputer_mask, X, + self._get_feature_names_out) + + def _get_feature_names_out(self, feature_names_in): + if feature_names_in is None: + return None + if self.features_.size < self._n_features: + feature_names_in = feature_names_in[self.features_] + + class_name = self.__class__.__name__.lower() + feature_names_in = np.array([f'{class_name}_{name}' + for name in feature_names_in]) + return feature_names_in def _more_tags(self): return {'allow_nan': True, diff --git a/sklearn/impute/_iterative.py b/sklearn/impute/_iterative.py index 46c0dea06cbcd..42eebe10c403b 100644 --- a/sklearn/impute/_iterative.py +++ b/sklearn/impute/_iterative.py @@ -2,11 +2,13 @@ from time import time from collections import namedtuple import warnings +from functools import partial from scipy import stats import numpy as np from ..base import clone +from .._config import config_context from ..exceptions import ConvergenceWarning from ..preprocessing import normalize from ..utils import (check_array, check_random_state, _safe_indexing, @@ -511,16 +513,18 @@ def _initial_imputation(self, X): missing_values=self.missing_values, strategy=self.initial_strategy ) - X_filled = self.initial_imputer_.fit_transform(X) + with config_context(array_out='default'): + X_filled = self.initial_imputer_.fit_transform(X) else: - X_filled = self.initial_imputer_.transform(X) + with config_context(array_out='default'): + X_filled = self.initial_imputer_.transform(X) valid_mask = np.flatnonzero(np.logical_not( np.isnan(self.initial_imputer_.statistics_))) Xt = X[:, valid_mask] mask_missing_values = mask_missing_values[:, valid_mask] - return Xt, X_filled, mask_missing_values, X_missing_mask + return Xt, X_filled, mask_missing_values, X_missing_mask, valid_mask @staticmethod def _validate_limit(limit, limit_type, n_features): @@ -593,20 +597,26 @@ def fit_transform(self, X, y=None): self.imputation_sequence_ = [] self.initial_imputer_ = None - - X, Xt, mask_missing_values, complete_mask = self._initial_imputation(X) + X_orig = X + self._check_feature_names(X) + X, Xt, mask_missing_values, complete_mask, valid_mask = \ + self._initial_imputation(X) + get_feature_names_out = partial(self._get_feature_names_out, + valid_mask=valid_mask) 
super()._fit_indicator(complete_mask) X_indicator = super()._transform_indicator(complete_mask) if self.max_iter == 0 or np.all(mask_missing_values): self.n_iter_ = 0 - return super()._concatenate_indicator(Xt, X_indicator) + out = super()._concatenate_indicator(Xt, X_indicator) + return self._make_array_out(out, X_orig, get_feature_names_out) # Edge case: a single feature. We return the initial ... if Xt.shape[1] == 1: self.n_iter_ = 0 - return super()._concatenate_indicator(Xt, X_indicator) + out = super()._concatenate_indicator(Xt, X_indicator) + return self._make_array_out(out, X_orig, get_feature_names_out) self._min_value = self._validate_limit( self.min_value, "min", X.shape[1]) @@ -675,7 +685,8 @@ def fit_transform(self, X, y=None): warnings.warn("[IterativeImputer] Early stopping criterion not" " reached.", ConvergenceWarning) Xt[~mask_missing_values] = X[~mask_missing_values] - return super()._concatenate_indicator(Xt, X_indicator) + out = super()._concatenate_indicator(Xt, X_indicator) + return self._make_array_out(out, X_orig, get_feature_names_out) def transform(self, X): """Imputes all missing values in X. @@ -694,13 +705,25 @@ def transform(self, X): The imputed input data. """ check_is_fitted(self) + X_orig = X + self._check_feature_names(X, reset=False) + X, Xt, mask_missing_values, complete_mask, valid_mask = \ + self._initial_imputation(X) + + def get_feature_names_out(feature_names_in): + imputed_names = feature_names_in[valid_mask] + if self.indicator_ is None: + return imputed_names - X, Xt, mask_missing_values, complete_mask = self._initial_imputation(X) + indicator_names = self.indicator_._get_feature_names_out( + feature_names_in) + return np.r_[imputed_names, indicator_names] X_indicator = super()._transform_indicator(complete_mask) if self.n_iter_ == 0 or np.all(mask_missing_values): - return super()._concatenate_indicator(Xt, X_indicator) + out = super()._concatenate_indicator(Xt, X_indicator) + return self._make_array_out(out, X_orig, get_feature_names_out) imputations_per_round = len(self.imputation_sequence_) // self.n_iter_ i_rnd = 0 @@ -726,7 +749,8 @@ def transform(self, X): Xt[~mask_missing_values] = X[~mask_missing_values] - return super()._concatenate_indicator(Xt, X_indicator) + out = super()._concatenate_indicator(Xt, X_indicator) + return self._make_array_out(out, X_orig, get_feature_names_out) def fit(self, X, y=None): """Fits the imputer on X and return self. diff --git a/sklearn/impute/_knn.py b/sklearn/impute/_knn.py index df66e4a20aff6..09c777cfd2801 100644 --- a/sklearn/impute/_knn.py +++ b/sklearn/impute/_knn.py @@ -1,6 +1,7 @@ # Authors: Ashim Bhattarai # Thomas J Fan # License: BSD 3 clause +from functools import partial import numpy as np @@ -207,8 +208,8 @@ def transform(self, X): The imputed dataset. `n_output_features` is the number of features that is not always missing during `fit`. 
""" - check_is_fitted(self) + X_orig = X if not is_scalar_nan(self.missing_values): force_all_finite = True else: @@ -224,13 +225,16 @@ def transform(self, X): mask_fit_X = self._mask_fit_X valid_mask = ~np.all(mask_fit_X, axis=0) + get_feature_names_out = partial(self._get_feature_names_out, + valid_mask=valid_mask) X_indicator = super()._transform_indicator(mask) # Removes columns where the training data is all nan if not np.any(mask): # No missing values in X # Remove columns where the training data is all nan - return X[:, valid_mask] + out = X[:, valid_mask] + return self._make_array_out(out, X_orig, get_feature_names_out) row_missing_idx = np.flatnonzero(mask.any(axis=1)) @@ -302,4 +306,5 @@ def process_chunk(dist_chunk, start): # process_chunk modifies X in place. No return value. pass - return super()._concatenate_indicator(X[:, valid_mask], X_indicator) + out = super()._concatenate_indicator(X[:, valid_mask], X_indicator) + return self._make_array_out(out, X_orig, get_feature_names_out) diff --git a/sklearn/impute/tests/test_common.py b/sklearn/impute/tests/test_common.py index 220a335c15285..d44344b2ce50a 100644 --- a/sklearn/impute/tests/test_common.py +++ b/sklearn/impute/tests/test_common.py @@ -3,6 +3,7 @@ import numpy as np from scipy import sparse +from sklearn import config_context from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_allclose_dense_sparse from sklearn.utils._testing import assert_array_equal @@ -113,3 +114,30 @@ def test_imputers_pandas_na_integer_array_support(imputer, add_indicator): X_trans = imputer.fit_transform(X_df) assert_allclose(X_trans_expected, X_trans) + + +# ConvergenceWarning will be raised by the IterativeImputer +@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning") +@pytest.mark.parametrize("imputer", IMPUTERS, + ids=lambda x: x.__class__.__name__) +def test_imputer_array_out_indicator(imputer): + pd = pytest.importorskip("pandas") + + X = np.array([ + [np.nan, 1, 5, np.nan, 1], + [2, np.nan, 1, np.nan, 2], + [6, 3, np.nan, np.nan, 3], + [1, 2, 9, np.nan, 4] + ]) + names = np.array([f"feat_{i}" for i in range(X.shape[1])]) + X = pd.DataFrame(X, columns=names) + imputer.set_params(missing_values=np.nan, add_indicator=True) + + with config_context(array_out='pandas'): + X_trans = imputer.fit_transform(X) + + assert isinstance(X_trans, pd.DataFrame) + feature_names_out = names[[0, 1, 2, 4]] + out_names = np.r_[feature_names_out, + [f"missingindicator_{name}" for name in names[:4]]] + assert_array_equal(X_trans.columns, out_names) diff --git a/sklearn/kernel_approximation.py b/sklearn/kernel_approximation.py index e03d8ccaafee2..82573419d1bf9 100644 --- a/sklearn/kernel_approximation.py +++ b/sklearn/kernel_approximation.py @@ -149,7 +149,8 @@ def transform(self, X): """ check_is_fitted(self) - X = self._validate_data(X, accept_sparse="csc") + X_orig = X + X = self._validate_data(X, accept_sparse="csc", reset=False) X_gamma = np.sqrt(self.gamma) * X @@ -191,7 +192,7 @@ def transform(self, X): count_sketches_fft_prod = np.prod(count_sketches_fft, axis=1) data_sketch = np.real(ifft(count_sketches_fft_prod, overwrite_x=True)) - return data_sketch + return self._make_array_out(data_sketch, X_orig, 'class_name') class RBFSampler(TransformerMixin, BaseEstimator): @@ -301,13 +302,13 @@ def transform(self, X): X_new : array-like, shape (n_samples, n_components) """ check_is_fitted(self) - + X_orig = X X = check_array(X, accept_sparse='csr') projection = safe_sparse_dot(X, 
self.random_weights_) projection += self.random_offset_ np.cos(projection, projection) projection *= np.sqrt(2.) / np.sqrt(self.n_components) - return projection + return self._make_array_out(projection, X_orig, 'class_name') class SkewedChi2Sampler(TransformerMixin, BaseEstimator): @@ -418,7 +419,7 @@ def transform(self, X): X_new : array-like, shape (n_samples, n_components) """ check_is_fitted(self) - + X_orig = X X = as_float_array(X, copy=True) X = check_array(X, copy=False) if (X <= -self.skewedness).any(): @@ -431,7 +432,7 @@ def transform(self, X): projection += self.random_offset_ np.cos(projection, projection) projection *= np.sqrt(2.) / np.sqrt(self.n_components) - return projection + return self._make_array_out(projection, X_orig, 'class_name') class AdditiveChi2Sampler(TransformerMixin, BaseEstimator): @@ -554,6 +555,7 @@ def transform(self, X): msg = ("%(name)s is not fitted. Call fit to set the parameters before" " calling transform") check_is_fitted(self, msg=msg) + X_orig = X X = check_array(X, accept_sparse='csr') check_non_negative(X, 'X in AdditiveChi2Sampler.transform') @@ -564,7 +566,9 @@ def transform(self, X): # cosh(0) = 1.0 transf = self._transform_sparse if sparse else self._transform_dense - return transf(X) + output = transf(X) + + return self._make_array_out(output, X_orig, 'class_name') def _transform_dense(self, X): non_zero = (X != 0.0) @@ -787,6 +791,7 @@ def transform(self, X): Transformed data. """ check_is_fitted(self) + X_orig = X X = check_array(X, accept_sparse='csr') kernel_params = self._get_kernel_params() @@ -794,7 +799,8 @@ def transform(self, X): metric=self.kernel, filter_params=True, **kernel_params) - return np.dot(embedded, self.normalization_.T) + out = np.dot(embedded, self.normalization_.T) + return self._make_array_out(out, X_orig, 'class_name') def _get_kernel_params(self): params = self.kernel_params diff --git a/sklearn/manifold/_isomap.py b/sklearn/manifold/_isomap.py index d843c3ddd8462..2966edddda2bb 100644 --- a/sklearn/manifold/_isomap.py +++ b/sklearn/manifold/_isomap.py @@ -148,6 +148,7 @@ def _fit_transform(self, X): n_jobs=self.n_jobs) self.nbrs_.fit(X) self.n_features_in_ = self.nbrs_.n_features_in_ + self.feature_names_in_ = self.nbrs_.feature_names_in_ self.kernel_pca_ = KernelPCA(n_components=self.n_components, kernel="precomputed", @@ -227,7 +228,7 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) """ self._fit_transform(X) - return self.embedding_ + return self._make_array_out(self.embedding_, X, 'class_name') def transform(self, X): """Transform X. 
@@ -269,4 +270,5 @@ def transform(self, X): G_X **= 2 G_X *= -0.5 - return self.kernel_pca_.transform(G_X) + return self._make_array_out(self.kernel_pca_.transform(G_X), + X, 'class_name') diff --git a/sklearn/manifold/_locally_linear.py b/sklearn/manifold/_locally_linear.py index a2d3e63060413..776a268227c1c 100644 --- a/sklearn/manifold/_locally_linear.py +++ b/sklearn/manifold/_locally_linear.py @@ -704,7 +704,7 @@ def fit_transform(self, X, y=None): X_new : array-like, shape (n_samples, n_components) """ self._fit_transform(X) - return self.embedding_ + return self._make_array_out(self.embedding_, X, 'class_name') def transform(self, X): """ @@ -724,7 +724,7 @@ def transform(self, X): it together with methods that are not scale-invariant (like SVMs) """ check_is_fitted(self) - + X_orig = X X = check_array(X) ind = self.nbrs_.kneighbors(X, n_neighbors=self.n_neighbors, return_distance=False) @@ -732,4 +732,4 @@ def transform(self, X): X_new = np.empty((X.shape[0], self.n_components)) for i in range(X.shape[0]): X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i]) - return X_new + return self._make_array_out(X_new, X_orig, 'class_name') diff --git a/sklearn/neighbors/_graph.py b/sklearn/neighbors/_graph.py index e81c2b98dafef..ee98bf2470093 100644 --- a/sklearn/neighbors/_graph.py +++ b/sklearn/neighbors/_graph.py @@ -344,8 +344,9 @@ def transform(self, X): """ check_is_fitted(self) add_one = self.mode == 'distance' - return self.kneighbors_graph(X, mode=self.mode, - n_neighbors=self.n_neighbors + add_one) + out = self.kneighbors_graph(X, mode=self.mode, + n_neighbors=self.n_neighbors + add_one) + return self._make_array_out(out, X, 'class_name') def fit_transform(self, X, y=None): """Fit to data, then transform it. @@ -518,8 +519,9 @@ def transform(self, X): The matrix is of CSR format. """ check_is_fitted(self) - return self.radius_neighbors_graph(X, mode=self.mode, - sort_results=True) + out = self.radius_neighbors_graph(X, mode=self.mode, + sort_results=True) + return self._make_array_out(out, X, 'class_name') def fit_transform(self, X, y=None): """Fit to data, then transform it. diff --git a/sklearn/neighbors/_nca.py b/sklearn/neighbors/_nca.py index 8920b2d99ed02..011eb55ed9f0f 100644 --- a/sklearn/neighbors/_nca.py +++ b/sklearn/neighbors/_nca.py @@ -263,9 +263,11 @@ def transform(self, X): """ check_is_fitted(self) + X_orig = X X = check_array(X) - return np.dot(X, self.components_.T) + return self._make_array_out(np.dot(X, self.components_.T), X_orig, + 'class_name') def _validate_params(self, X, y): """Validate parameters as soon as :meth:`fit` is called. diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py index d1028911f4185..a4a712acc6e79 100644 --- a/sklearn/neural_network/_rbm.py +++ b/sklearn/neural_network/_rbm.py @@ -130,9 +130,11 @@ def transform(self, X): Latent representations of the data. """ check_is_fitted(self) + X_orig = X X = check_array(X, accept_sparse='csr', dtype=(np.float64, np.float32)) - return self._mean_hiddens(X) + out = self._mean_hiddens(X) + return self._make_array_out(out, X_orig, 'class_name') def _mean_hiddens(self, v): """Computes the probabilities P(h=1|v). 
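Review note: the `pipeline.py` changes below exist because joblib workers do not see in-flight changes to the caller's global configuration (for example an active `config_context`), so `FeatureUnion` (and `ColumnTransformer` above) snapshot the configuration with `get_config()` and re-enter it inside each dispatched call. The pattern, condensed (`run_in_worker` is a hypothetical stand-in for `_transform_one`):

```python
from joblib import Parallel, delayed
from sklearn import config_context, get_config


def run_in_worker(config, transformer, X):
    # re-apply the caller's configuration so that array_out survives
    # the process/thread boundary
    with config_context(**config):
        return transformer.transform(X)

# caller side: snapshot the active config once per dispatch, e.g.
# Xs = Parallel(n_jobs=2)(
#     delayed(run_in_worker)(get_config(), trans, X)
#     for trans in transformers)
```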
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index a876370bc44af..f37e7d2888787 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -16,6 +16,7 @@ from scipy import sparse from joblib import Parallel, delayed +from ._config import get_config from .base import clone, TransformerMixin from .utils._estimator_html_repr import _VisualBlock from .utils.metaestimators import if_delegate_has_method @@ -24,6 +25,8 @@ from .utils.validation import _deprecate_positional_args from .utils.metaestimators import _BaseComposition +from ._config import config_context +from .utils._array_out import _get_feature_names __all__ = ['Pipeline', 'FeatureUnion', 'make_pipeline', 'make_union'] @@ -714,12 +717,16 @@ def make_pipeline(*steps, memory=None, verbose=False): return Pipeline(_name_estimators(steps), memory=memory, verbose=verbose) -def _transform_one(transformer, X, y, weight, **fit_params): - res = transformer.transform(X) - # if we have a weight for this transformer, multiply output - if weight is None: - return res - return res * weight +def _transform_one(transformer, X, y, weight, config=None, **fit_params): + if config is None: + config = {} + + with config_context(**config): + res = transformer.transform(X) + # if we have a weight for this transformer, multiply output + if weight is None: + return res + return res * weight def _fit_transform_one(transformer, @@ -728,21 +735,26 @@ def _fit_transform_one(transformer, weight, message_clsname='', message=None, + config=None, **fit_params): """ Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned with the fitted transformer. If ``weight`` is not ``None``, the result will be multiplied by ``weight``. """ - with _print_elapsed_time(message_clsname, message): - if hasattr(transformer, 'fit_transform'): - res = transformer.fit_transform(X, y, **fit_params) - else: - res = transformer.fit(X, y, **fit_params).transform(X) + if config is None: + config = {} + + with config_context(**config): + with _print_elapsed_time(message_clsname, message): + if hasattr(transformer, 'fit_transform'): + res = transformer.fit_transform(X, y, **fit_params) + else: + res = transformer.fit(X, y, **fit_params).transform(X) - if weight is None: - return res, transformer - return res * weight, transformer + if weight is None: + return res, transformer + return res * weight, transformer def _fit_one(transformer, @@ -751,12 +763,16 @@ def _fit_one(transformer, weight, message_clsname='', message=None, + config=None, **fit_params): """ Fits ``transformer`` to ``X`` and ``y``. """ - with _print_elapsed_time(message_clsname, message): - return transformer.fit(X, y, **fit_params) + if config is None: + config = {} + with config_context(**config): + with _print_elapsed_time(message_clsname, message): + return transformer.fit(X, y, **fit_params) class FeatureUnion(TransformerMixin, _BaseComposition): @@ -987,6 +1003,7 @@ def _parallel_func(self, X, y, fit_params, func): return Parallel(n_jobs=self.n_jobs)(delayed(func)( transformer, X, y, weight, message_clsname='FeatureUnion', + config=get_config(), message=self._log_message(name, idx, len(transformers)), **fit_params) for idx, (name, transformer, weight) in enumerate(transformers, 1)) @@ -1006,14 +1023,35 @@ def transform(self, X): hstack of results of transformers. sum_n_components is the sum of n_components (output dimension) over transformers. 
""" + X_orig = X Xs = Parallel(n_jobs=self.n_jobs)( - delayed(_transform_one)(trans, X, None, weight) + delayed(_transform_one)(trans, X, None, weight, + get_config()) for name, trans, weight in self._iter()) if not Xs: # All transformers are None - return np.zeros((X.shape[0], 0)) - - return self._hstack(Xs) + return self._make_array_out(np.zeros((X.shape[0], 0)), + Xs, X_orig) + + out = self._hstack(Xs) + return self._make_array_out(out, Xs, X_orig) + + def _make_array_out(self, X_out, Xs, X_orig): + def get_feature_names_out(): + transformer_names = (name for name, _ in self.transformer_list) + feature_names = [] + for X, trans_name in zip(Xs, transformer_names): + inner_names = _get_feature_names(X) + inner_names = [f'{trans_name}_{name}' for name in inner_names] + feature_names.append(inner_names) + + feature_names_out = np.concatenate(feature_names) + if feature_names_out.size != X_out.shape[1]: + return None + return feature_names_out + + return super()._make_array_out(X_out, X_orig, + get_feature_names_out) def _hstack(self, Xs): if any(sparse.issparse(f) for f in Xs): diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 3bf9eeb7f3741..923da629ad268 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -429,7 +429,7 @@ def transform(self, X): Transformed data. """ check_is_fitted(self) - + X_orig = X X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan") @@ -437,7 +437,7 @@ def transform(self, X): X += self.min_ if self.clip: np.clip(X, self.feature_range[0], self.feature_range[1], out=X) - return X + return self._make_array_out(X, X_orig, 'one_to_one') def inverse_transform(self, X): """Undo the scaling of X according to feature_range. @@ -842,6 +842,7 @@ def transform(self, X, copy=None): Transformed array. """ check_is_fitted(self) + X_orig = X copy = copy if copy is not None else self.copy X = self._validate_data(X, reset=False, @@ -861,7 +862,7 @@ def transform(self, X, copy=None): X -= self.mean_ if self.with_std: X /= self.scale_ - return X + return self._make_array_out(X, X_orig, 'one_to_one') def inverse_transform(self, X, copy=None): """Scale back the data to the original representation @@ -1064,6 +1065,7 @@ def transform(self, X): Transformed array. """ check_is_fitted(self) + X_orig = X X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -1072,7 +1074,7 @@ def transform(self, X): inplace_column_scale(X, 1.0 / self.scale_) else: X /= self.scale_ - return X + return self._make_array_out(X, X_orig, 'one_to_one') def inverse_transform(self, X): """Scale back the data to the original representation @@ -1365,6 +1367,7 @@ def transform(self, X): Transformed array. 
""" check_is_fitted(self) + X_orig = X X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, estimator=self, dtype=FLOAT_DTYPES, force_all_finite='allow-nan') @@ -1377,7 +1380,7 @@ def transform(self, X): X -= self.center_ if self.with_scaling: X /= self.scale_ - return X + return self._make_array_out(X, X_orig, 'one_to_one') def inverse_transform(self, X): """Scale back the data to the original representation @@ -1702,6 +1705,7 @@ def transform(self, X): """ check_is_fitted(self) + X_orig = X X = check_array(X, order='F', dtype=FLOAT_DTYPES, accept_sparse=('csr', 'csc')) @@ -1798,7 +1802,7 @@ def transform(self, X): new_index.append(current_col) index = new_index - return XP + return self._make_array_out(XP, X_orig, self.get_feature_names) @_deprecate_positional_args @@ -2004,9 +2008,11 @@ def transform(self, X, copy=None): X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) Transformed array. """ + X_orig = X copy = copy if copy is not None else self.copy X = check_array(X, accept_sparse='csr') - return normalize(X, norm=self.norm, axis=1, copy=copy) + output = normalize(X, norm=self.norm, axis=1, copy=copy) + return self._make_array_out(output, X_orig, 'one_to_one') def _more_tags(self): return {'stateless': True} @@ -2162,7 +2168,9 @@ def transform(self, X, copy=None): Transformed array. """ copy = copy if copy is not None else self.copy - return binarize(X, threshold=self.threshold, copy=copy) + X_orig = X + out = binarize(X, threshold=self.threshold, copy=copy) + return self._make_array_out(out, X_orig, 'one_to_one') def _more_tags(self): return {'stateless': True} @@ -2257,7 +2265,7 @@ def transform(self, K, copy=True): K_new : ndarray of shape (n_samples1, n_samples2) """ check_is_fitted(self) - + K_orig = K K = check_array(K, copy=copy, dtype=FLOAT_DTYPES) K_pred_cols = (np.sum(K, axis=1) / @@ -2267,7 +2275,7 @@ def transform(self, K, copy=True): K -= K_pred_cols K += self.K_fit_all_ - return K + return self._make_array_out(K, K_orig, 'one_to_one') @property def _pairwise(self): @@ -2555,7 +2563,11 @@ def fit(self, X, y=None): " and {} samples.".format(self.n_quantiles, self.subsample)) - X = self._check_inputs(X, in_fit=True, copy=False) + X = self._validate_data(X, reset=True, + accept_sparse='csc', copy=False, + dtype=FLOAT_DTYPES, + force_all_finite='allow-nan') + self._check_input(X) n_samples = X.shape[0] if self.n_quantiles > n_samples: @@ -2646,22 +2658,9 @@ def _transform_col(self, X_col, quantiles, inverse): return X_col - def _check_inputs(self, X, in_fit, accept_sparse_negative=False, - copy=False): - """Check inputs before fit and transform.""" - # In theory reset should be equal to `in_fit`, but there are tests - # checking the input number of feature and they expect a specific - # string, which is not the same one raised by check_n_features. So we - # don't check n_features_in_ here for now (it's done with adhoc code in - # the estimator anyway). - # TODO: set reset=in_fit when addressing reset in - # predict/transform/etc. - reset = True - - X = self._validate_data(X, reset=reset, - accept_sparse='csc', copy=copy, - dtype=FLOAT_DTYPES, - force_all_finite='allow-nan') + def _check_input(self, X, accept_sparse_negative=False, + check_quantiles=False): + """Check inputs before fit or transform.""" # we only accept positive sparse matrix when ignore_implicit_zeros is # false and that we call fit or transform. 
@@ -2646,22 +2658,9 @@ def _transform_col(self, X_col, quantiles, inverse):
 
         return X_col
 
-    def _check_inputs(self, X, in_fit, accept_sparse_negative=False,
-                      copy=False):
-        """Check inputs before fit and transform."""
-        # In theory reset should be equal to `in_fit`, but there are tests
-        # checking the input number of feature and they expect a specific
-        # string, which is not the same one raised by check_n_features. So we
-        # don't check n_features_in_ here for now (it's done with adhoc code in
-        # the estimator anyway).
-        # TODO: set reset=in_fit when addressing reset in
-        # predict/transform/etc.
-        reset = True
-
-        X = self._validate_data(X, reset=reset,
-                                accept_sparse='csc', copy=copy,
-                                dtype=FLOAT_DTYPES,
-                                force_all_finite='allow-nan')
+    def _check_input(self, X, accept_sparse_negative=False,
+                     check_quantiles=False):
+        """Check inputs before fit or transform."""
         # we only accept positive sparse matrix when ignore_implicit_zeros is
         # false and that we call fit or transform.
         with np.errstate(invalid='ignore'):  # hide NaN comparison warnings
@@ -2676,13 +2675,8 @@ def _check_inputs(self, X, in_fit, accept_sparse_negative=False,
                              " or 'uniform'. Got '{}' instead.".format(
                                  self.output_distribution))
 
-        return X
-
-    def _check_is_fitted(self, X):
-        """Check the inputs before transforming."""
-        check_is_fitted(self)
         # check that the dimension of X are adequate with the fitted data
-        if X.shape[1] != self.quantiles_.shape[1]:
+        if check_quantiles and X.shape[1] != self.quantiles_.shape[1]:
             raise ValueError('X does not have the same number of features as'
                              ' the previously fitted data. Got {} instead of'
                              ' {}.'.format(X.shape[1],
@@ -2737,10 +2731,13 @@ def transform(self, X):
         Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
             The projected data.
         """
-        X = self._check_inputs(X, in_fit=False, copy=self.copy)
-        self._check_is_fitted(X)
-
-        return self._transform(X, inverse=False)
+        check_is_fitted(self)
+        X_orig = X
+        X = check_array(X, accept_sparse='csc', copy=self.copy,
+                        dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
+        self._check_input(X, check_quantiles=True)
+        output = self._transform(X, inverse=False)
+        return self._make_array_out(output, X_orig, 'one_to_one')
 
     def inverse_transform(self, X):
         """Back-projection to the original space.
@@ -2758,9 +2754,11 @@ def inverse_transform(self, X):
         Xt : {ndarray, sparse matrix} of (n_samples, n_features)
             The projected data.
         """
-        X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True,
-                               copy=self.copy)
-        self._check_is_fitted(X)
+        check_is_fitted(self)
+        X = check_array(X, accept_sparse='csc', copy=self.copy,
+                        dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
+        self._check_input(X, accept_sparse_negative=True,
+                          check_quantiles=True)
 
         return self._transform(X, inverse=True)
 
@@ -3016,11 +3014,14 @@ def fit(self, X, y=None):
         return self
 
     def fit_transform(self, X, y=None):
-        return self._fit(X, y, force_transform=True)
+        output = self._fit(X, y, force_transform=True)
+        return self._make_array_out(output, X, 'one_to_one')
 
     def _fit(self, X, y=None, force_transform=False):
-        X = self._check_input(X, in_fit=True, check_positive=True,
-                              check_method=True)
+        X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES,
+                                copy=self.copy, force_all_finite='allow-nan',
+                                reset=True)
+        self._check_input(X, check_positive=True, check_method=True)
 
         if not self.copy and not force_transform:  # if call from fit()
             X = X.copy()  # force copy so that fit does not change X inplace
@@ -3062,8 +3063,10 @@ def transform(self, X):
             The transformed data.
         """
         check_is_fitted(self)
-        X = self._check_input(X, in_fit=False, check_positive=True,
-                              check_shape=True)
+        X_orig = X
+        X = check_array(X, ensure_2d=True, dtype=FLOAT_DTYPES,
+                        copy=self.copy, force_all_finite='allow-nan')
+        self._check_input(X, check_positive=True, check_shape=True)
 
         transform_function = {'box-cox': boxcox,
                               'yeo-johnson': self._yeo_johnson_transform
@@ -3075,7 +3078,7 @@ def transform(self, X):
         if self.standardize:
             X = self._scaler.transform(X)
 
-        return X
+        return self._make_array_out(X, X_orig, 'one_to_one')
 
     def inverse_transform(self, X):
         """Apply the inverse power transformation using the fitted lambdas.
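
Three feature-name policies appear in this patch: 'one_to_one' (the scalers
above), 'class_name' (random projections below), and a callable such as
`PolynomialFeatures.get_feature_names`. A sketch of the callable case
(illustrative only; assumes pandas is installed):

    import numpy as np
    import pandas as pd
    from sklearn import config_context
    from sklearn.preprocessing import PolynomialFeatures

    df = pd.DataFrame(np.arange(8).reshape(4, 2), columns=["a", "b"])
    poly = PolynomialFeatures(degree=2).fit(df)

    with config_context(array_out='pandas'):
        df_out = poly.transform(df)

    # Column names come from poly.get_feature_names(['a', 'b'])
    print(df_out.columns.tolist())  # ['1', 'a', 'b', 'a^2', 'a b', 'b^2']
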
""" check_is_fitted(self) - X = self._check_input(X, in_fit=False, check_shape=True) + X = check_array(X, ensure_2d=True, dtype=FLOAT_DTYPES, + copy=self.copy, force_all_finite='allow-nan') + self._check_input(X, check_shape=True) if self.standardize: X = self._scaler.inverse_transform(X) @@ -3214,9 +3219,9 @@ def _neg_log_likelihood(lmbda): # choosing bracket -2, 2 like for boxcox return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) - def _check_input(self, X, in_fit, check_positive=False, check_shape=False, + def _check_input(self, X, check_positive=False, check_shape=False, check_method=False): - """Validate the input before fit and transform. + """Validate the input before fit or transform. Parameters ---------- @@ -3232,9 +3237,6 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False, check_method : bool, default=False If True, check that the transformation method is valid. """ - X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES, - copy=self.copy, force_all_finite='allow-nan') - with np.warnings.catch_warnings(): np.warnings.filterwarnings( 'ignore', r'All-NaN (slice|axis) encountered') @@ -3254,8 +3256,6 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False, "got {} instead." .format(valid_methods, self.method)) - return X - def _more_tags(self): return {'allow_nan': True} diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py index d0725cb6318e5..07ab2af32021a 100644 --- a/sklearn/preprocessing/_discretization.py +++ b/sklearn/preprocessing/_discretization.py @@ -12,6 +12,7 @@ from . import OneHotEncoder +from .._config import config_context from ..base import BaseEstimator, TransformerMixin from ..utils.validation import check_array from ..utils.validation import check_is_fitted @@ -309,18 +310,20 @@ def transform(self, X): np.clip(Xt, 0, self.n_bins_ - 1, out=Xt) if self.encode == 'ordinal': - return Xt + return self._make_array_out(Xt, X, 'one_to_one') dtype_init = None if 'onehot' in self.encode: dtype_init = self._encoder.dtype self._encoder.dtype = Xt.dtype try: - Xt_enc = self._encoder.transform(Xt) + with config_context(array_out='default'): + Xt_enc = self._encoder.transform(Xt) finally: # revert the initial dtype to avoid modifying self. self._encoder.dtype = dtype_init - return Xt_enc + + return self._make_array_out(Xt_enc, X, self._encoder.get_feature_names) def inverse_transform(self, Xt): """ diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 7c3d9f0af275a..adb20c1253270 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -72,6 +72,7 @@ def _get_feature(self, X, feature_idx): return X[:, feature_idx] def _fit(self, X, handle_unknown='error'): + self._check_feature_names(X) X_list, n_samples, n_features = self._check_X(X) if self.categories != 'auto': @@ -424,6 +425,7 @@ def transform(self, X): Transformed input. 
""" check_is_fitted(self) + X_orig = X # validation of X happens in _check_X called by _transform X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) @@ -466,9 +468,9 @@ def transform(self, X): shape=(n_samples, feature_indices[-1]), dtype=self.dtype) if not self.sparse: - return out.toarray() - else: - return out + out = out.toarray() + + return self._make_array_out(out, X_orig, self.get_feature_names) def inverse_transform(self, X): """ @@ -740,7 +742,8 @@ def transform(self, X): # create separate category for unknown values if self.handle_unknown == 'use_encoded_value': X_int[~X_mask] = self.unknown_value - return X_int.astype(self.dtype, copy=False) + out = X_int.astype(self.dtype, copy=False) + return self._make_array_out(out, X, 'one_to_one') def inverse_transform(self, X): """ diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index ca176aeb87a10..63b8ef9886bb6 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -125,6 +125,7 @@ def fit(self, X, y=None): ------- self """ + self._check_feature_names(X) X = self._check_input(X) if (self.check_inverse and not (self.func is None or self.inverse_func is None)): @@ -144,7 +145,12 @@ def transform(self, X): X_out : array-like, shape (n_samples, n_features) Transformed input. """ - return self._transform(X, func=self.func, kw_args=self.kw_args) + out = self._transform(X, func=self.func, kw_args=self.kw_args) + + def get_feature_names_out(): + if hasattr(self, "feature_names_in_"): + return self.feature_names_in_ + return self._make_array_out(out, X, get_feature_names_out) def inverse_transform(self, X): """Transform X using the inverse function. diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index c09acf1591532..17d540c646b77 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -57,6 +57,7 @@ from sklearn.utils import shuffle from sklearn import datasets +from sklearn import config_context iris = datasets.load_iris() @@ -2542,3 +2543,63 @@ def test_minmax_scaler_clip(feature_range): X_transformed, [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]]) + + +@pytest.mark.parametrize('Transformer', [ + MaxAbsScaler, MinMaxScaler, Normalizer, PowerTransformer, + QuantileTransformer, RobustScaler, StandardScaler +]) +def test_one_to_one_feature_mapping_array_out(Transformer): + pd = pytest.importorskip("pandas") + n_samples, n_features = 1000, 10 + feature_names = [f'feat_{i}' for i in range(n_features)] + + rng = np.random.RandomState(42) + X = rng.randn(n_samples, n_features) + df = pd.DataFrame(X, columns=feature_names) + + with config_context(array_out='pandas'): + df_trans = Transformer().fit_transform(df) + + assert_array_equal(df_trans.columns, df.columns) + + +@pytest.mark.parametrize('transformer', [ + StandardScaler(with_mean=False), + RobustScaler(with_centering=False), + MaxAbsScaler(), + Normalizer(), + QuantileTransformer()]) +def test_one_to_one_feature_mapping_sparse_array_out(transformer): + pd = pytest.importorskip("pandas") + n_samples, n_features = 1000, 10 + feature_names = [f'feat_{i}' for i in range(n_features)] + + X = sparse_random(n_samples, n_features, random_state=42) + df = pd.DataFrame.sparse.from_spmatrix(X, columns=feature_names) + + with config_context(array_out='pandas'): + df_trans = transformer.fit_transform(df) + + assert_array_equal(df_trans.columns, 
df.columns) + + +@pytest.mark.parametrize("is_sparse", [True, False]) +def test_polynomial_feature_names_pandas_array_out(is_sparse): + pd = pytest.importorskip("pandas") + X = np.arange(30).reshape(10, 3) + feature_names = ["a", "b", "c"] + + if is_sparse: + X_sp = sparse.csr_matrix(X) + df = pd.DataFrame.sparse.from_spmatrix(X_sp, columns=feature_names) + else: + df = pd.DataFrame(X, columns=feature_names) + + poly = PolynomialFeatures(degree=2, include_bias=True).fit(df) + feature_names = poly.get_feature_names(feature_names) + + with config_context(array_out='pandas'): + df_trans = poly.transform(df) + + assert_array_equal(df_trans.columns, feature_names) diff --git a/sklearn/random_projection.py b/sklearn/random_projection.py index 4623ac1ab64e4..fefaa813c1ea4 100644 --- a/sklearn/random_projection.py +++ b/sklearn/random_projection.py @@ -402,6 +402,7 @@ def transform(self, X): X_new : {ndarray, sparse matrix} of shape (n_samples, n_components) Projected array. """ + X_orig = X X = check_array(X, accept_sparse=['csr', 'csc']) check_is_fitted(self) @@ -414,7 +415,7 @@ def transform(self, X): X_new = safe_sparse_dot(X, self.components_.T, dense_output=self.dense_output) - return X_new + return self._make_array_out(X_new, X_orig, 'class_name') class GaussianRandomProjection(BaseRandomProjection): diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index c41bdb1116a6c..bc23537d8f003 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -37,7 +37,9 @@ _set_checking_parameters, _get_check_estimator_ids, check_class_weight_balanced_linear_classifier, - parametrize_with_checks) + parametrize_with_checks, + check_array_out_pandas, + check_array_out_xarray) def test_all_estimator_no_base_class(): @@ -268,3 +270,26 @@ def test_strict_mode_check_estimator(): def test_strict_mode_parametrize_with_checks(estimator, check): # Ideally we should assert that the strict checks are Xfailed... 
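
Random projections have no natural one-to-one mapping to the input features,
so they use the 'class_name' policy: generated names are the lowercased class
name followed by a column index. A sketch (illustrative only; assumes pandas
is installed):

    import numpy as np
    import pandas as pd
    from sklearn import config_context
    from sklearn.random_projection import GaussianRandomProjection

    X = pd.DataFrame(np.random.randn(20, 50))
    grp = GaussianRandomProjection(n_components=3, random_state=0).fit(X)

    with config_context(array_out='pandas'):
        df_out = grp.transform(X)

    print(df_out.columns.tolist())
    # ['gaussianrandomprojection0', 'gaussianrandomprojection1',
    #  'gaussianrandomprojection2']
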
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
index c41bdb1116a6c..bc23537d8f003 100644
--- a/sklearn/tests/test_common.py
+++ b/sklearn/tests/test_common.py
@@ -37,7 +37,9 @@
     _set_checking_parameters,
     _get_check_estimator_ids,
     check_class_weight_balanced_linear_classifier,
-    parametrize_with_checks)
+    parametrize_with_checks,
+    check_array_out_pandas,
+    check_array_out_xarray)
 
 
 def test_all_estimator_no_base_class():
@@ -268,3 +270,26 @@ def test_strict_mode_check_estimator():
 def test_strict_mode_parametrize_with_checks(estimator, check):
     # Ideally we should assert that the strict checks are Xfailed...
     check(estimator)
+
+
+def all_transformers_2d():
+    for name, Estimator in all_estimators(type_filter="transformer"):
+        try:
+            estimator = _construct_instance(Estimator)
+        except SkipTest:
+            continue
+
+        tags = estimator._get_tags()
+        if "2darray" in tags["X_types"]:
+            _set_checking_parameters(estimator)
+            yield name, estimator
+
+
+@pytest.mark.parametrize("name, estimator", all_transformers_2d())
+def test_array_out_pandas(name, estimator):
+    check_array_out_pandas(name, estimator)
+
+
+@pytest.mark.parametrize("name, estimator", all_transformers_2d())
+def test_array_out_xarray(name, estimator):
+    check_array_out_xarray(name, estimator)
diff --git a/sklearn/tests/test_config.py b/sklearn/tests/test_config.py
index eec349861258c..3dcf1fc514752 100644
--- a/sklearn/tests/test_config.py
+++ b/sklearn/tests/test_config.py
@@ -5,7 +5,8 @@ def test_config_context():
     assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                             'print_changed_only': True,
-                            'display': 'text'}
+                            'display': 'text',
+                            'array_out': 'default'}
 
     # Not using as a context manager affects nothing
     config_context(assume_finite=True)
@@ -14,7 +15,8 @@ def test_config_context():
     with config_context(assume_finite=True):
         assert get_config() == {'assume_finite': True, 'working_memory': 1024,
                                 'print_changed_only': True,
-                                'display': 'text'}
+                                'display': 'text',
+                                'array_out': 'default'}
 
     assert get_config()['assume_finite'] is False
 
     with config_context(assume_finite=True):
@@ -40,7 +42,8 @@ def test_config_context():
     assert get_config() == {'assume_finite': False, 'working_memory': 1024,
                             'print_changed_only': True,
-                            'display': 'text'}
+                            'display': 'text',
+                            'array_out': 'default'}
 
     # No positional arguments
     assert_raises(TypeError, config_context, True)
diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py
index 966cce49eaf42..d04bf0e658c19 100644
--- a/sklearn/tests/test_docstring_parameters.py
+++ b/sklearn/tests/test_docstring_parameters.py
@@ -223,6 +223,7 @@ def test_fit_docstring_attributes(name, Estimator):
     est.fit(X, y)
 
     skipped_attributes = {'n_features_in_',
+                          'feature_names_in_',
                           'x_scores_',  # For PLS, TODO remove in 0.26
                           'y_scores_'}  # For PLS, TODO remove in 0.26
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 6847ffe418cb4..7bff906d440a3 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -12,6 +12,7 @@
 from scipy import sparse
 import joblib
 
+from sklearn import config_context
 from sklearn.utils._testing import assert_raises
 from sklearn.utils._testing import assert_raises_regex
 from sklearn.utils._testing import assert_raise_message
@@ -31,7 +32,7 @@
 from sklearn.feature_selection import SelectKBest, f_classif
 from sklearn.dummy import DummyRegressor
 from sklearn.decomposition import PCA, TruncatedSVD
-from sklearn.datasets import load_iris
+from sklearn.datasets import load_iris, make_classification
 from sklearn.preprocessing import StandardScaler
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.experimental import enable_hist_gradient_boosting  # noqa
@@ -1247,3 +1248,29 @@ def test_feature_union_warns_unknown_transformer_weight():
     union = FeatureUnion(transformer_list, transformer_weights=weights)
     with pytest.raises(ValueError, match=expected_msg):
         union.fit(X, y)
+
+
+@pytest.mark.parametrize("n_jobs", [1, 2])
+def test_feature_union_array_out_pandas(n_jobs):
+    pd = pytest.importorskip('pandas')
+    X, y = make_classification(random_state=42)
+    column_names = [f'col{i}' for i in range(X.shape[1])]
+    X = pd.DataFrame(X, columns=column_names)
+    svd = TruncatedSVD(n_components=2, random_state=0)
+    select = SelectKBest(k=1)
+    fs = FeatureUnion([("svd", svd), ("select", select)], n_jobs=n_jobs)
+    fs.fit(X, y)
+
+    with config_context(array_out='pandas'):
+        df_out = fs.transform(X)
+
+    assert isinstance(df_out, pd.DataFrame)
+
+    with config_context(array_out='pandas'):
+        df_svd_out = fs.transformer_list[0][1].transform(X)
+        df_select_out = fs.transformer_list[1][1].transform(X)
+
+    expected_feature_names = (
+        [f"svd_{name}" for name in df_svd_out.columns] +
+        [f"select_{name}" for name in df_select_out.columns])
+    assert_array_equal(df_out.columns, expected_feature_names)
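
The helper module added below centers on two functions. A quick sketch of how
`_get_feature_names` behaves on each supported container (illustrative only;
assumes pandas and xarray are installed):

    import numpy as np
    import pandas as pd
    import xarray as xr
    from sklearn.utils._array_out import _get_feature_names

    X = np.ones((2, 3))
    df = pd.DataFrame(X, columns=["a", "b", "c"])
    da = xr.DataArray(X, dims=("index", "features"),
                      coords={"features": ["a", "b", "c"]})

    print(_get_feature_names(X))   # None: plain ndarrays carry no names
    print(_get_feature_names(df))  # ['a' 'b' 'c'] from DataFrame.columns
    print(_get_feature_names(da))  # ['a' 'b' 'c'] from the second dim's coords
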
+ """ + array_out = get_config()['array_out'] + if array_out not in ('default', 'pandas', 'xarray'): + raise ValueError("array_out must be 'default', 'pandas' or 'xarray'") + if array_out == 'default': + return X_out + + feature_names_out = get_feature_names_out() + if feature_names_out is None: + feature_names_out = [f'X{i}' for i in range(X_out.shape[1])] + + if array_out == 'pandas': + import pandas as pd + if sp_sparse.issparse(X_out): + make_dataframe = pd.DataFrame.sparse.from_spmatrix + else: + make_dataframe = pd.DataFrame + + return make_dataframe(X_out, columns=feature_names_out, + index=getattr(X_orig, "index", None)) + else: + # xarray + import xarray as xr + dims = getattr(X_orig, "dims", ("index", "columns")) + + coords = {dims[1]: feature_names_out} + if hasattr(X_orig, "coords") and dims[0] in X_orig.coords: + coords[dims[0]] = X_orig.coords[dims[0]] + + if sp_sparse.issparse(X_out): + # pydata/sparse + import sparse as pydata_sparse + X_out = pydata_sparse.COO.from_scipy_sparse(X_out) + + return xr.DataArray(X_out, dims=dims, coords=coords) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 71a84537aabbf..b7e49138b50cb 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -7,6 +7,7 @@ from copy import deepcopy from functools import partial, wraps from inspect import signature +from itertools import product import numpy as np from scipy import sparse @@ -3093,3 +3094,96 @@ def check_requires_y_none(name, estimator_orig, strict_mode=True): _FULLY_STRICT_CHECKS = set([ 'check_n_features_in', ]) + + +def _check_array_out_transformers(name, estimator_orig, array_out, + make_array, check_array_out): + rng = np.random.RandomState(0) + X_train_np = 3 * rng.uniform(size=(20, 5)) + X_train_np = _pairwise_estimator_convert_X(X_train_np, estimator_orig) + X_train_np = _enforce_estimator_tags_x(estimator_orig, X_train_np) + y = X_train_np[:, 0].astype(int) + y = _enforce_estimator_tags_y(estimator_orig, y) + + index = np.arange(X_train_np.shape[0]) + rng.shuffle(index) + column_names = [f"feature_{i}" for i in range(X_train_np.shape[1])] + X_train_array = make_array(X_train_np, column_names, index) + + estimator = clone(estimator_orig) + + # array_out will always output output_dtype regardless of the input + # types during training of transforming + X_train_product = product([X_train_np, X_train_array], + [X_train_np, X_train_array]) + for X_train_input, X_transform_input in X_train_product: + estimator.fit(X_train_input, y) + with config_context(array_out=array_out): + output = estimator.transform(X_transform_input) + check_array_out(output, X_transform_input) + + # Using fit_transform with array_out always results in an data container + # that is array_out + for X_train_input in [X_train_np, X_train_array]: + with config_context(array_out=array_out): + output = estimator.fit_transform(X_transform_input, y) + + # for cross_decomposition + if isinstance(output, tuple): + output = output[0] + + check_array_out(output, X_transform_input) + + # feature names are out of order + X_test_bad = make_array(X_train_np, column_names[::-1], index) + msg = ("The feature names of X does not match the feature_names_in_ " + "attribute") + with config_context(array_out=array_out): + assert_raise_message(ValueError, msg, estimator.transform, X_test_bad) + + +def check_array_out_pandas(name, estimator_orig, strict_mode=True): + try: + import pandas as pd + except ImportError: + raise SkipTest("pandas is not installed: not testing 
for " + "array_out with pandas") + + def make_array(array, column_names, index): + return pd.DataFrame(array, columns=column_names, + index=index) + + def check_array_out(output, X_transform_input): + assert isinstance(output, pd.DataFrame) + # make sure the indices are the same if X_transform_input is a + # dataframe + if isinstance(X_transform_input, pd.DataFrame): + assert_array_equal(output.index, X_transform_input.index) + + _check_array_out_transformers(name, estimator_orig, "pandas", + make_array, check_array_out) + + +def check_array_out_xarray(name, estimator_orig, strict_mode=True): + try: + import xarray as xr + except ImportError: + raise SkipTest("xarray is not installed: not testing for " + "array_out with xarray") + + def make_array(array, column_names, index): + return xr.DataArray(array, dims=("index", "columns"), + coords={"index": index, "columns": column_names}) + + def check_array_out(output, X_transform_input): + assert isinstance(output, xr.DataArray) + # make sure the indices are the same if X_transform_input is a + # dataframe + if isinstance(X_transform_input, xr.DataArray): + dims = X_transform_input.dims + X_transform_input_index = X_transform_input.coords[dims[0]] + output_index = output.coords[dims[0]] + assert_array_equal(X_transform_input_index, output_index) + + _check_array_out_transformers(name, estimator_orig, "xarray", + make_array, check_array_out) diff --git a/sklearn/utils/tests/test_array_out.py b/sklearn/utils/tests/test_array_out.py new file mode 100644 index 0000000000000..57ed976eb6556 --- /dev/null +++ b/sklearn/utils/tests/test_array_out.py @@ -0,0 +1,172 @@ +import pytest +import numpy as np +from numpy.testing import assert_array_equal +from scipy.sparse import csr_matrix + +from sklearn import config_context +from sklearn.utils._array_out import _make_array_out +from sklearn.utils._array_out import _get_feature_names +from sklearn.utils._testing import assert_allclose_dense_sparse + + +@pytest.mark.parametrize("X", [ + np.array([[1, 2, 3], [4, 5, 6]]), + [[1, 2, 3], [4, 5, 6]], + csr_matrix([[1, 0, 0], [0, 0, 1]]) +], ids=['ndarray', 'list', 'sparse']) +def test_feature_names_no_names(X): + assert _get_feature_names(X) is None + + +@pytest.mark.parametrize("column_name", [ + "columns", "my_special_name" +]) +def test_feature_names_xarray(column_name): + # the column names will always be the second axes + xr = pytest.importorskip("xarray") + X = np.array([[1, 2, 3], [4, 5, 6]]) + feature_names = [f"feature_{i}" for i in range(3)] + X = xr.DataArray(X, dims=("index", column_name), + coords={column_name: feature_names}) + + names = _get_feature_names(X) + assert_array_equal(names, feature_names) + + +def test_feature_names_pandas(): + pd = pytest.importorskip("pandas") + X = np.array([[1, 2, 3], [4, 5, 6]]) + feature_names = [f"feature_{i}" for i in range(3)] + X = pd.DataFrame(X, columns=feature_names) + + names = _get_feature_names(X) + assert_array_equal(names, feature_names) + + +@pytest.mark.parametrize("X_out", [ + np.array([[1, 2, 3], [2, 3, 4]]), + csr_matrix([[1, 0, 0], [0, 0, 1]]) +], ids=['ndarray', 'sparse']) +def test_make_array_out_default(X_out): + # array_out='default' is an noop + X_orig = np.ones((2, 10)) + with config_context(array_out='default'): + out = _make_array_out(X_out, X_orig, lambda names: names) + assert out is X_out + + +@pytest.mark.parametrize("X_out", [ + np.array([[1, 2, 3], [2, 3, 4]]), + csr_matrix([[1, 0, 0], [0, 0, 1]]) +], ids=['ndarray', 'sparse']) +def test_make_array_out_error(X_out): + X_orig = 
diff --git a/sklearn/utils/tests/test_array_out.py b/sklearn/utils/tests/test_array_out.py
new file mode 100644
index 0000000000000..57ed976eb6556
--- /dev/null
+++ b/sklearn/utils/tests/test_array_out.py
@@ -0,0 +1,172 @@
+import pytest
+import numpy as np
+from numpy.testing import assert_array_equal
+from scipy.sparse import csr_matrix
+
+from sklearn import config_context
+from sklearn.utils._array_out import _make_array_out
+from sklearn.utils._array_out import _get_feature_names
+from sklearn.utils._testing import assert_allclose_dense_sparse
+
+
+@pytest.mark.parametrize("X", [
+    np.array([[1, 2, 3], [4, 5, 6]]),
+    [[1, 2, 3], [4, 5, 6]],
+    csr_matrix([[1, 0, 0], [0, 0, 1]])
+], ids=['ndarray', 'list', 'sparse'])
+def test_feature_names_no_names(X):
+    assert _get_feature_names(X) is None
+
+
+@pytest.mark.parametrize("column_name", [
+    "columns", "my_special_name"
+])
+def test_feature_names_xarray(column_name):
+    # the column names will always be on the second axis
+    xr = pytest.importorskip("xarray")
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    feature_names = [f"feature_{i}" for i in range(3)]
+    X = xr.DataArray(X, dims=("index", column_name),
+                     coords={column_name: feature_names})
+
+    names = _get_feature_names(X)
+    assert_array_equal(names, feature_names)
+
+
+def test_feature_names_pandas():
+    pd = pytest.importorskip("pandas")
+    X = np.array([[1, 2, 3], [4, 5, 6]])
+    feature_names = [f"feature_{i}" for i in range(3)]
+    X = pd.DataFrame(X, columns=feature_names)
+
+    names = _get_feature_names(X)
+    assert_array_equal(names, feature_names)
+
+
+@pytest.mark.parametrize("X_out", [
+    np.array([[1, 2, 3], [2, 3, 4]]),
+    csr_matrix([[1, 0, 0], [0, 0, 1]])
+], ids=['ndarray', 'sparse'])
+def test_make_array_out_default(X_out):
+    # array_out='default' is a no-op
+    X_orig = np.ones((2, 10))
+    with config_context(array_out='default'):
+        out = _make_array_out(X_out, X_orig, lambda names: names)
+    assert out is X_out
+
+
+@pytest.mark.parametrize("X_out", [
+    np.array([[1, 2, 3], [2, 3, 4]]),
+    csr_matrix([[1, 0, 0], [0, 0, 1]])
+], ids=['ndarray', 'sparse'])
+def test_make_array_out_error(X_out):
+    X_orig = np.ones((2, 10))
+    msg = "array_out must be 'default', 'pandas' or 'xarray'"
+    with config_context(array_out='bad'):
+        with pytest.raises(ValueError, match=msg):
+            _make_array_out(X_out, X_orig, lambda names: names)
+
+
+@pytest.mark.parametrize("is_sparse", [True, False])
+@pytest.mark.parametrize("out_features", [
+    ['feat_1', 'feat_2'], None
+])
+def test_make_array_out_pandas(is_sparse, out_features):
+    pd = pytest.importorskip("pandas")
+
+    index = [2, 3]
+    X_orig = pd.DataFrame(np.array([[1, 2, 3], [3, 4, 5]]),
+                          columns=[f'col_{i}' for i in range(3)],
+                          index=index)
+
+    X_out = np.array([[0, 1], [1, 0]])
+    if is_sparse:
+        X_out = csr_matrix(X_out)
+
+    with config_context(array_out="pandas"):
+        df_out = _make_array_out(X_out, X_orig, lambda: out_features)
+
+    assert isinstance(df_out, pd.DataFrame)
+    assert_array_equal(df_out.index, index)
+
+    if out_features is None:
+        # default output feature names
+        assert_array_equal(df_out.columns, ["X0", "X1"])
+    else:
+        assert_array_equal(df_out.columns, out_features)
+
+    if is_sparse:
+        unwrapped = df_out.sparse.to_coo()
+    else:
+        unwrapped = df_out.to_numpy()
+
+    assert_allclose_dense_sparse(X_out, unwrapped)
+
+
+def test_make_array_out_pandas_zero_features():
+    pd = pytest.importorskip("pandas")
+
+    index = [2, 3]
+    X_orig = pd.DataFrame(np.array([[1, 2, 3], [3, 4, 5]]),
+                          columns=[f'col_{i}' for i in range(3)],
+                          index=index)
+
+    X_out = np.zeros((2, 0))
+    with config_context(array_out="pandas"):
+        df_out = _make_array_out(X_out, X_orig, lambda: [])
+    assert isinstance(df_out, pd.DataFrame)
+    assert_array_equal(df_out.index, index)
+
+
+@pytest.mark.parametrize("is_sparse", [True, False])
+@pytest.mark.parametrize("out_features", [
+    ['feat_1', 'feat_2'], None
+])
+def test_make_array_out_xarray(is_sparse, out_features):
+    xr = pytest.importorskip("xarray")
+    if is_sparse:
+        pytest.importorskip("sparse")
+
+    index = [2, 3]
+    X_orig = xr.DataArray(np.array([[1, 2, 3], [3, 4, 5]]),
+                          dims=("index", "columns"),
+                          coords={"index": index,
+                                  "columns": [f"col_{i}" for i in range(3)]})
+
+    X_out = np.array([[0, 1], [1, 0]])
+    if is_sparse:
+        X_out = csr_matrix(X_out)
+
+    with config_context(array_out="xarray"):
+        df_out = _make_array_out(X_out, X_orig, lambda: out_features)
+
+    assert isinstance(df_out, xr.DataArray)
+    assert_array_equal(df_out.coords["index"], index)
+
+    if out_features is None:
+        # default output feature names
+        assert_array_equal(df_out.coords["columns"], ["X0", "X1"])
+    else:
+        assert_array_equal(df_out.coords["columns"], out_features)
+
+    unwrapped = df_out.data
+    if is_sparse:
+        unwrapped = unwrapped.to_scipy_sparse()
+
+    assert_allclose_dense_sparse(X_out, unwrapped)
+
+
+def test_make_array_out_xarray_zero_features():
+    xr = pytest.importorskip("xarray")
+
+    index = [2, 3]
+    X_orig = xr.DataArray(np.array([[1, 2, 3], [3, 4, 5]]),
+                          dims=("index", "columns"),
+                          coords={"index": index,
+                                  "columns": [f"col_{i}" for i in range(3)]})
+
+    X_out = np.zeros((2, 0))
+    with config_context(array_out="xarray"):
+        df_out = _make_array_out(X_out, X_orig, lambda: [])
+    assert isinstance(df_out, xr.DataArray)
+    assert_array_equal(df_out.coords["index"], index)
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
index a0c88ef36e281..8ef05d8044abd 100644
--- a/sklearn/utils/validation.py
+++ b/sklearn/utils/validation.py
@@ -564,9 +564,8 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True,
         estimator_name = "Estimator"
     context = " by %s" % estimator_name if estimator is not None else ""
 
-    # When all dataframe columns are sparse, convert to a sparse array
+    # handle pandas sparse DataFrames by checking for the `sparse` accessor
    if hasattr(array, 'sparse') and array.ndim > 1:
-        # DataFrame.sparse only supports `to_coo`
         array = array.sparse.to_coo()
         if array.dtype == np.dtype('object'):
             unique_dtypes = set(
@@ -580,6 +579,39 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True,
                 "Sparse extension arrays should all have the same "
                 "numeric type.")
 
+    # pydata/sparse arrays,
+    # or experimental pandas with pydata/sparse blocks based on
+    # https://github.com/TomAugspurger/pandas/tree/33182-sparse-block
+    if (hasattr(array, 'to_scipy_sparse') or
+            (hasattr(array, 'data') and
+             hasattr(array.data, 'to_scipy_sparse'))):
+
+        if hasattr(array.data, 'to_scipy_sparse'):
+            # xarray wrapping a pydata/sparse array
+            sparse_array = array.data
+        else:
+            # plain pydata/sparse array
+            sparse_array = array
+        first_sparse = accept_sparse
+
+        # pick the first acceptable sparse format
+        if isinstance(accept_sparse, bool):
+            if not accept_sparse:
+                raise TypeError('A sparse array was passed, but dense '
+                                'data is required. Use X.todense() to '
+                                'convert to a dense numpy array.')
+            first_sparse = 'csr'
+
+        if isinstance(accept_sparse, (list, tuple)):
+            first_sparse = first_sparse[0]
+
+        if first_sparse == 'csr':
+            array = sparse_array.tocsr()
+        elif first_sparse == 'csc':
+            array = sparse_array.tocsc()
+        elif first_sparse == 'coo':
+            array = sparse_array.to_scipy_sparse()
+        else:
+            array = sparse_array.tocsr()
+
     if sp.issparse(array):
         _ensure_no_complex_data(array)
         array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
@@ -670,8 +702,13 @@ def check_array(array, accept_sparse=False, *, accept_large_sparse=True,
                     % (n_features, array.shape, ensure_min_features,
                        context))
 
-    if copy and np.may_share_memory(array, array_orig):
-        array = np.array(array, dtype=dtype, order=order)
+    # skip the copy check for xarray-wrapped pydata/sparse arrays because
+    # np.may_share_memory would invoke the `__array__` protocol
+    if not (hasattr(array_orig, 'to_scipy_sparse') or
+            (hasattr(array, 'data') and
+             hasattr(array.data, 'to_scipy_sparse'))):
+        if copy and np.may_share_memory(array, array_orig):
+            array = np.array(array, dtype=dtype, order=order)
 
     return array
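
Finally, a sketch of the `check_array` path added above for pydata/sparse
inputs (illustrative only; assumes the `sparse` package is installed):

    import scipy.sparse as sp
    import sparse as pydata_sparse
    from sklearn.utils.validation import check_array

    X = pydata_sparse.COO.from_scipy_sparse(sp.random(5, 3, density=0.5))

    # With accept_sparse='csc' the pydata/sparse input is converted to the
    # first acceptable scipy.sparse format before the usual checks run
    X_checked = check_array(X, accept_sparse='csc')
    print(sp.issparse(X_checked), X_checked.format)  # True csc
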