From 61b4ba507b780c6a6207201afe67f54d765987b7 Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sat, 6 Nov 2021 20:32:41 +1300 Subject: [PATCH 01/14] Add get_feature_names_out to FunctionTransformer --- doc/whats_new/v1.1.rst | 7 + .../preprocessing/_function_transformer.py | 75 +++++++- .../tests/test_function_transformer.py | 161 ++++++++++++++++++ 3 files changed, 242 insertions(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index afc8c269baa23..eaaddc2fed3c0 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -163,6 +163,13 @@ Changelog instead of `__init__`. :pr:`21434` by :user:`Krum Arnaudov `. +- |Enhancement| Added the `get_feature_names_out` method and a new parameter + `feature_names_out` to :class:`preprocessing.FunctionTransformer`. By default, + `get_feature_names_out` returns the input feature names, but you can set + `feature_names_out` to return a different list, which is especially useful + when the number of output features differs from the number of input features. + by :user:`Aurélien Geron `. + :mod:`sklearn.svm` .................. diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 595ca0e0bbc1b..ee998a5c68de3 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -1,7 +1,12 @@ import warnings from ..base import BaseEstimator, TransformerMixin -from ..utils.validation import _allclose_dense_sparse, check_array +from ..utils.validation import ( + _allclose_dense_sparse, + _check_feature_names_in, + check_array, + column_or_1d, +) def _identity(X): @@ -61,6 +66,17 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): .. versionadded:: 0.20 + feature_names_out : array-like of str, or callable, or None, default=None + Determines the list of feature names that will be returned by the + `get_feature_names_out` method. 
If you pass a callable, then it must + take two positional arguments: this `FunctionTransformer` (`self`) and + an array-like of input feature names (`input_features`). It must return + an array-like of output feature names. + + See ``get_feature_names_out`` for more details. + + .. versionadded:: 1.1 + kw_args : dict, default=None Dictionary of additional keyword arguments to pass to func. @@ -113,6 +129,7 @@ def __init__( validate=False, accept_sparse=False, check_inverse=True, + feature_names_out=None, kw_args=None, inv_kw_args=None, ): @@ -121,6 +138,7 @@ def __init__( self.validate = validate self.accept_sparse = accept_sparse self.check_inverse = check_inverse + self.feature_names_out = feature_names_out self.kw_args = kw_args self.inv_kw_args = inv_kw_args @@ -198,6 +216,61 @@ def inverse_transform(self, X): X = check_array(X, accept_sparse=self.accept_sparse) return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args) + def get_feature_names_out(self, input_features=None): + """Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input feature names. + + - If `input_features` is None, then `feature_names_in_` is + used as the input feature names. If `feature_names_in_` is not + defined, then names are generated: + `[x0, x1, ..., x(n_features_in_)]`. + - If `input_features` is array-like, then `input_features` must + match `feature_names_in_` if `feature_names_in_` is defined. + + Returns + ------- + feature_names_out : ndarray of str objects + Transformed feature names. + + - If `feature_names_out` is None, the input feature names are + returned (see `input_features` above). This requires + `n_features_in_` to be defined, which in turn requires + `validate=True`. + - If `feature_names_out` is an array-like of strings, then it + is returned, ignoring the input feature names. 
+ - If `feature_names_out` is a callable, then it is called with two + arguments, `self` and `input_features`, and its return value is + returned by this method. + """ + if hasattr(self, "n_features_in_") or input_features is not None: + input_features = _check_feature_names_in(self, input_features) + elif input_features is not None: + input_features = column_or_1d(input_features) + if self.feature_names_out is None: + if input_features is None: + raise ValueError( + "If 'feature_names_out' is None, then 'input_features' " + "must be passed, or 'n_features_in_' must be defined. If " + "you set 'validate' to 'True', then 'n_features_in_' will " + "be set automatically when 'fit' is called." + ) + names_out = input_features + elif callable(self.feature_names_out): + names_out = self.feature_names_out(self, input_features) + elif isinstance(self.feature_names_out, str): + raise ValueError( + "'feature_names_out' must not be a string. If there is a " + "single output feature name, then set 'feature_names_out' to " + "an array-like containing just that name." 
+ ) + else: + names_out = self.feature_names_out + return column_or_1d(names_out) + def _transform(self, X, func=None, kw_args=None): if func is None: func = _identity diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index b1ba9ebe6b762..1e31ca8329500 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -176,6 +176,167 @@ def test_function_transformer_frame(): assert hasattr(X_df_trans, "loc") +@pytest.mark.parametrize( + "X, feature_names_out, input_features, expected", + [ + ( + # NumPy inputs, default behavior: generate names + np.random.rand(100, 3), + None, + None, + ("x0", "x1", "x2"), + ), + ( + # Pandas input, default behavior: use input feature names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + None, + None, + ("a", "b"), + ), + ( + # NumPy inputs, feature_names_out=list of names + np.random.rand(100, 3), + ("a", "b", "c", "d", "e"), + None, + ("a", "b", "c", "d", "e"), + ), + ( + # Pandas input, feature_names_out= list of names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + ("c", "d", "e"), + None, + ("c", "d", "e"), + ), + ( + # NumPy input, feature_names_out=callable + np.random.rand(100, 3), + lambda transformer, input_features: ("a", "b"), + None, + ("a", "b"), + ), + ( + # Pandas input, feature_names_out=callable + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: ("c", "d", "e"), + None, + ("c", "d", "e"), + ), + ( + # NumPy input, feature_names_out=callable – default input_features + np.random.rand(100, 3), + lambda transformer, input_features: tuple(input_features) + ("a",), + None, + ("x0", "x1", "x2", "a"), + ), + ( + # Pandas input, feature_names_out=callable – default input_features + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: tuple(input_features) + ("c",), + None, + 
("a", "b", "c"), + ), + ( + # NumPy input, input_features=list of names + np.random.rand(100, 3), + None, + ("a", "b", "c"), + ("a", "b", "c"), + ), + ( + # Pandas input, input_features=list of names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + None, + ("a", "b"), # must match feature_names_in_ + ("a", "b"), + ), + ( + # NumPy input, both feature_names_out and input_features are names + np.random.rand(100, 3), + ("c", "d"), + ("e", "f", "g"), + ("c", "d"), + ), + ( + # Pandas input, both feature_names_out and input_features are names + {"a": np.random.rand(100), "b": np.random.rand(100)}, + ("c", "d", "e"), + ("a", "b"), # must match feature_names_in_ + ("c", "d", "e"), + ), + ( + # NumPy input, feature_names_out=callable, input_features=list + np.random.rand(100, 3), + lambda transformer, input_features: tuple(input_features) + ("d",), + ("a", "b", "c"), + ("a", "b", "c", "d"), + ), + ( + # Pandas input, feature_names_out=callable, input_features=list + {"a": np.random.rand(100), "b": np.random.rand(100)}, + lambda transformer, input_features: tuple(input_features) + ("c",), + ("a", "b"), # must match feature_names_in_ + ("a", "b", "c"), + ), + ], +) +def test_function_transformer_get_feature_names_out( + X, feature_names_out, input_features, expected +): + if isinstance(X, dict): + pd = pytest.importorskip("pandas") + X = pd.DataFrame(X) + + transformer = FunctionTransformer( + feature_names_out=feature_names_out, validate=True + ) + X_trans = transformer.fit_transform(X) + assert tuple(transformer.get_feature_names_out(input_features)) == expected + + +def test_function_transformer_get_feature_names_out_without_validation(): + transformer = FunctionTransformer(validate=False) + X = np.random.rand(100, 2) + X_trans = transformer.fit_transform(X) + + msg = "'n_features_in_' must be defined" + with pytest.raises(ValueError, match=msg) as excinfo: + transformer.get_feature_names_out() + + assert tuple(transformer.get_feature_names_out(["a", "b"])) == 
("a", "b") + + +def test_function_transformer_feature_names_out_string(): + transformer = FunctionTransformer(feature_names_out="x0") + X = np.random.rand(100, 2) + X_trans = transformer.fit_transform(X) + + msg = "'feature_names_out' must not be a string" + with pytest.raises(ValueError, match=msg) as excinfo: + transformer.get_feature_names_out() + + +def test_function_transformer_feature_names_out_uses_estimator(): + def add_n_random_features(X, n): + return np.concatenate([X, np.random.rand(len(X), n)], axis=1) + + def feature_names_out(transformer, input_features): + n = transformer.kw_args["n"] + return list(input_features) + [f"rnd{i}" for i in range(n)] + + transformer = FunctionTransformer( + func=add_n_random_features, + feature_names_out=feature_names_out, + kw_args=dict(n=3), + validate=True, + ) + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)}) + X_trans = transformer.fit_transform(df) + names_out = transformer.get_feature_names_out() + + assert tuple(names_out) == ("a", "b", "rnd0", "rnd1", "rnd2") + + def test_function_transformer_validate_inverse(): """Test that function transformer does not reset estimator in `inverse_transform`.""" From faddcf58f3c94ee3ceb12946eb7e53961a260bcc Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sat, 6 Nov 2021 21:17:03 +1300 Subject: [PATCH 02/14] Add PR number --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index eaaddc2fed3c0..c74bb67f574e3 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -168,7 +168,7 @@ Changelog `get_feature_names_out` returns the input feature names, but you can set `feature_names_out` to return a different list, which is especially useful when the number of output features differs from the number of input features. - by :user:`Aurélien Geron `. + :pr:`21569` by :user:`Aurélien Geron `. 
:mod:`sklearn.svm` .................. From c4827692a0dba50821dbf24e1fa24ff9d910124b Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sat, 6 Nov 2021 21:23:05 +1300 Subject: [PATCH 03/14] Remove unused variables --- .../preprocessing/tests/test_function_transformer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 1e31ca8329500..65c70d9d1dc26 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -289,17 +289,17 @@ def test_function_transformer_get_feature_names_out( transformer = FunctionTransformer( feature_names_out=feature_names_out, validate=True ) - X_trans = transformer.fit_transform(X) + transformer.fit_transform(X) assert tuple(transformer.get_feature_names_out(input_features)) == expected def test_function_transformer_get_feature_names_out_without_validation(): transformer = FunctionTransformer(validate=False) X = np.random.rand(100, 2) - X_trans = transformer.fit_transform(X) + transformer.fit_transform(X) msg = "'n_features_in_' must be defined" - with pytest.raises(ValueError, match=msg) as excinfo: + with pytest.raises(ValueError, match=msg): transformer.get_feature_names_out() assert tuple(transformer.get_feature_names_out(["a", "b"])) == ("a", "b") @@ -308,10 +308,10 @@ def test_function_transformer_get_feature_names_out_without_validation(): def test_function_transformer_feature_names_out_string(): transformer = FunctionTransformer(feature_names_out="x0") X = np.random.rand(100, 2) - X_trans = transformer.fit_transform(X) + transformer.fit_transform(X) msg = "'feature_names_out' must not be a string" - with pytest.raises(ValueError, match=msg) as excinfo: + with pytest.raises(ValueError, match=msg): transformer.get_feature_names_out() @@ -331,7 +331,7 @@ def feature_names_out(transformer, input_features): ) pd = 
pytest.importorskip("pandas") df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)}) - X_trans = transformer.fit_transform(df) + transformer.fit_transform(df) names_out = transformer.get_feature_names_out() assert tuple(names_out) == ("a", "b", "rnd0", "rnd1", "rnd2") From 012db735a247b8b9a965e833ef055790cb71129f Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sat, 6 Nov 2021 21:46:18 +1300 Subject: [PATCH 04/14] Add missing ` (unrelated to this PR) --- doc/whats_new/v1.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index c74bb67f574e3..7ec038e2f9d0a 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -109,7 +109,7 @@ Changelog - |API| The `verbose` parameter was deprecated for :class:`impute.SimpleImputer`. A warning will always be raised upon the removal of empty columns. :pr:`21448` by :user:`Oleh Kozynets ` and - :user:`Christian Ritter . + :user:`Christian Ritter `. - |Fix| Fix a bug in :class:`linear_model.RidgeClassifierCV` where the method `predict` was performing an `argmax` on the scores obtained from From 2f0aac01921df5a016b7d49b364f3b465e62d664 Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sat, 6 Nov 2021 22:42:13 +1300 Subject: [PATCH 05/14] Remove unreachable elif block --- sklearn/preprocessing/_function_transformer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index ee998a5c68de3..0a639c75d3959 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -248,8 +248,6 @@ def get_feature_names_out(self, input_features=None): """ if hasattr(self, "n_features_in_") or input_features is not None: input_features = _check_feature_names_in(self, input_features) - elif input_features is not None: - input_features = column_or_1d(input_features) if self.feature_names_out is None: if 
input_features is None: raise ValueError( From f91539a47cef3190e66ea91bf93fdbddcb46683d Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sun, 7 Nov 2021 10:32:56 +1300 Subject: [PATCH 06/14] Remove option to set feature_names_out to an array-like of str, and make default 'one-to-one' --- doc/whats_new/v1.1.rst | 5 ++- .../preprocessing/_function_transformer.py | 38 ++++++++-------- .../tests/test_function_transformer.py | 45 ++++--------------- 3 files changed, 31 insertions(+), 57 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 7ec038e2f9d0a..edc705cc83fca 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -166,8 +166,9 @@ Changelog - |Enhancement| Added the `get_feature_names_out` method and a new parameter `feature_names_out` to :class:`preprocessing.FunctionTransformer`. By default, `get_feature_names_out` returns the input feature names, but you can set - `feature_names_out` to return a different list, which is especially useful - when the number of output features differs from the number of input features. + `feature_names_out` to a callable that returns the output feature names + instead. This is especially useful when the transformer changes the number of + features. :pr:`21569` by :user:`Aurélien Geron `. :mod:`sklearn.svm` diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 0a639c75d3959..b5c7b7de3fc6e 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -66,12 +66,14 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): .. versionadded:: 0.20 - feature_names_out : array-like of str, or callable, or None, default=None + feature_names_out : callable or 'one-to-one', default='one-to-one' Determines the list of feature names that will be returned by the - `get_feature_names_out` method. 
If you pass a callable, then it must - take two positional arguments: this `FunctionTransformer` (`self`) and - an array-like of input feature names (`input_features`). It must return - an array-like of output feature names. + `get_feature_names_out` method. If it is 'one-to-one', then the output + feature names will be equal to the input feature names. If it is a + callable, then it must take two positional arguments: this + `FunctionTransformer` (`self`) and an array-like of input feature names + (`input_features`). It must return an array-like of output feature + names. See ``get_feature_names_out`` for more details. @@ -129,7 +131,7 @@ def __init__( validate=False, accept_sparse=False, check_inverse=True, - feature_names_out=None, + feature_names_out="one-to-one", kw_args=None, inv_kw_args=None, ): @@ -236,19 +238,18 @@ def get_feature_names_out(self, input_features=None): feature_names_out : ndarray of str objects Transformed feature names. - - If `feature_names_out` is None, the input feature names are - returned (see `input_features` above). This requires - `n_features_in_` to be defined, which in turn requires - `validate=True`. - - If `feature_names_out` is an array-like of strings, then it - is returned, ignoring the input feature names. + - If `feature_names_out` is 'one-to-one', the input feature names + are returned (see `input_features` above). This requires + `feature_names_in_` to be defined (or at least `n_features_in_`), + which is done automatically if `validate=True`, or you can set + them in `func`. - If `feature_names_out` is a callable, then it is called with two arguments, `self` and `input_features`, and its return value is returned by this method. 
""" if hasattr(self, "n_features_in_") or input_features is not None: input_features = _check_feature_names_in(self, input_features) - if self.feature_names_out is None: + if self.feature_names_out == "one-to-one": if input_features is None: raise ValueError( "If 'feature_names_out' is None, then 'input_features' " @@ -259,14 +260,13 @@ def get_feature_names_out(self, input_features=None): names_out = input_features elif callable(self.feature_names_out): names_out = self.feature_names_out(self, input_features) - elif isinstance(self.feature_names_out, str): + else: raise ValueError( - "'feature_names_out' must not be a string. If there is a " - "single output feature name, then set 'feature_names_out' to " - "an array-like containing just that name." + "'feature_names_out' must either be \"one-to-one\" or a " + "callable with two arguments: the function transformer and " + "an array-like of input feature names. The callable must " + "return an array-like of output feature names." ) - else: - names_out = self.feature_names_out return column_or_1d(names_out) def _transform(self, X, func=None, kw_args=None): diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 65c70d9d1dc26..0a07a8821c54f 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -182,31 +182,17 @@ def test_function_transformer_frame(): ( # NumPy inputs, default behavior: generate names np.random.rand(100, 3), - None, + "one-to-one", None, ("x0", "x1", "x2"), ), ( # Pandas input, default behavior: use input feature names {"a": np.random.rand(100), "b": np.random.rand(100)}, - None, + "one-to-one", None, ("a", "b"), ), - ( - # NumPy inputs, feature_names_out=list of names - np.random.rand(100, 3), - ("a", "b", "c", "d", "e"), - None, - ("a", "b", "c", "d", "e"), - ), - ( - # Pandas input, feature_names_out= list of names - {"a": 
np.random.rand(100), "b": np.random.rand(100)}, - ("c", "d", "e"), - None, - ("c", "d", "e"), - ), ( # NumPy input, feature_names_out=callable np.random.rand(100, 3), @@ -238,31 +224,17 @@ def test_function_transformer_frame(): ( # NumPy input, input_features=list of names np.random.rand(100, 3), - None, + "one-to-one", ("a", "b", "c"), ("a", "b", "c"), ), ( # Pandas input, input_features=list of names {"a": np.random.rand(100), "b": np.random.rand(100)}, - None, + "one-to-one", ("a", "b"), # must match feature_names_in_ ("a", "b"), ), - ( - # NumPy input, both feature_names_out and input_features are names - np.random.rand(100, 3), - ("c", "d"), - ("e", "f", "g"), - ("c", "d"), - ), - ( - # Pandas input, both feature_names_out and input_features are names - {"a": np.random.rand(100), "b": np.random.rand(100)}, - ("c", "d", "e"), - ("a", "b"), # must match feature_names_in_ - ("c", "d", "e"), - ), ( # NumPy input, feature_names_out=callable, input_features=list np.random.rand(100, 3), @@ -302,15 +274,16 @@ def test_function_transformer_get_feature_names_out_without_validation(): with pytest.raises(ValueError, match=msg): transformer.get_feature_names_out() - assert tuple(transformer.get_feature_names_out(["a", "b"])) == ("a", "b") + assert tuple(transformer.get_feature_names_out(("a", "b"))) == ("a", "b") -def test_function_transformer_feature_names_out_string(): - transformer = FunctionTransformer(feature_names_out="x0") +@pytest.mark.parametrize("feature_names_out", ['x0', ['x0'], ('x0',)]) +def test_function_transformer_feature_names_out_string(feature_names_out): + transformer = FunctionTransformer(feature_names_out=feature_names_out) X = np.random.rand(100, 2) transformer.fit_transform(X) - msg = "'feature_names_out' must not be a string" + msg = """'feature_names_out' must either be "one-to-one" or a callable""" with pytest.raises(ValueError, match=msg): transformer.get_feature_names_out() From d3882f29aab0d0ee0aa4e0ca08b0b83f99a71e48 Mon Sep 17 00:00:00 2001 
From: Aurelien Geron Date: Sun, 7 Nov 2021 11:19:27 +1300 Subject: [PATCH 07/14] Default feature_names_out to None instead of 'one-to-one' --- doc/whats_new/v1.1.rst | 8 +++--- .../preprocessing/_function_transformer.py | 28 ++++++++++++------- .../tests/test_function_transformer.py | 6 ++-- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index edc705cc83fca..2b9d8bb3a27a7 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -165,10 +165,10 @@ Changelog - |Enhancement| Added the `get_feature_names_out` method and a new parameter `feature_names_out` to :class:`preprocessing.FunctionTransformer`. By default, - `get_feature_names_out` returns the input feature names, but you can set - `feature_names_out` to a callable that returns the output feature names - instead. This is especially useful when the transformer changes the number of - features. + `get_feature_names_out` raises an exception, but you can set + `feature_names_out` to 'one-to-one' to return the input features, or to a + callable that returns the output feature names. This is especially useful + when the transformer changes the number of features. :pr:`21569` by :user:`Aurélien Geron `. :mod:`sklearn.svm` diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index b5c7b7de3fc6e..2a395e21c5114 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -66,14 +66,15 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): .. versionadded:: 0.20 - feature_names_out : callable or 'one-to-one', default='one-to-one' + feature_names_out : callable, 'one-to-one' or None, default=None Determines the list of feature names that will be returned by the `get_feature_names_out` method. If it is 'one-to-one', then the output feature names will be equal to the input feature names. 
If it is a callable, then it must take two positional arguments: this `FunctionTransformer` (`self`) and an array-like of input feature names (`input_features`). It must return an array-like of output feature - names. + names. If it is None, then `get_feature_names_out` will raise a + ValueError. See ``get_feature_names_out`` for more details. @@ -131,7 +132,7 @@ def __init__( validate=False, accept_sparse=False, check_inverse=True, - feature_names_out="one-to-one", + feature_names_out=None, kw_args=None, inv_kw_args=None, ): @@ -240,22 +241,29 @@ def get_feature_names_out(self, input_features=None): - If `feature_names_out` is 'one-to-one', the input feature names are returned (see `input_features` above). This requires - `feature_names_in_` to be defined (or at least `n_features_in_`), - which is done automatically if `validate=True`, or you can set - them in `func`. + `feature_names_in_` and/or `n_features_in_` to be defined, which + is done automatically if `validate=True`. Alternatively, you can + set them in `func`. - If `feature_names_out` is a callable, then it is called with two arguments, `self` and `input_features`, and its return value is returned by this method. + + Raises + ------ + ValueError + When `feature_names_out` is None. """ if hasattr(self, "n_features_in_") or input_features is not None: input_features = _check_feature_names_in(self, input_features) if self.feature_names_out == "one-to-one": if input_features is None: raise ValueError( - "If 'feature_names_out' is None, then 'input_features' " - "must be passed, or 'n_features_in_' must be defined. If " - "you set 'validate' to 'True', then 'n_features_in_' will " - "be set automatically when 'fit' is called." + "When 'feature_names_out' is 'one-to-one', either " + "'input_features' must be passed, or 'feature_names_in_' " + "and/or 'n_features_in_' must be defined. If you set " + "'validate' to 'True', then they will be defined " + "automatically when 'fit' is called. 
Alternatively, you " + "can set them in 'func'." ) names_out = input_features elif callable(self.feature_names_out): diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 0a07a8821c54f..78b971001358e 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -266,18 +266,18 @@ def test_function_transformer_get_feature_names_out( def test_function_transformer_get_feature_names_out_without_validation(): - transformer = FunctionTransformer(validate=False) + transformer = FunctionTransformer(feature_names_out="one-to-one", validate=False) X = np.random.rand(100, 2) transformer.fit_transform(X) - msg = "'n_features_in_' must be defined" + msg = "When 'feature_names_out' is 'one-to-one', either" with pytest.raises(ValueError, match=msg): transformer.get_feature_names_out() assert tuple(transformer.get_feature_names_out(("a", "b"))) == ("a", "b") -@pytest.mark.parametrize("feature_names_out", ['x0', ['x0'], ('x0',)]) +@pytest.mark.parametrize("feature_names_out", ["x0", ["x0"], ("x0",)]) def test_function_transformer_feature_names_out_string(feature_names_out): transformer = FunctionTransformer(feature_names_out=feature_names_out) X = np.random.rand(100, 2) From 6e7f13b3ab23334d756c0b64b3a5004cc62168ce Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sun, 7 Nov 2021 11:31:31 +1300 Subject: [PATCH 08/14] Use available_if to ensure get_feature_names_out is not defined when feature_names_out=None --- doc/whats_new/v1.1.rst | 11 ++++++----- sklearn/preprocessing/_function_transformer.py | 13 +++++-------- .../tests/test_function_transformer.py | 10 ++++++++++ 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 2b9d8bb3a27a7..726e7bf4531e9 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -164,11 +164,12 @@ Changelog :pr:`21434` by 
:user:`Krum Arnaudov `. - |Enhancement| Added the `get_feature_names_out` method and a new parameter - `feature_names_out` to :class:`preprocessing.FunctionTransformer`. By default, - `get_feature_names_out` raises an exception, but you can set - `feature_names_out` to 'one-to-one' to return the input features, or to a - callable that returns the output feature names. This is especially useful - when the transformer changes the number of features. + `feature_names_out` to :class:`preprocessing.FunctionTransformer`. You can set + `feature_names_out` to 'one-to-one' to use the input feature names as the + output feature names, or you can set it to a callable that returns the output + feature names. This is especially useful when the transformer changes the + number of features. If `feature_names_out` is None (which is the default), + then `get_feature_names_out` is not defined. :pr:`21569` by :user:`Aurélien Geron `. :mod:`sklearn.svm` diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 2a395e21c5114..dd627aa472052 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -1,6 +1,7 @@ import warnings from ..base import BaseEstimator, TransformerMixin +from ..utils.metaestimators import available_if from ..utils.validation import ( _allclose_dense_sparse, _check_feature_names_in, @@ -8,7 +9,6 @@ column_or_1d, ) - def _identity(X): """The identity function.""" return X @@ -73,8 +73,8 @@ class FunctionTransformer(TransformerMixin, BaseEstimator): callable, then it must take two positional arguments: this `FunctionTransformer` (`self`) and an array-like of input feature names (`input_features`). It must return an array-like of output feature - names. If it is None, then `get_feature_names_out` will raise a - ValueError. + names. The `get_feature_names_out` method is only defined if + `feature_names_out` is not None. 
See ``get_feature_names_out`` for more details. @@ -219,8 +219,10 @@ def inverse_transform(self, X): X = check_array(X, accept_sparse=self.accept_sparse) return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args) + @available_if(lambda transformer: transformer.feature_names_out is not None) def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. + This method is only defined if `feature_names_out` is not None. Parameters ---------- @@ -247,11 +249,6 @@ def get_feature_names_out(self, input_features=None): - If `feature_names_out` is a callable, then it is called with two arguments, `self` and `input_features`, and its return value is returned by this method. - - Raises - ------ - ValueError - When `feature_names_out` is None. """ if hasattr(self, "n_features_in_") or input_features is not None: input_features = _check_feature_names_in(self, input_features) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index 78b971001358e..afb009b8a460f 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -288,6 +288,16 @@ def test_function_transformer_feature_names_out_string(feature_names_out): transformer.get_feature_names_out() +def test_function_transformer_feature_names_out_is_None(): + transformer = FunctionTransformer() + X = np.random.rand(100, 2) + transformer.fit_transform(X) + + msg = "This 'FunctionTransformer' has no attribute 'get_feature_names_out'" + with pytest.raises(AttributeError, match=msg): + transformer.get_feature_names_out() + + def test_function_transformer_feature_names_out_uses_estimator(): def add_n_random_features(X, n): return np.concatenate([X, np.random.rand(len(X), n)], axis=1) From f078eca8d034665b45f60f57069c797ff2b06ba7 Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sun, 7 Nov 2021 11:32:52 +1300 Subject: [PATCH 
09/14] Add missing new line --- sklearn/preprocessing/_function_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index dd627aa472052..40d7fd2270236 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -9,6 +9,7 @@ column_or_1d, ) + def _identity(X): """The identity function.""" return X From 2d3fae7a9909215465efe9817af045d36ecfe7a2 Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sun, 7 Nov 2021 17:23:13 +1300 Subject: [PATCH 10/14] Add missing newline in method doc --- sklearn/preprocessing/_function_transformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 40d7fd2270236..93d45dc9782e7 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -223,6 +223,7 @@ def inverse_transform(self, X): @available_if(lambda transformer: transformer.feature_names_out is not None) def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. + This method is only defined if `feature_names_out` is not None. 
Parameters From 8726ab7a16f6ad25b9cd6abf6ddd11d929e8bb77 Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Thu, 18 Nov 2021 09:41:42 +1300 Subject: [PATCH 11/14] Show value when feature_names_out is invalid --- sklearn/preprocessing/_function_transformer.py | 9 +++++---- sklearn/preprocessing/tests/test_function_transformer.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 93d45dc9782e7..c0d88dd30dec9 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -269,10 +269,11 @@ def get_feature_names_out(self, input_features=None): names_out = self.feature_names_out(self, input_features) else: raise ValueError( - "'feature_names_out' must either be \"one-to-one\" or a " - "callable with two arguments: the function transformer and " - "an array-like of input feature names. The callable must " - "return an array-like of output feature names." + f"feature_names_out={self.feature_names_out!r} is invalid. " + 'It must either be "one-to-one" or a callable with two ' + "arguments: the function transformer and an array-like of " + "input feature names. The callable must return an array-like " + "of output feature names." 
) return column_or_1d(names_out) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index afb009b8a460f..a8efe0891c26a 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -283,7 +283,7 @@ def test_function_transformer_feature_names_out_string(feature_names_out): X = np.random.rand(100, 2) transformer.fit_transform(X) - msg = """'feature_names_out' must either be "one-to-one" or a callable""" + msg = """must either be "one-to-one" or a callable""" with pytest.raises(ValueError, match=msg): transformer.get_feature_names_out() From 8cda16d522f22e6ab313abcccac115ada2c95393 Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sun, 28 Nov 2021 14:46:21 +1300 Subject: [PATCH 12/14] Replace lambda transformer:... with lambda self:... --- sklearn/preprocessing/_function_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index c0d88dd30dec9..2fd4d1d7fe037 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -220,7 +220,7 @@ def inverse_transform(self, X): X = check_array(X, accept_sparse=self.accept_sparse) return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args) - @available_if(lambda transformer: transformer.feature_names_out is not None) + @available_if(lambda self: self.feature_names_out is not None) def get_feature_names_out(self, input_features=None): """Get output feature names for transformation. 
From 0303c72b47e0207d9efd457cdb2ba45adfa4b2fd Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sun, 28 Nov 2021 16:19:23 +1300 Subject: [PATCH 13/14] Convert names out to ndarray with dtype=object --- sklearn/preprocessing/_function_transformer.py | 5 +++-- sklearn/preprocessing/tests/test_function_transformer.py | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_function_transformer.py b/sklearn/preprocessing/_function_transformer.py index 2fd4d1d7fe037..cea720aeb6a67 100644 --- a/sklearn/preprocessing/_function_transformer.py +++ b/sklearn/preprocessing/_function_transformer.py @@ -1,12 +1,13 @@ import warnings +import numpy as np + from ..base import BaseEstimator, TransformerMixin from ..utils.metaestimators import available_if from ..utils.validation import ( _allclose_dense_sparse, _check_feature_names_in, check_array, - column_or_1d, ) @@ -275,7 +276,7 @@ def get_feature_names_out(self, input_features=None): "input feature names. The callable must return an array-like " "of output feature names." 
) - return column_or_1d(names_out) + return np.asarray(names_out, dtype=object) def _transform(self, X, func=None, kw_args=None): if func is None: diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index a8efe0891c26a..e078e83328eec 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -262,7 +262,10 @@ def test_function_transformer_get_feature_names_out( feature_names_out=feature_names_out, validate=True ) transformer.fit_transform(X) - assert tuple(transformer.get_feature_names_out(input_features)) == expected + names = transformer.get_feature_names_out(input_features) + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, expected) def test_function_transformer_get_feature_names_out_without_validation(): From d8671e8a3369dadd080647c32123ce0c2a471f59 Mon Sep 17 00:00:00 2001 From: Aurelien Geron Date: Sun, 28 Nov 2021 16:25:52 +1300 Subject: [PATCH 14/14] Check names out are ndarray with dtype=object --- .../preprocessing/tests/test_function_transformer.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py index e078e83328eec..525accf4568de 100644 --- a/sklearn/preprocessing/tests/test_function_transformer.py +++ b/sklearn/preprocessing/tests/test_function_transformer.py @@ -277,7 +277,10 @@ def test_function_transformer_get_feature_names_out_without_validation(): with pytest.raises(ValueError, match=msg): transformer.get_feature_names_out() - assert tuple(transformer.get_feature_names_out(("a", "b"))) == ("a", "b") + names = transformer.get_feature_names_out(("a", "b")) + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, ("a", "b")) @pytest.mark.parametrize("feature_names_out", 
["x0", ["x0"], ("x0",)]) @@ -318,9 +321,11 @@ def feature_names_out(transformer, input_features): pd = pytest.importorskip("pandas") df = pd.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)}) transformer.fit_transform(df) - names_out = transformer.get_feature_names_out() + names = transformer.get_feature_names_out() - assert tuple(names_out) == ("a", "b", "rnd0", "rnd1", "rnd2") + assert isinstance(names, np.ndarray) + assert names.dtype == object + assert_array_equal(names, ("a", "b", "rnd0", "rnd1", "rnd2")) def test_function_transformer_validate_inverse():