Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,15 @@ Changelog
instead of `__init__`.
:pr:`21434` by :user:`Krum Arnaudov <krumeto>`.

- |Enhancement| Added the `get_feature_names_out` method and a new parameter
`feature_names_out` to :class:`preprocessing.FunctionTransformer`. You can set
`feature_names_out` to 'one-to-one' to use the input feature names as the
output feature names, or you can set it to a callable that returns the output
feature names. This is especially useful when the transformer changes the
number of features. If `feature_names_out` is None (which is the default),
then `get_feature_names_out` is not defined.
:pr:`21569` by :user:`Aurélien Geron <ageron>`.

:mod:`sklearn.svm`
..................

Expand Down
82 changes: 81 additions & 1 deletion sklearn/preprocessing/_function_transformer.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import warnings

import numpy as np

from ..base import BaseEstimator, TransformerMixin
from ..utils.validation import _allclose_dense_sparse, check_array
from ..utils.metaestimators import available_if
from ..utils.validation import (
_allclose_dense_sparse,
_check_feature_names_in,
check_array,
)


def _identity(X):
Expand Down Expand Up @@ -61,6 +68,20 @@ class FunctionTransformer(TransformerMixin, BaseEstimator):

.. versionadded:: 0.20

feature_names_out : callable, 'one-to-one' or None, default=None
Determines the list of feature names that will be returned by the
`get_feature_names_out` method. If it is 'one-to-one', then the output
feature names will be equal to the input feature names. If it is a
callable, then it must take two positional arguments: this
`FunctionTransformer` (`self`) and an array-like of input feature names
(`input_features`). It must return an array-like of output feature
names. The `get_feature_names_out` method is only defined if
`feature_names_out` is not None.

See ``get_feature_names_out`` for more details.

.. versionadded:: 1.1

kw_args : dict, default=None
Dictionary of additional keyword arguments to pass to func.

Expand Down Expand Up @@ -113,6 +134,7 @@ def __init__(
validate=False,
accept_sparse=False,
check_inverse=True,
feature_names_out=None,
kw_args=None,
inv_kw_args=None,
):
Expand All @@ -121,6 +143,7 @@ def __init__(
self.validate = validate
self.accept_sparse = accept_sparse
self.check_inverse = check_inverse
self.feature_names_out = feature_names_out
self.kw_args = kw_args
self.inv_kw_args = inv_kw_args

Expand Down Expand Up @@ -198,6 +221,63 @@ def inverse_transform(self, X):
X = check_array(X, accept_sparse=self.accept_sparse)
return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)

@available_if(lambda self: self.feature_names_out is not None)
def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    This method is only defined if `feature_names_out` is not None.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input feature names.

        - If `input_features` is None, then `feature_names_in_` is
          used as the input feature names. If `feature_names_in_` is not
          defined, then names are generated:
          `[x0, x1, ..., x(n_features_in_ - 1)]`.
        - If `input_features` is array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.

        - If `feature_names_out` is 'one-to-one', the input feature names
          are returned (see `input_features` above). This requires
          `feature_names_in_` and/or `n_features_in_` to be defined, which
          is done automatically if `validate=True`. Alternatively, you can
          set them in `func`.
        - If `feature_names_out` is a callable, then it is called with two
          arguments, `self` and `input_features`, and its return value is
          returned by this method.

    Raises
    ------
    ValueError
        If `feature_names_out` is 'one-to-one' but no input feature names
        are available, or if `feature_names_out` is neither 'one-to-one'
        nor a callable.
    """
    # Input names can only be resolved when the caller passed them or when
    # fitting recorded the number of features; otherwise keep None.
    if hasattr(self, "n_features_in_") or input_features is not None:
        input_features = _check_feature_names_in(self, input_features)

    feature_names_out = self.feature_names_out
    # Guard the comparison with an isinstance check: `==` on an ndarray is
    # element-wise, so an invalid array value would otherwise either raise an
    # unrelated "ambiguous truth value" error or be accepted by mistake.
    if isinstance(feature_names_out, str) and feature_names_out == "one-to-one":
        if input_features is None:
            raise ValueError(
                "When 'feature_names_out' is 'one-to-one', either "
                "'input_features' must be passed, or 'feature_names_in_' "
                "and/or 'n_features_in_' must be defined. If you set "
                "'validate' to 'True', then they will be defined "
                "automatically when 'fit' is called. Alternatively, you "
                "can set them in 'func'."
            )
        names_out = input_features
    elif callable(feature_names_out):
        # The callable receives the transformer itself so that it can read
        # its parameters when building the names.
        names_out = feature_names_out(self, input_features)
    else:
        raise ValueError(
            f"feature_names_out={feature_names_out!r} is invalid. "
            'It must either be "one-to-one" or a callable with two '
            "arguments: the function transformer and an array-like of "
            "input feature names. The callable must return an array-like "
            "of output feature names."
        )
    return np.asarray(names_out, dtype=object)

def _transform(self, X, func=None, kw_args=None):
if func is None:
func = _identity
Expand Down
152 changes: 152 additions & 0 deletions sklearn/preprocessing/tests/test_function_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,158 @@ def test_function_transformer_frame():
assert hasattr(X_df_trans, "loc")


@pytest.mark.parametrize(
    "X, feature_names_out, input_features, expected",
    [
        (
            # ndarray, 'one-to-one', no explicit names: names are generated
            np.random.rand(100, 3),
            "one-to-one",
            None,
            ("x0", "x1", "x2"),
        ),
        (
            # DataFrame, 'one-to-one', no explicit names: column names reused
            dict(a=np.random.rand(100), b=np.random.rand(100)),
            "one-to-one",
            None,
            ("a", "b"),
        ),
        (
            # ndarray, callable that ignores its inputs
            np.random.rand(100, 3),
            lambda _, names: ("a", "b"),
            None,
            ("a", "b"),
        ),
        (
            # DataFrame, callable that ignores its inputs
            dict(a=np.random.rand(100), b=np.random.rand(100)),
            lambda _, names: ("c", "d", "e"),
            None,
            ("c", "d", "e"),
        ),
        (
            # ndarray, callable extending the generated input names
            np.random.rand(100, 3),
            lambda _, names: (*names, "a"),
            None,
            ("x0", "x1", "x2", "a"),
        ),
        (
            # DataFrame, callable extending the column names
            dict(a=np.random.rand(100), b=np.random.rand(100)),
            lambda _, names: (*names, "c"),
            None,
            ("a", "b", "c"),
        ),
        (
            # ndarray, 'one-to-one', explicit input names
            np.random.rand(100, 3),
            "one-to-one",
            ("a", "b", "c"),
            ("a", "b", "c"),
        ),
        (
            # DataFrame, 'one-to-one', explicit input names
            dict(a=np.random.rand(100), b=np.random.rand(100)),
            "one-to-one",
            ("a", "b"),  # has to agree with feature_names_in_
            ("a", "b"),
        ),
        (
            # ndarray, callable extending explicit input names
            np.random.rand(100, 3),
            lambda _, names: (*names, "d"),
            ("a", "b", "c"),
            ("a", "b", "c", "d"),
        ),
        (
            # DataFrame, callable extending explicit input names
            dict(a=np.random.rand(100), b=np.random.rand(100)),
            lambda _, names: (*names, "c"),
            ("a", "b"),  # has to agree with feature_names_in_
            ("a", "b", "c"),
        ),
    ],
)
def test_function_transformer_get_feature_names_out(
    X, feature_names_out, input_features, expected
):
    """Check `get_feature_names_out` for ndarray and DataFrame inputs.

    Each case pairs an input container with a `feature_names_out` setting
    ('one-to-one' or a callable) and optional explicit `input_features`, and
    states the names the fitted transformer is expected to report.
    """
    # DataFrame cases are declared as plain dicts so that collecting the
    # test module does not require pandas; they are converted here.
    if isinstance(X, dict):
        pandas = pytest.importorskip("pandas")
        X = pandas.DataFrame(X)

    func_transformer = FunctionTransformer(
        validate=True, feature_names_out=feature_names_out
    )
    func_transformer.fit_transform(X)

    feature_names = func_transformer.get_feature_names_out(input_features)
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object
    assert_array_equal(feature_names, expected)


def test_function_transformer_get_feature_names_out_without_validation():
    """Check 'one-to-one' naming when `validate=False`.

    Without explicit `input_features` the call must raise a `ValueError`;
    with explicit names, those names are returned as an object ndarray.
    """
    data = np.random.rand(100, 2)
    func_transformer = FunctionTransformer(
        validate=False, feature_names_out="one-to-one"
    )
    func_transformer.fit_transform(data)

    # No names passed: the transformer has nothing to derive them from.
    expected_error = "When 'feature_names_out' is 'one-to-one', either"
    with pytest.raises(ValueError, match=expected_error):
        func_transformer.get_feature_names_out()

    # Names passed explicitly: they are handed back unchanged.
    explicit_names = ("a", "b")
    feature_names = func_transformer.get_feature_names_out(explicit_names)
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object
    assert_array_equal(feature_names, explicit_names)


@pytest.mark.parametrize("feature_names_out", ["x0", ["x0"], ("x0",)])
def test_function_transformer_feature_names_out_string(feature_names_out):
    """Check that unsupported `feature_names_out` values are rejected.

    Values that are neither 'one-to-one' nor a callable must make
    `get_feature_names_out` raise a `ValueError`.
    """
    data = np.random.rand(100, 2)
    func_transformer = FunctionTransformer(feature_names_out=feature_names_out)
    func_transformer.fit_transform(data)

    expected_error = 'must either be "one-to-one" or a callable'
    with pytest.raises(ValueError, match=expected_error):
        func_transformer.get_feature_names_out()


def test_function_transformer_feature_names_out_is_None():
    """Check that `get_feature_names_out` is absent with default settings.

    With the default `feature_names_out=None`, accessing the method must
    raise an `AttributeError`.
    """
    func_transformer = FunctionTransformer()
    func_transformer.fit_transform(np.random.rand(100, 2))

    expected_error = (
        "This 'FunctionTransformer' has no attribute 'get_feature_names_out'"
    )
    with pytest.raises(AttributeError, match=expected_error):
        func_transformer.get_feature_names_out()


def test_function_transformer_feature_names_out_uses_estimator():
    """Check that a `feature_names_out` callable can read the transformer.

    The callable looks up `kw_args["n"]` on the transformer it receives to
    work out how many extra feature names to append.
    """
    pandas = pytest.importorskip("pandas")

    def add_n_random_features(X, n):
        # Append `n` random columns to the right of X.
        random_columns = np.random.rand(len(X), n)
        return np.concatenate([X, random_columns], axis=1)

    def names_with_random_columns(transformer, input_features):
        # One extra name per random column requested through kw_args.
        n_extra = transformer.kw_args["n"]
        extra_names = [f"rnd{i}" for i in range(n_extra)]
        return [*input_features, *extra_names]

    func_transformer = FunctionTransformer(
        func=add_n_random_features,
        validate=True,
        kw_args={"n": 3},
        feature_names_out=names_with_random_columns,
    )
    frame = pandas.DataFrame({"a": np.random.rand(100), "b": np.random.rand(100)})
    func_transformer.fit_transform(frame)

    feature_names = func_transformer.get_feature_names_out()
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object
    assert_array_equal(feature_names, ("a", "b", "rnd0", "rnd1", "rnd2"))


def test_function_transformer_validate_inverse():
"""Test that function transformer does not reset estimator in
`inverse_transform`."""
Expand Down