Skip to content

ENH Adds feature_names_out to impute module #21078

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
Oct 21, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
b993b5e
ENH Adds feature_names_out to impute module
thomasjpfan Sep 17, 2021
3f95b5c
DOC Adds whats new
thomasjpfan Sep 17, 2021
6ebb43f
BUG Remove use for dtype in concatenate
thomasjpfan Sep 17, 2021
0e5b23a
Merge remote-tracking branch 'upstream/main' into feature_names_out_i…
thomasjpfan Sep 30, 2021
91ae2bb
MAINT Allows for multiple whitespace
thomasjpfan Oct 1, 2021
27bcfe3
Merge remote-tracking branch 'upstream/main' into feature_names_out_i…
thomasjpfan Oct 1, 2021
7915067
Merge remote-tracking branch 'upstream/main' into feature_names_out_i…
thomasjpfan Oct 3, 2021
f3a3ff8
Merge remote-tracking branch 'upstream/main' into feature_names_out_i…
thomasjpfan Oct 4, 2021
544770b
TST Adds non-missing feature in the middle
thomasjpfan Oct 4, 2021
52c2202
ENH Use missingindicator as prefix for indicator
thomasjpfan Oct 4, 2021
7170589
TST Remove covergence warning
thomasjpfan Oct 4, 2021
a87e4ca
Merge branch 'better_white_space_for_take'
thomasjpfan Oct 5, 2021
3b9c3c9
MAINT Whitespace
thomasjpfan Oct 5, 2021
ce801d0
Fix yaml
thomasjpfan Oct 5, 2021
a5dfc47
TST Testing
thomasjpfan Oct 5, 2021
ee5f2f7
TST Testing
thomasjpfan Oct 5, 2021
6719ad6
TST Testing
thomasjpfan Oct 5, 2021
d0f2474
TST Testing
thomasjpfan Oct 5, 2021
5ddd36d
TST Testing
thomasjpfan Oct 5, 2021
31f90ed
TST Testing
thomasjpfan Oct 5, 2021
218db3f
TST Testing
thomasjpfan Oct 5, 2021
57203a6
TST Testing
thomasjpfan Oct 5, 2021
e8c15e4
TST Testing
thomasjpfan Oct 5, 2021
3a2f25c
TST Testing
thomasjpfan Oct 5, 2021
ef134d1
TST Testing
thomasjpfan Oct 5, 2021
71e26ff
TST Testing
thomasjpfan Oct 5, 2021
1a75e5c
TST Testing
thomasjpfan Oct 5, 2021
df4b592
TST Testing
thomasjpfan Oct 5, 2021
5a422e6
TST Testing
thomasjpfan Oct 5, 2021
22d871b
TST Testing
thomasjpfan Oct 5, 2021
39c8e00
TST Testing
thomasjpfan Oct 5, 2021
df6fd9a
TST Testing
thomasjpfan Oct 6, 2021
f12da8c
Merge remote-tracking branch 'origin/main' into feature_names_out_impute
thomasjpfan Oct 13, 2021
8f34d4d
Merge branch 'main' into feature_names_out_impute
ogrisel Oct 13, 2021
6d86dc5
REV Remove unrelated diff
thomasjpfan Oct 13, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ Changelog
error when 'min_df' or 'max_df' are floating-point numbers greater than 1.
:pr:`20752` by :user:`Alek Lefebvre <AlekLefebvre>`.

:mod:`sklearn.impute`
.....................

- |API| Adds :meth:`get_feature_names_out` to :class:`impute.SimpleImputer`,
:class:`impute.KNNImputer`, :class:`impute.IterativeImputer`, and
:class:`impute.MissingIndicator`. :pr:`21078` by `Thomas Fan`_.

:mod:`sklearn.linear_model`
...........................

Expand Down
61 changes: 61 additions & 0 deletions sklearn/impute/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from ..utils.sparsefuncs import _get_median
from ..utils.validation import check_is_fitted
from ..utils.validation import FLOAT_DTYPES
from ..utils.validation import _check_feature_names_in
from ..utils._mask import _get_mask
from ..utils import is_scalar_nan

Expand Down Expand Up @@ -113,6 +114,13 @@ def _concatenate_indicator(self, X_imputed, X_indicator):

return hstack((X_imputed, X_indicator))

def _concatenate_indicator_feature_names_out(self, names, input_features):
if not self.add_indicator:
return names

indicator_names = self.indicator_.get_feature_names_out(input_features)
return np.concatenate([names, indicator_names])

def _more_tags(self):
return {"allow_nan": is_scalar_nan(self.missing_values)}

Expand Down Expand Up @@ -596,6 +604,30 @@ def inverse_transform(self, X):
X_original[full_mask] = self.missing_values
return X_original

def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input features.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then names are generated: `[x0, x1, ..., x(n_features_in_ - 1)]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    input_features = _check_feature_names_in(self, input_features)
    # A NaN statistic marks a column that was entirely missing during fit;
    # such columns are dropped by transform, so drop their names as well.
    keep_mask = ~_get_mask(self.statistics_, np.nan)
    kept_names = input_features[keep_mask]
    return self._concatenate_indicator_feature_names_out(kept_names, input_features)


class MissingIndicator(TransformerMixin, BaseEstimator):
"""Binary indicators for missing values.
Expand Down Expand Up @@ -922,6 +954,35 @@ def fit_transform(self, X, y=None):

return imputer_mask

def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input features.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then names are generated: `[x0, x1, ..., x(n_features_in_ - 1)]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    input_features = _check_feature_names_in(self, input_features)
    # Prefix each retained column name with the lower-cased class name,
    # e.g. "missingindicator_<column>". Only the columns selected at fit
    # time (``features_``) produce an indicator output.
    prefix = self.__class__.__name__.lower()
    selected_names = input_features[self.features_]
    return np.array(
        [f"{prefix}_{feature_name}" for feature_name in selected_names],
        dtype=object,
    )

def _more_tags(self):
return {
"allow_nan": True,
Expand Down
24 changes: 24 additions & 0 deletions sklearn/impute/_iterative.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ..preprocessing import normalize
from ..utils import check_array, check_random_state, _safe_indexing, is_scalar_nan
from ..utils.validation import FLOAT_DTYPES, check_is_fitted
from ..utils.validation import _check_feature_names_in
from ..utils._mask import _get_mask

from ._base import _BaseImputer
Expand Down Expand Up @@ -774,3 +775,26 @@ def fit(self, X, y=None):
"""
self.fit_transform(X)
return self

def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input features.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then names are generated: `[x0, x1, ..., x(n_features_in_ - 1)]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    input_features = _check_feature_names_in(self, input_features)
    # The initial SimpleImputer decides which columns survive imputation
    # (it drops columns that were entirely missing at fit time), so its
    # output names are the imputed-part names here as well.
    imputed_names = self.initial_imputer_.get_feature_names_out(input_features)
    return self._concatenate_indicator_feature_names_out(imputed_names, input_features)
27 changes: 26 additions & 1 deletion sklearn/impute/_knn.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from ..utils import is_scalar_nan
from ..utils._mask import _get_mask
from ..utils.validation import check_is_fitted
from ..utils.validation import _check_feature_names_in


class KNNImputer(_BaseImputer):
Expand Down Expand Up @@ -206,6 +207,7 @@ def fit(self, X, y=None):
_check_weights(self.weights)
self._fit_X = X
self._mask_fit_X = _get_mask(self._fit_X, self.missing_values)
self._valid_mask = ~np.all(self._mask_fit_X, axis=0)

super()._fit_indicator(self._mask_fit_X)

Expand Down Expand Up @@ -242,7 +244,7 @@ def transform(self, X):

mask = _get_mask(X, self.missing_values)
mask_fit_X = self._mask_fit_X
valid_mask = ~np.all(mask_fit_X, axis=0)
valid_mask = self._valid_mask

X_indicator = super()._transform_indicator(mask)

Expand Down Expand Up @@ -327,3 +329,26 @@ def process_chunk(dist_chunk, start):
pass

return super()._concatenate_indicator(X[:, valid_mask], X_indicator)

def get_feature_names_out(self, input_features=None):
    """Get output feature names for transformation.

    Parameters
    ----------
    input_features : array-like of str or None, default=None
        Input features.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then names are generated: `[x0, x1, ..., x(n_features_in_ - 1)]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    Returns
    -------
    feature_names_out : ndarray of str objects
        Transformed feature names.
    """
    input_features = _check_feature_names_in(self, input_features)
    # ``_valid_mask`` (computed in fit) flags columns that were not fully
    # missing; transform keeps exactly those columns, so keep their names.
    valid_names = input_features[self._valid_mask]
    return self._concatenate_indicator_feature_names_out(valid_names, input_features)
41 changes: 40 additions & 1 deletion sklearn/impute/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from sklearn.impute import SimpleImputer


IMPUTERS = [IterativeImputer(), KNNImputer(), SimpleImputer()]
# Imputers exercised by the shared tests below. tol=0.1 loosens
# IterativeImputer's convergence tolerance — presumably to keep these tests
# fast and free of ConvergenceWarning (the PR history mentions removing a
# convergence warning); TODO confirm no test relies on a tighter tolerance.
IMPUTERS = [IterativeImputer(tol=0.1), KNNImputer(), SimpleImputer()]
# NOTE(review): presumably the subset of IMPUTERS that accepts scipy sparse
# input — confirm against the sparse-specific tests that use this list.
SPARSE_IMPUTERS = [SimpleImputer()]


Expand Down Expand Up @@ -122,3 +122,42 @@ def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
X_trans = imputer.fit_transform(X_df)

assert_allclose(X_trans_expected, X_trans)


@pytest.mark.parametrize("imputer", IMPUTERS, ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_feature_names_out_pandas(imputer, add_indicator):
    """Check feature names out for imputers."""
    pd = pytest.importorskip("pandas")
    marker = np.nan
    imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)

    # Column "e" is entirely missing; columns a, b, d each have at least one
    # missing value, c and f have none.
    X = np.array(
        [
            [marker, 1, 5, 3, marker, 1],
            [2, marker, 1, 4, marker, 2],
            [6, 3, 7, marker, marker, 3],
            [1, 2, 9, 8, marker, 4],
        ]
    )
    X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"])
    imputer.fit(X_df)

    names = imputer.get_feature_names_out()

    # The all-missing column "e" is dropped from the imputed output; with
    # add_indicator, indicator names are appended for every column that had
    # at least one missing value (including the dropped "e").
    expected_names = ["a", "b", "c", "d", "f"]
    if add_indicator:
        expected_names += [
            "missingindicator_a",
            "missingindicator_b",
            "missingindicator_d",
            "missingindicator_e",
        ]
    assert_array_equal(expected_names, names)
19 changes: 19 additions & 0 deletions sklearn/impute/tests/test_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -1493,3 +1493,22 @@ def test_most_frequent(expected, array, dtype, extra_value, n_repeat):
assert expected == _most_frequent(
np.array(array, dtype=dtype), extra_value, n_repeat
)


def test_missing_indicator_feature_names_out():
    """Check that MissingIndicator prefixes the retained feature names."""
    pd = pytest.importorskip("pandas")

    missing_values = np.nan
    # Columns a, b, d each contain a missing value; "c" is fully observed
    # and therefore produces no indicator column.
    X = pd.DataFrame(
        [
            [missing_values, missing_values, 1, missing_values],
            [4, missing_values, 2, 10],
        ],
        columns=["a", "b", "c", "d"],
    )

    indicator = MissingIndicator(missing_values=missing_values).fit(X)
    feature_names = indicator.get_feature_names_out()
    assert_array_equal(
        ["missingindicator_a", "missingindicator_b", "missingindicator_d"],
        feature_names,
    )
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,6 @@ def test_pandas_column_name_consistency(estimator):
"decomposition",
"discriminant_analysis",
"ensemble",
"impute",
"isotonic",
"kernel_approximation",
"preprocessing",
Expand Down