From 0476ca857549bf24d0dad4f7e7c6b05e26a3af17 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 21 Dec 2022 17:10:05 -0600 Subject: [PATCH 01/10] modified: sklearn/pipeline.py - added self._check_feature_names(...) to the .fit(...) method in FeatureUnion to allow access to the `.feature_names_in_` attribute if `X` has feature names, e.g. a pandas.DataFrame - updated FeatureUnion docstring to reflect the addition of .feature_names_in_ attribute modified: sklearn/tests/test_pipeline.py - added test_feature_union_feature_names_in_() to test that FeatureUnion has a `.feature_names_in_` attribute if fitted with a pandas.DataFrame and not if fitted with a numpy array --- sklearn/pipeline.py | 7 +++++++ sklearn/tests/test_pipeline.py | 21 +++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 5236c4499a728..7930732f886a9 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -995,6 +995,12 @@ class FeatureUnion(TransformerMixin, _BaseComposition): .. versionadded:: 0.24 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when + `X` has feature names that are all strings. + + .. versionadded:: 1.3 + See Also -------- make_union : Convenience function for simplified feature union @@ -1182,6 +1188,7 @@ def fit(self, X, y=None, **fit_params): self : object FeatureUnion class instance. 
""" + self._check_feature_names(X, reset=True) transformers = self._parallel_func(X, y, fit_params, _fit_one) if not transformers: # All transformers are None diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 342dc12b966c9..7b6384d7521ef 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1647,3 +1647,24 @@ def test_feature_union_getitem_error(key): msg = "Only string keys are supported" with pytest.raises(KeyError, match=msg): union[key] + + +def test_feature_union_feature_names_in_(): + """Ensure feature union has `.feature_names_in_` attribute if input + is pandas.DataFrame. + + Test for #24754""" + X, _ = load_iris(as_frame=True, return_X_y=True) + X_train, X_test = train_test_split(X, random_state=0) + + # fit with pandas.DataFrame + union = FeatureUnion([("pass", "passthrough")]) + union.fit(X_train) + assert hasattr(union, "feature_names_in_") + assert_array_equal(X_train.columns, union.feature_names_in_) + + # fit with numpy array + X_array = X_train.to_numpy() + union = FeatureUnion([("pass", "passthrough")]) + union.fit(X_array) + assert not hasattr(union, "feature_names_in_") From 58c907ace34ab61767c38b6b41546ab231deee91 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 21 Dec 2022 18:09:26 -0600 Subject: [PATCH 02/10] modified: doc/whats_new/v1.3.rst - changelog updated with description of work --- doc/whats_new/v1.3.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 68a569acb14e5..c6b3e93509d5b 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -41,6 +41,10 @@ Changelog - |Feature| :class:`pipeline.FeatureUnion` can now use indexing notation (e.g. `feature_union["scalar"]`) to access transformers by name. :pr:`25093` by `Thomas Fan`_. +- |Feature| :class:`pipeline.FeatureUnion` can now access the + `feature_names_in_` attribute if the value seen during `.fit` was a `pandas + .DataFrame`. 
+ :pr:`25220` by :user:`Ian Thompson `. :mod:`sklearn.preprocessing` ............................ From cebfa4a4147074682e82b5b91ddd810212e0dbae Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Thu, 22 Dec 2022 09:32:39 -0600 Subject: [PATCH 03/10] modified: doc/whats_new/v1.3.rst - made changelog description more precise --- doc/whats_new/v1.3.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index c6b3e93509d5b..fc69d2c88e574 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -42,8 +42,8 @@ Changelog `feature_union["scalar"]`) to access transformers by name. :pr:`25093` by `Thomas Fan`_. - |Feature| :class:`pipeline.FeatureUnion` can now access the - `feature_names_in_` attribute if the value seen during `.fit` was a `pandas - .DataFrame`. + `feature_names_in_` attribute if the `X` value seen during `.fit` has a + `.columns` attribute and all columns are strings. :pr:`25220` by :user:`Ian Thompson `. :mod:`sklearn.preprocessing` From e60a2d9f1ec41e3d85342ac35fe2ef3278d517f3 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Thu, 22 Dec 2022 09:36:19 -0600 Subject: [PATCH 04/10] modified: doc/whats_new/v1.3.rst - typo -- removed period (.) before `columns` --- doc/whats_new/v1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index fc69d2c88e574..6a9c3632e0124 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -43,7 +43,7 @@ Changelog `Thomas Fan`_. - |Feature| :class:`pipeline.FeatureUnion` can now access the `feature_names_in_` attribute if the `X` value seen during `.fit` has a - `.columns` attribute and all columns are strings. + `columns` attribute and all columns are strings. :pr:`25220` by :user:`Ian Thompson `. 
:mod:`sklearn.preprocessing` From 4fcfac9dad0e719b9b438c814382b779731243c8 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Thu, 22 Dec 2022 12:06:42 -0600 Subject: [PATCH 05/10] modified: sklearn/pipeline.py - removed `self._check_feature_names(...)` from `.fit(...)` method in `FeatureUnion` - added `feature_names_in_` property to `FeatureUnion` to use first transformer's `feature_names_in_` attribute if present modified: sklearn/tests/test_pipeline.py - updated docstring for `test_feature_union_feature_names_in_()` to be more precise - added additional assertions to check if the `feature_names_in_` attribute is available to `FeatureUnion` if it's instantiated with a transformer that has already been fit --- sklearn/pipeline.py | 7 ++++++- sklearn/tests/test_pipeline.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 7930732f886a9..5107064f73576 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -1188,7 +1188,6 @@ def fit(self, X, y=None, **fit_params): self : object FeatureUnion class instance. 
""" - self._check_feature_names(X, reset=True) transformers = self._parallel_func(X, y, fit_params, _fit_one) if not transformers: # All transformers are None @@ -1304,6 +1303,12 @@ def n_features_in_(self): # X is passed to all transformers so we just delegate to the first one return self.transformer_list[0][1].n_features_in_ + @property + def feature_names_in_(self): + """Names of features seen during :term:`fit`.""" + # X is passed to all transformers -- delegate to the first one + return self.transformer_list[0][1].feature_names_in_ + def __sklearn_is_fitted__(self): # Delegate whether feature union was fitted for _, transformer, _ in self._iter(): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 7b6384d7521ef..f430f7e192800 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1650,13 +1650,22 @@ def test_feature_union_getitem_error(key): def test_feature_union_feature_names_in_(): - """Ensure feature union has `.feature_names_in_` attribute if input - is pandas.DataFrame. + """Ensure feature union has `.feature_names_in_` attribute if `X` has a + `columns` attribute. 
Test for #24754""" X, _ = load_iris(as_frame=True, return_X_y=True) X_train, X_test = train_test_split(X, random_state=0) + # FeatureUnion should have the feature_names_in_ attribute if the + # first transformer also has it + scaler = StandardScaler() + scaler.fit(X_train) + union = FeatureUnion([("scale", scaler)]) + assert hasattr(union, "feature_names_in_") + assert_array_equal(X_train.columns, union.feature_names_in_) + assert_array_equal(scaler.feature_names_in_, union.feature_names_in_) + # fit with pandas.DataFrame union = FeatureUnion([("pass", "passthrough")]) union.fit(X_train) From 62c5267d2ccbad586339dd0820203e08b80be869 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Thu, 22 Dec 2022 13:05:38 -0600 Subject: [PATCH 06/10] modified: doc/whats_new/v1.3.rst - updated changelog description to include `pandas.DataFrame` - corrected user signature to match github account --- doc/whats_new/v1.3.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 9886155ac947d..ad773ce924cd4 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -55,8 +55,9 @@ Changelog `Thomas Fan`_. - |Feature| :class:`pipeline.FeatureUnion` can now access the `feature_names_in_` attribute if the `X` value seen during `.fit` has a - `columns` attribute and all columns are strings. - :pr:`25220` by :user:`Ian Thompson `. + `columns` attribute and all columns are strings. e.g. when `X` is a + `pandas.DataFrame` + :pr:`25220` by :user:`Ian Thompson `. :mod:`sklearn.preprocessing` ............................ 
From d73327adc02c64f7dcc6e271aacd106468204830 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Wed, 28 Dec 2022 13:11:11 -0600 Subject: [PATCH 07/10] modified: sklearn/tests/test_pipeline.py - added `pytest.importorskip("pandas")` to `test_feature_union_feature_names_in_` so the test is skipped, instead of failing with ImportError, on azure-pipelines builds without pandas --- sklearn/tests/test_pipeline.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index f430f7e192800..dbebe1f0c892b 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1654,6 +1654,8 @@ def test_feature_union_feature_names_in_(): `columns` attribute. Test for #24754""" + pytest.importorskip("pandas") + X, _ = load_iris(as_frame=True, return_X_y=True) X_train, X_test = train_test_split(X, random_state=0) From 1ec500063ea1e6e9b593ab3e3aac993e5e843dca Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Tue, 3 Jan 2023 08:56:27 -0600 Subject: [PATCH 08/10] Update doc/whats_new/v1.3.rst newline/whitespace between change log updates. Co-authored-by: Thomas J. Fan --- doc/whats_new/v1.3.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index 69ed2d88813a7..85dbd9378d558 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -91,6 +91,7 @@ Changelog - |Feature| :class:`pipeline.FeatureUnion` can now use indexing notation (e.g. `feature_union["scalar"]`) to access transformers by name. :pr:`25093` by `Thomas Fan`_. + - |Feature| :class:`pipeline.FeatureUnion` can now access the `feature_names_in_` attribute if the `X` value seen during `.fit` has a `columns` attribute and all columns are strings. e.g. when `X` is a From af78798e080d31f7932aeef59d2432be334e19d1 Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Tue, 3 Jan 2023 08:57:16 -0600 Subject: [PATCH 09/10] Update sklearn/tests/test_pipeline.py added period at end of docstring Co-authored-by: Thomas J. 
Fan --- sklearn/tests/test_pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index dbebe1f0c892b..aab775f475342 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1653,7 +1653,8 @@ def test_feature_union_feature_names_in_(): """Ensure feature union has `.feature_names_in_` attribute if `X` has a `columns` attribute. - Test for #24754""" + Test for #24754. + """ pytest.importorskip("pandas") X, _ = load_iris(as_frame=True, return_X_y=True) From 073daf7a37dfb69c79ba4fba692be5544a2cdaba Mon Sep 17 00:00:00 2001 From: Ian Thompson Date: Tue, 3 Jan 2023 09:37:13 -0600 Subject: [PATCH 10/10] modified: sklearn/tests/test_pipeline.py - removed train-test-split per code suggestion -- using `X` directly --- sklearn/tests/test_pipeline.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index aab775f475342..fa7fa2ad20dcf 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1658,25 +1658,24 @@ def test_feature_union_feature_names_in_(): pytest.importorskip("pandas") X, _ = load_iris(as_frame=True, return_X_y=True) - X_train, X_test = train_test_split(X, random_state=0) # FeatureUnion should have the feature_names_in_ attribute if the # first transformer also has it scaler = StandardScaler() - scaler.fit(X_train) + scaler.fit(X) union = FeatureUnion([("scale", scaler)]) assert hasattr(union, "feature_names_in_") - assert_array_equal(X_train.columns, union.feature_names_in_) + assert_array_equal(X.columns, union.feature_names_in_) assert_array_equal(scaler.feature_names_in_, union.feature_names_in_) # fit with pandas.DataFrame union = FeatureUnion([("pass", "passthrough")]) - union.fit(X_train) + union.fit(X) assert hasattr(union, "feature_names_in_") - assert_array_equal(X_train.columns, union.feature_names_in_) + 
assert_array_equal(X.columns, union.feature_names_in_) # fit with numpy array - X_array = X_train.to_numpy() + X_array = X.to_numpy() union = FeatureUnion([("pass", "passthrough")]) union.fit(X_array) assert not hasattr(union, "feature_names_in_")