RFC/WIP Feature names within fit #14238

Closed (wants to merge 4 commits)
6 changes: 3 additions & 3 deletions doc/developers/contributing.rst
@@ -927,7 +927,7 @@ When the change is in a class, we validate and raise warning in ``fit``::
self.n_clusters = n_clusters
self.k = k

def fit(self, X, y):
def fit(self, X, y, feature_names_in=None):
if self.k != 'not_used':
warnings.warn("'k' was renamed to n_clusters in version 0.13 and "
"will be removed in 0.15.", DeprecationWarning)
@@ -983,7 +983,7 @@ When the change is in a class, we validate and raise warning in ``fit``::
def __init__(self, n_clusters='warn'):
self.n_clusters = n_clusters

def fit(self, X, y):
def fit(self, X, y, feature_names_in=None):
if self.n_clusters == 'warn':
warnings.warn("The default value of n_clusters will change from "
"5 to 10 in 0.22.", FutureWarning)
@@ -1339,7 +1339,7 @@ the correct interface more easily.
... def __init__(self, demo_param='demo'):
... self.demo_param = demo_param
...
... def fit(self, X, y):
... def fit(self, X, y, feature_names_in=None):
...
... # Check that X and y have correct shape
... X, y = check_X_y(X, y)
38 changes: 32 additions & 6 deletions doc/modules/compose.rst
@@ -136,6 +136,32 @@ or by name::
>>> pipe['reduce_dim']
PCA()

To enable model inspection, `Pipeline` sets an ``input_features_`` attribute on
all pipeline steps during fitting. This allows the user to understand how
features are transformed as they pass through the pipeline::

>>> from sklearn.datasets import load_iris
>>> from sklearn.feature_selection import SelectKBest
>>> iris = load_iris()
>>> pipe = Pipeline(steps=[
... ('select', SelectKBest(k=2)),
... ('clf', LogisticRegression())])
>>> pipe.fit(iris.data, iris.target)
... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
Pipeline(memory=None,
steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))])
>>> pipe.named_steps.clf.input_features_
array(['x2', 'x3'], dtype='<U2')

You can also provide custom feature names to get a more human-readable output
using ``get_feature_names``::

>>> pipe.get_feature_names(iris.feature_names)
>>> pipe.named_steps.select.input_features_
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
>>> pipe.named_steps.clf.input_features_
array(['petal length (cm)', 'petal width (cm)'], dtype='<U17')
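
As a minimal sketch (assuming every fitted step exposes the ``input_features_``
attribute described above), the names seen by each step can be listed with a
plain loop::

    for name, step in pipe.named_steps.items():
        print(name, step.input_features_)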

.. topic:: Examples:

* :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py`
@@ -431,7 +457,7 @@ By default, the remaining rating columns are ignored (``remainder='drop'``)::
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> from sklearn.preprocessing import OneHotEncoder
>>> column_trans = ColumnTransformer(
... [('city_category', OneHotEncoder(dtype='int'),['city']),
... [('categories', OneHotEncoder(dtype='int'),['city']),
... ('title_bow', CountVectorizer(), 'title')],
... remainder='drop')

@@ -441,11 +467,11 @@ By default, the remaining rating columns are ignored (``remainder='drop'``)::
('title_bow', CountVectorizer(), 'title')])

>>> column_trans.get_feature_names()
['city_category__x0_London', 'city_category__x0_Paris', 'city_category__x0_Sallisaw',
'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his',
'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable',
'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson',
'title_bow__wrath']
['categories__city_London', 'categories__city_Paris',
'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast',
'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last',
'title_bow__learned', 'title_bow__moveable', 'title_bow__of', 'title_bow__the',
'title_bow__trick', 'title_bow__watson', 'title_bow__wrath']

>>> column_trans.transform(X).toarray()
array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
2 changes: 1 addition & 1 deletion examples/cluster/plot_inductive_clustering.py
@@ -40,7 +40,7 @@ def __init__(self, clusterer, classifier):
self.clusterer = clusterer
self.classifier = classifier

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
self.clusterer_ = clone(self.clusterer)
self.classifier_ = clone(self.classifier)
y = self.clusterer_.fit_predict(X)
4 changes: 2 additions & 2 deletions examples/compose/plot_column_transformer.py
@@ -45,7 +45,7 @@
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""

def fit(self, x, y=None):
def fit(self, X, y=None, feature_names_in=None):
return self

def transform(self, posts):
@@ -60,7 +60,7 @@ class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
Takes a sequence of strings and produces a dict of sequences. Keys are
`subject` and `body`.
"""
def fit(self, x, y=None):
def fit(self, X, y=None, feature_names_in=None):
return self

def transform(self, posts):
54 changes: 49 additions & 5 deletions examples/compose/plot_column_transformer_mixed_types.py
@@ -68,16 +68,60 @@

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression())])
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression())])

X = data.drop('survived', axis=1)
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
pipeline.fit(X_train, y_train)
print("model score: %.3f" % pipeline.score(X_test, y_test))


###############################################################################
# Inspecting the coefficients values of the classifier
###############################################################################
# The coefficients of the final classification step of the pipeline give an
# idea of how each feature impacts the likelihood of survival, assuming that
# the usual linear model assumptions hold (uncorrelated features, linear
# separability, homoscedastic errors...), which we do not verify in this
# example.
#
# To get error bars we perform cross-validation and compute the mean and
# standard deviation of each coefficient across CV splits. Because we use a
# standard scaler on the numerical features, the coefficient weights give us
# an idea of how much the log odds of surviving are impacted by a change in
# this dimension relative to the mean. Note that the categorical features
# here are overspecified, which makes them slightly harder to interpret
# because of the information redundancy.
#
# We can see that the linear model coefficients are in agreement with the
# historical reports: people in higher classes and therefore in the upper decks
# were the first to reach the lifeboats, and often, priority was given to women
# and children.
#
# Note that conditioned on the "pclass_x" one-hot features, the "fare"
# numerical feature does not seem to be significantly predictive. If we drop
# the "pclass" feature, then higher "fare" values would appear significantly
# correlated with a higher likelihood of survival, as the "fare" and "pclass"
# features have a strong statistical dependency.

import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedShuffleSplit

cv = StratifiedShuffleSplit(n_splits=20, test_size=0.25, random_state=42)
cv_results = cross_validate(pipeline, X_train, y_train, cv=cv,
return_estimator=True)
cv_coefs = np.concatenate([cv_pipeline.named_steps["classifier"].coef_
for cv_pipeline in cv_results["estimator"]])
fig, ax = plt.subplots()
ax.barh(pipeline.named_steps["classifier"].feature_names_in_,
cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0))
plt.tight_layout()
plt.show()
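
###############################################################################
# As a quick, informal check of the statistical dependency between "fare" and
# "pclass" mentioned above (a sketch assuming ``X_train`` is the pandas
# DataFrame built earlier, with both columns present), compare the average
# fare per passenger class:

print(X_train.groupby("pclass")["fare"].mean())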


###############################################################################
@@ -96,7 +140,7 @@
'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search = GridSearchCV(pipeline, param_grid, cv=10)
grid_search.fit(X_train, y_train)

print(("best logistic regression from grid search: %.3f"
9 changes: 6 additions & 3 deletions examples/feature_selection/plot_feature_selection_pipeline.py
@@ -9,6 +9,7 @@
Using a sub-pipeline, the fitted coefficients can be mapped back into
the original feature space.
"""
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest, f_regression
@@ -20,7 +21,7 @@

# import some data to play with
X, y = samples_generator.make_classification(
n_features=20, n_informative=3, n_redundant=0, n_classes=4,
n_features=20, n_informative=3, n_redundant=0, n_classes=2,
n_clusters_per_class=2)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
@@ -36,5 +37,7 @@
y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))

coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_)
print(coef)
# access and plot the coefficients of the fitted model
plt.barh((0, 1, 2), anova_svm[-1].coef_.ravel())
plt.yticks((0, 1, 2), anova_svm[-1].feature_names_in_)
plt.show()
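
# As an extra, informal cross-check (a sketch assuming the first step of
# ``anova_svm`` is the ``SelectKBest`` selector), ``get_support`` reports
# which of the 20 original columns were retained:
print(anova_svm[0].get_support(indices=True))
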
59 changes: 59 additions & 0 deletions sklearn/base.py
@@ -6,15 +6,19 @@
import copy
import warnings
from collections import defaultdict

import platform
import inspect
import re

import numpy as np

from . import __version__
from .exceptions import NotFittedError

from .utils import _IS_32BIT


_DEFAULT_TAGS = {
'non_deterministic': False,
'requires_positive_X': False,
@@ -558,6 +562,49 @@ def fit_transform(self, X, y=None, **fit_params):
# fit method of arity 2 (supervised transformation)
return self.fit(X, y, **fit_params).transform(X)

def _get_feature_names(self, input_features=None):
"""Get output feature names.

Parameters
----------
input_features : list of string or None
String names of the input features.

Returns
-------
output_feature_names : list of string
Feature names for transformer output.
"""
# OneToOneMixin is higher in the class hierarchy
# because we put mixins on the wrong side
if hasattr(super(), 'get_feature_names'):
return super().get_feature_names(input_features)
# generate feature names from the class name by default;
# there would be much less guessing if we stored the number
# of output features.
# Ideally this would be done in each class.
if hasattr(self, 'n_clusters'):
# this is before n_components_
# because n_components_ means something else
# in agglomerative clustering
n_features = self.n_clusters
elif hasattr(self, '_max_components'):
# special case for LinearDiscriminantAnalysis
n_components = self.n_components or np.inf
n_features = min(self._max_components, n_components)
elif hasattr(self, 'n_components_'):
# n_components could be auto or None
# this is more likely to be an int
n_features = self.n_components_
elif hasattr(self, 'n_components') and self.n_components is not None:
n_features = self.n_components
elif hasattr(self, 'components_'):
n_features = self.components_.shape[0]
else:
return None
return ["{}{}".format(type(self).__name__.lower(), i)
for i in range(n_features)]


class DensityMixin:
"""Mixin class for all density estimators in scikit-learn."""
@@ -603,6 +650,18 @@ def fit_predict(self, X, y=None):
return self.fit(X).predict(X)


class OneToOneMixin(object):
"""Provides get_feature_names for simple transformers

Assumes there's a 1-to-1 correspondence between input features
and output features.
"""

@property
def feature_names_out_(self):
return self.feature_names_in_


class MetaEstimatorMixin:
_required_parameters = ["estimator"]
"""Mixin class for all meta estimators in scikit-learn."""
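
A minimal sketch of the fallback naming behaviour implemented by
``_get_feature_names`` above, assuming an estimator such as ``PCA`` that
exposes ``n_components_`` after fitting (output names are derived from the
lowercased class name)::

    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA

    X, _ = load_iris(return_X_y=True)
    pca = PCA(n_components=3).fit(X)
    # same formatting rule as the default branch of _get_feature_names
    print(["{}{}".format(type(pca).__name__.lower(), i)
           for i in range(pca.n_components_)])  # ['pca0', 'pca1', 'pca2']

For a transformer inheriting ``OneToOneMixin``, ``feature_names_out_`` would
simply echo ``feature_names_in_``, as shown in the mixin above.
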
6 changes: 3 additions & 3 deletions sklearn/calibration.py
@@ -109,7 +109,7 @@ def __init__(self, base_estimator=None, method='sigmoid', cv=None):
self.method = method
self.cv = cv

def fit(self, X, y, sample_weight=None):
def fit(self, X, y, sample_weight=None, feature_names_in=None):
"""Fit the calibrated model

Parameters
@@ -312,7 +312,7 @@ def _preproc(self, X):

return df, idx_pos_class

def fit(self, X, y, sample_weight=None):
def fit(self, X, y, sample_weight=None, feature_names_in=None):
"""Calibrate the fitted model

Parameters
@@ -474,7 +474,7 @@ class _SigmoidCalibration(BaseEstimator, RegressorMixin):
b_ : float
The intercept.
"""
def fit(self, X, y, sample_weight=None):
def fit(self, X, y, sample_weight=None, feature_names_in=None):
"""Fit the model using X, y as training data.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/affinity_propagation_.py
@@ -349,7 +349,7 @@ def __init__(self, damping=.5, max_iter=200, convergence_iter=15,
def _pairwise(self):
return self.affinity == "precomputed"

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Fit the clustering from features, or affinity matrix.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/bicluster.py
@@ -107,7 +107,7 @@ def _check_parameters(self):
" one of {1}.".format(self.svd_method,
legal_svd_methods))

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Creates a biclustering for X.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/birch.py
@@ -429,7 +429,7 @@ def __init__(self, threshold=0.5, branching_factor=50, n_clusters=3,
self.compute_labels = compute_labels
self.copy = copy

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""
Build a CF Tree for the input data.

2 changes: 1 addition & 1 deletion sklearn/cluster/hierarchical.py
@@ -774,7 +774,7 @@ def __init__(self, n_clusters=2, affinity="euclidean",
def n_components_(self):
return self.n_connected_components_

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Fit the hierarchical clustering from features, or distance matrix.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/mean_shift_.py
@@ -403,7 +403,7 @@ def __init__(self, bandwidth=None, seeds=None, bin_seeding=False,
self.min_bin_freq = min_bin_freq
self.n_jobs = n_jobs

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Perform clustering.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/optics_.py
@@ -212,7 +212,7 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='minkowski', p=2,
self.predecessor_correction = predecessor_correction
self.n_jobs = n_jobs

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Perform OPTICS clustering

Extracts an ordered list of points and reachability distances, and
2 changes: 1 addition & 1 deletion sklearn/cluster/spectral.py
@@ -445,7 +445,7 @@ def __init__(self, n_clusters=8, eigen_solver=None, n_components=None,
self.kernel_params = kernel_params
self.n_jobs = n_jobs

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Perform spectral clustering from features, or affinity matrix.

Parameters