RFC/WIP Feature names within fit #14238

Closed (wants to merge 4 commits)
6 changes: 3 additions & 3 deletions doc/developers/contributing.rst
@@ -927,7 +927,7 @@ When the change is in a class, we validate and raise warning in ``fit``::
self.n_clusters = n_clusters
self.k = k

def fit(self, X, y):
def fit(self, X, y, feature_names_in=None):
if self.k != 'not_used':
warnings.warn("'k' was renamed to n_clusters in version 0.13 and "
"will be removed in 0.15.", DeprecationWarning)
@@ -983,7 +983,7 @@ When the change is in a class, we validate and raise warning in ``fit``::
def __init__(self, n_clusters='warn'):
self.n_clusters = n_clusters

def fit(self, X, y):
def fit(self, X, y, feature_names_in=None):
if self.n_clusters == 'warn':
warnings.warn("The default value of n_clusters will change from "
"5 to 10 in 0.22.", FutureWarning)
@@ -1339,7 +1339,7 @@ the correct interface more easily.
... def __init__(self, demo_param='demo'):
... self.demo_param = demo_param
...
... def fit(self, X, y):
... def fit(self, X, y, feature_names_in=None):
...
... # Check that X and y have correct shape
... X, y = check_X_y(X, y)
38 changes: 32 additions & 6 deletions doc/modules/compose.rst
@@ -136,6 +136,32 @@ or by name::
>>> pipe['reduce_dim']
PCA()

To enable model inspection, `Pipeline` sets an ``input_features_`` attribute on
all pipeline steps during fitting. This allows the user to understand how
features are transformed as they pass through the pipeline::

>>> from sklearn.datasets import load_iris
>>> from sklearn.feature_selection import SelectKBest
>>> iris = load_iris()
>>> pipe = Pipeline(steps=[
... ('select', SelectKBest(k=2)),
... ('clf', LogisticRegression())])
>>> pipe.fit(iris.data, iris.target)
... # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
Pipeline(memory=None,
steps=[('select', SelectKBest(...)), ('clf', LogisticRegression(...))])
>>> pipe.named_steps.clf.input_features_
array(['x2', 'x3'], dtype='<U2')

You can also provide custom feature names to get a more human-readable output
using ``get_feature_names``::

>>> pipe.get_feature_names(iris.feature_names)
>>> pipe.named_steps.select.input_features_
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
>>> pipe.named_steps.clf.input_features_
array(['petal length (cm)', 'petal width (cm)'], dtype='<U17')
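
As a minimal sketch (assuming every fitted step exposes the ``input_features_``
attribute described above), the names seen by each step can be listed with a
plain loop::

    for name, step in pipe.named_steps.items():
        print(name, step.input_features_)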

.. topic:: Examples:

* :ref:`sphx_glr_auto_examples_feature_selection_plot_feature_selection_pipeline.py`
@@ -431,7 +457,7 @@ By default, the remaining rating columns are ignored (``remainder='drop'``)::
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> from sklearn.preprocessing import OneHotEncoder
>>> column_trans = ColumnTransformer(
... [('city_category', OneHotEncoder(dtype='int'),['city']),
... [('categories', OneHotEncoder(dtype='int'),['city']),
... ('title_bow', CountVectorizer(), 'title')],
... remainder='drop')

@@ -441,11 +467,11 @@ By default, the remaining rating columns are ignored (``remainder='drop'``)::
('title_bow', CountVectorizer(), 'title')])

>>> column_trans.get_feature_names()
['city_category__x0_London', 'city_category__x0_Paris', 'city_category__x0_Sallisaw',
'title_bow__bow', 'title_bow__feast', 'title_bow__grapes', 'title_bow__his',
'title_bow__how', 'title_bow__last', 'title_bow__learned', 'title_bow__moveable',
'title_bow__of', 'title_bow__the', 'title_bow__trick', 'title_bow__watson',
'title_bow__wrath']
['categories__city_London', 'categories__city_Paris',
'categories__city_Sallisaw', 'title_bow__bow', 'title_bow__feast',
'title_bow__grapes', 'title_bow__his', 'title_bow__how', 'title_bow__last',
'title_bow__learned', 'title_bow__moveable', 'title_bow__of', 'title_bow__the',
'title_bow__trick', 'title_bow__watson', 'title_bow__wrath']

>>> column_trans.transform(X).toarray()
array([[1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
2 changes: 1 addition & 1 deletion examples/cluster/plot_inductive_clustering.py
@@ -40,7 +40,7 @@ def __init__(self, clusterer, classifier):
self.clusterer = clusterer
self.classifier = classifier

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
self.clusterer_ = clone(self.clusterer)
self.classifier_ = clone(self.classifier)
y = self.clusterer_.fit_predict(X)
4 changes: 2 additions & 2 deletions examples/compose/plot_column_transformer.py
@@ -45,7 +45,7 @@
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""

def fit(self, x, y=None):
def fit(self, X, y=None, feature_names_in=None):
return self

def transform(self, posts):
@@ -60,7 +60,7 @@ class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
Takes a sequence of strings and produces a dict of sequences. Keys are
`subject` and `body`.
"""
def fit(self, x, y=None):
def fit(self, X, y=None, feature_names_in=None):
return self

def transform(self, posts):
54 changes: 49 additions & 5 deletions examples/compose/plot_column_transformer_mixed_types.py
@@ -68,16 +68,60 @@

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression())])
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', LogisticRegression())])

X = data.drop('survived', axis=1)
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
pipeline.fit(X_train, y_train)
print("model score: %.3f" % pipeline.score(X_test, y_test))


###############################################################################
# Inspecting the coefficients values of the classifier
###############################################################################
# The coefficients of the final classification step of the pipeline give an
# idea of how each feature impacts the likelihood of survival, assuming that
# the usual linear model assumptions hold (uncorrelated features, linear
# separability, homoscedastic errors...), which we do not verify in this
# example.
#
# To get error bars we perform cross-validation and compute the mean and
# standard deviation of each coefficient across CV splits. Because we use a
# standard scaler on the numerical features, the coefficient weights give us
# an idea of how much the log odds of surviving are impacted by a change in
# this dimension relative to the mean. Note that the categorical features
# here are overspecified, which makes them slightly harder to interpret
# because of the information redundancy.
#
# We can see that the linear model coefficients are in agreement with the
# historical reports: people in higher classes and therefore in the upper decks
# were the first to reach the lifeboats, and often, priority was given to women
# and children.
#
# Note that conditioned on the "pclass_x" one-hot features, the "fare"
# numerical feature does not seem to be significantly predictive. If we drop
# the "pclass" feature, then higher "fare" values would appear significantly
# correlated with a higher likelihood of survival, as the "fare" and "pclass"
# features have a strong statistical dependency.

import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedShuffleSplit

cv = StratifiedShuffleSplit(n_splits=20, test_size=0.25, random_state=42)
cv_results = cross_validate(pipeline, X_train, y_train, cv=cv,
return_estimator=True)
cv_coefs = np.concatenate([cv_pipeline.named_steps["classifier"].coef_
for cv_pipeline in cv_results["estimator"]])
fig, ax = plt.subplots()
ax.barh(pipeline.named_steps["classifier"].feature_names_in_,
cv_coefs.mean(axis=0), xerr=cv_coefs.std(axis=0))
plt.tight_layout()
plt.show()
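
###############################################################################
# As a quick, informal check of the statistical dependency between "fare" and
# "pclass" mentioned above (a sketch assuming ``X_train`` is the pandas
# DataFrame built earlier, with both columns present), compare the average
# fare per passenger class:

print(X_train.groupby("pclass")["fare"].mean())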


###############################################################################
@@ -96,7 +140,7 @@
'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=10)
grid_search = GridSearchCV(pipeline, param_grid, cv=10)
grid_search.fit(X_train, y_train)

print(("best logistic regression from grid search: %.3f"
9 changes: 6 additions & 3 deletions examples/feature_selection/plot_feature_selection_pipeline.py
@@ -9,6 +9,7 @@
Using a sub-pipeline, the fitted coefficients can be mapped back into
the original feature space.
"""
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.datasets import samples_generator
from sklearn.feature_selection import SelectKBest, f_regression
@@ -20,7 +21,7 @@

# import some data to play with
X, y = samples_generator.make_classification(
n_features=20, n_informative=3, n_redundant=0, n_classes=4,
n_features=20, n_informative=3, n_redundant=0, n_classes=2,
n_clusters_per_class=2)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
@@ -36,5 +37,7 @@
y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))

coef = anova_svm[:-1].inverse_transform(anova_svm['linearsvc'].coef_)
print(coef)
# access and plot the coefficients of the fitted model
plt.barh((0, 1, 2), anova_svm[-1].coef_.ravel())
plt.yticks((0, 1, 2), anova_svm[-1].feature_names_in_)
plt.show()
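
# As an extra, informal cross-check (a sketch assuming the first step of
# ``anova_svm`` is the ``SelectKBest`` selector), ``get_support`` reports
# which of the 20 original columns were retained:
print(anova_svm[0].get_support(indices=True))
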
59 changes: 59 additions & 0 deletions sklearn/base.py
@@ -6,15 +6,19 @@
import copy
import warnings
from collections import defaultdict

import platform
import inspect
import re

import numpy as np

from . import __version__
from .exceptions import NotFittedError

from .utils import _IS_32BIT


_DEFAULT_TAGS = {
'non_deterministic': False,
'requires_positive_X': False,
@@ -558,6 +562,49 @@ def fit_transform(self, X, y=None, **fit_params):
# fit method of arity 2 (supervised transformation)
return self.fit(X, y, **fit_params).transform(X)

def _get_feature_names(self, input_features=None):
"""Get output feature names.

Parameters
----------
input_features : list of string or None
String names of the input features.

Returns
-------
output_feature_names : list of string
Feature names for transformer output.
"""
# OneToOneMixin is higher in the class hierarchy
# because we put mixins on the wrong side
if hasattr(super(), 'get_feature_names'):
return super().get_feature_names(input_features)
# generate feature names from the class name by default;
# there would be much less guessing if we stored the number
# of output features.
# Ideally this would be done in each class.
if hasattr(self, 'n_clusters'):
# this is before n_components_
# because n_components_ means something else
# in agglomerative clustering
n_features = self.n_clusters
elif hasattr(self, '_max_components'):
# special case for LinearDiscriminantAnalysis
n_components = self.n_components or np.inf
n_features = min(self._max_components, n_components)
elif hasattr(self, 'n_components_'):
# n_components could be auto or None
# this is more likely to be an int
n_features = self.n_components_
elif hasattr(self, 'n_components') and self.n_components is not None:
n_features = self.n_components
elif hasattr(self, 'components_'):
n_features = self.components_.shape[0]
else:
return None
return ["{}{}".format(type(self).__name__.lower(), i)
for i in range(n_features)]


class DensityMixin:
"""Mixin class for all density estimators in scikit-learn."""
@@ -603,6 +650,18 @@ def fit_predict(self, X, y=None):
return self.fit(X).predict(X)


class OneToOneMixin(object):
"""Provides get_feature_names for simple transformers

Assumes there's a 1-to-1 correspondence between input features
and output features.
"""

@property
def feature_names_out_(self):
return self.feature_names_in_


class MetaEstimatorMixin:
_required_parameters = ["estimator"]
"""Mixin class for all meta estimators in scikit-learn."""
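
A minimal sketch of the fallback naming behaviour implemented by
``_get_feature_names`` above, assuming an estimator such as ``PCA`` that
exposes ``n_components_`` after fitting (output names are derived from the
lowercased class name)::

    from sklearn.datasets import load_iris
    from sklearn.decomposition import PCA

    X, _ = load_iris(return_X_y=True)
    pca = PCA(n_components=3).fit(X)
    # same formatting rule as the default branch of _get_feature_names
    print(["{}{}".format(type(pca).__name__.lower(), i)
           for i in range(pca.n_components_)])  # ['pca0', 'pca1', 'pca2']

For a transformer inheriting ``OneToOneMixin``, ``feature_names_out_`` would
simply echo ``feature_names_in_``, as shown in the mixin above.
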
6 changes: 3 additions & 3 deletions sklearn/calibration.py
@@ -109,7 +109,7 @@ def __init__(self, base_estimator=None, method='sigmoid', cv=None):
self.method = method
self.cv = cv

def fit(self, X, y, sample_weight=None):
def fit(self, X, y, sample_weight=None, feature_names_in=None):
"""Fit the calibrated model

Parameters
@@ -312,7 +312,7 @@ def _preproc(self, X):

return df, idx_pos_class

def fit(self, X, y, sample_weight=None):
def fit(self, X, y, sample_weight=None, feature_names_in=None):
"""Calibrate the fitted model

Parameters
@@ -474,7 +474,7 @@ class _SigmoidCalibration(BaseEstimator, RegressorMixin):
b_ : float
The intercept.
"""
def fit(self, X, y, sample_weight=None):
def fit(self, X, y, sample_weight=None, feature_names_in=None):
"""Fit the model using X, y as training data.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/affinity_propagation_.py
@@ -349,7 +349,7 @@ def __init__(self, damping=.5, max_iter=200, convergence_iter=15,
def _pairwise(self):
return self.affinity == "precomputed"

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Fit the clustering from features, or affinity matrix.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/bicluster.py
@@ -107,7 +107,7 @@ def _check_parameters(self):
" one of {1}.".format(self.svd_method,
legal_svd_methods))

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Creates a biclustering for X.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/birch.py
@@ -429,7 +429,7 @@ def __init__(self, threshold=0.5, branching_factor=50, n_clusters=3,
self.compute_labels = compute_labels
self.copy = copy

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""
Build a CF Tree for the input data.

2 changes: 1 addition & 1 deletion sklearn/cluster/hierarchical.py
@@ -774,7 +774,7 @@ def __init__(self, n_clusters=2, affinity="euclidean",
def n_components_(self):
return self.n_connected_components_

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Fit the hierarchical clustering from features, or distance matrix.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/mean_shift_.py
@@ -403,7 +403,7 @@ def __init__(self, bandwidth=None, seeds=None, bin_seeding=False,
self.min_bin_freq = min_bin_freq
self.n_jobs = n_jobs

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Perform clustering.

Parameters
2 changes: 1 addition & 1 deletion sklearn/cluster/optics_.py
@@ -212,7 +212,7 @@ def __init__(self, min_samples=5, max_eps=np.inf, metric='minkowski', p=2,
self.predecessor_correction = predecessor_correction
self.n_jobs = n_jobs

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Perform OPTICS clustering

Extracts an ordered list of points and reachability distances, and
2 changes: 1 addition & 1 deletion sklearn/cluster/spectral.py
@@ -445,7 +445,7 @@ def __init__(self, n_clusters=8, eigen_solver=None, n_components=None,
self.kernel_params = kernel_params
self.n_jobs = n_jobs

def fit(self, X, y=None):
def fit(self, X, y=None, feature_names_in=None):
"""Perform spectral clustering from features, or affinity matrix.

Parameters