scikit-learn · NicolasHug · Feb 29, 2020 · Apr 9, 2019 · Apr 19, 2019 · Apr 19, 2019
diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst
@@ -226,6 +226,17 @@ the dataset, e.g. when ``X`` is a precomputed kernel matrix. Specifically,
 the :term:`_pairwise` property is used by ``utils.metaestimators._safe_split``
 to slice rows and columns.
 
+Universal attributes
+^^^^^^^^^^^^^^^^^^^^
+
+Estimators that expect tabular input should set a `n_features_in_`
+attribute at `fit` time to indicate the number of features that the estimator
+expects for subsequent calls to `predict` or `transform`.
+See
+`SLEP010
+<https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html>`_
+for details.
+
 .. _rolling_your_own_estimator:
 
 Rolling your own estimator

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
@@ -14,7 +14,6 @@ Version 0.23.0
 
 Put the changes in their relevant module.
 
-
 Changed models
 --------------
 
@@ -378,3 +377,12 @@ Changelog
 - |Fix| :class:`cluster.AgglomerativeClustering` add specific error when
   distance matrix is not square and `affinity=precomputed`.
   :pr:`16257` by :user:`Simona Maggio <simonamaggio>`.
+
+Miscellaneous
+.............
+
+- |API| Most estimators now expose a `n_features_in_` attribute. This
+  attribute is equal to the number of features passed to the `fit` method.
+  See `SLEP010
+  <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html>`_
+  for details. :pr:`16112` by `Nicolas Hug`_.
diff --git a/sklearn/base.py b/sklearn/base.py
@@ -18,6 +18,8 @@
 
 from . import __version__
 from .utils import _IS_32BIT
+from .utils.validation import check_X_y
+from .utils.validation import check_array
 
 _DEFAULT_TAGS = {
     'non_deterministic': False,
@@ -343,6 +345,72 @@ def _get_tags(self):
                 collected_tags.update(more_tags)
         return collected_tags
 
+    def _check_n_features(self, X, reset):
+        """Set the `n_features_in_` attribute, or check against it.
+
+        Parameters
+        ----------
+        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
+            The input samples.
+        reset : bool
+            If True, the `n_features_in_` attribute is set to `X.shape[1]`.
+            Else, the attribute must already exist and the function checks
+            that it is equal to `X.shape[1]`.
+        """
+        n_features = X.shape[1]
+
+        if reset:
+            self.n_features_in_ = n_features
+        else:
+            if not hasattr(self, 'n_features_in_'):
+                raise RuntimeError(
+                    "The reset parameter is False but there is no "
+                    "n_features_in_ attribute. Is this estimator fitted?"
+                )
+            if n_features != self.n_features_in_:
+                raise ValueError(
+                    'X has {} features, but this {} is expecting {} features '
+                    'as input.'.format(n_features, self.__class__.__name__,
+                                       self.n_features_in_)
+                )
+
+    def _validate_data(self, X, y=None, reset=True, **check_params):
+        """Validate input data and set or check the `n_features_in_` attribute.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix, dataframe} of shape \
+                (n_samples, n_features)
+            The input samples.
+        y : array-like of shape (n_samples,), default=None
+            The targets. If None, `check_array` is called on `X` and
+            `check_X_y` is called otherwise.
+        reset : bool, default=True
+            Whether to reset the `n_features_in_` attribute.
+            If False, the input will be checked for consistency with data
+            provided when reset was last True.
+        **check_params : kwargs
+            Parameters passed to :func:`sklearn.utils.check_array` or
+            :func:`sklearn.utils.check_X_y`.
+
+        Returns
+        -------
+        out : {ndarray, sparse matrix} or tuple of these
+            The validated input. A tuple is returned if `y` is not None.
+        """
+
+        if y is None:
+            X = check_array(X, **check_params)
+            out = X
+        else:
+            X, y = check_X_y(X, y, **check_params)
+            out = X, y
+
+        if check_params.get('ensure_2d', True):
+            self._check_n_features(X, reset=reset)
+
+        return out
+
 
 class ClassifierMixin:
     """Mixin class for all classifiers in scikit-learn."""

diff --git a/sklearn/calibration.py b/sklearn/calibration.py
@@ -124,8 +124,8 @@ def fit(self, X, y, sample_weight=None):
         self : object
             Returns an instance of self.
         """
-        X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'],
-                         force_all_finite=False, allow_nd=True)
+        X, y = self._validate_data(X, y, accept_sparse=['csc', 'csr', 'coo'],
+                                   force_all_finite=False, allow_nd=True)
         X, y = indexable(X, y)
         le = LabelBinarizer().fit(y)
         self.classes_ = le.classes_

diff --git a/sklearn/cluster/_affinity_propagation.py b/sklearn/cluster/_affinity_propagation.py
@@ -374,7 +374,7 @@ def fit(self, X, y=None):
             accept_sparse = False
         else:
             accept_sparse = 'csr'
-        X = check_array(X, accept_sparse=accept_sparse)
+        X = self._validate_data(X, accept_sparse=accept_sparse)
         if self.affinity == "precomputed":
             self.affinity_matrix_ = X
         elif self.affinity == "euclidean":

diff --git a/sklearn/cluster/_agglomerative.py b/sklearn/cluster/_agglomerative.py
@@ -809,7 +809,7 @@ def fit(self, X, y=None):
         -------
         self
         """
-        X = check_array(X, ensure_min_samples=2, estimator=self)
+        X = self._validate_data(X, ensure_min_samples=2, estimator=self)
         memory = check_memory(self.memory)
 
         if self.n_clusters is not None and self.n_clusters <= 0:
@@ -1055,9 +1055,14 @@ def fit(self, X, y=None, **params):
         -------
         self
         """
-        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
-                        ensure_min_features=2, estimator=self)
-        return AgglomerativeClustering.fit(self, X.T, **params)
+        X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
+                                ensure_min_features=2, estimator=self)
+        # save n_features_in_ attribute here to reset it after, because it will
+        # be overridden in AgglomerativeClustering since we passed it X.T.
+        n_features_in_ = self.n_features_in_
+        AgglomerativeClustering.fit(self, X.T, **params)
+        self.n_features_in_ = n_features_in_
+        return self
 
     @property
     def fit_predict(self):

diff --git a/sklearn/cluster/_bicluster.py b/sklearn/cluster/_bicluster.py
@@ -121,7 +121,7 @@ def fit(self, X, y=None):
             warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
                           " removed in 0.25.", FutureWarning)
 
-        X = check_array(X, accept_sparse='csr', dtype=np.float64)
+        X = self._validate_data(X, accept_sparse='csr', dtype=np.float64)
         self._check_parameters()
         self._fit(X)
         return self

diff --git a/sklearn/cluster/_birch.py b/sklearn/cluster/_birch.py
@@ -463,7 +463,7 @@ def fit(self, X, y=None):
         return self._fit(X)
 
     def _fit(self, X):
-        X = check_array(X, accept_sparse='csr', copy=self.copy)
+        X = self._validate_data(X, accept_sparse='csr', copy=self.copy)
         threshold = self.threshold
         branching_factor = self.branching_factor
 

diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py
@@ -308,7 +308,7 @@ def fit(self, X, y=None, sample_weight=None):
         self
 
         """
-        X = check_array(X, accept_sparse='csr')
+        X = self._validate_data(X, accept_sparse='csr')
 
         if not self.eps > 0.0:
             raise ValueError("eps must be positive.")

diff --git a/sklearn/cluster/_kmeans.py b/sklearn/cluster/_kmeans.py
@@ -975,8 +975,10 @@ def fit(self, X, y=None, sample_weight=None):
                 ' got %d instead' % self.max_iter
             )
 
-        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
-                        order='C', copy=self.copy_x, accept_large_sparse=False)
+        X = self._validate_data(X, accept_sparse='csr',
+                                dtype=[np.float64, np.float32],
+                                order='C', copy=self.copy_x,
+                                accept_large_sparse=False)
         # verify that the number of samples given is larger than k
         if _num_samples(X) < self.n_clusters:
             raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
@@ -1591,8 +1593,8 @@ def fit(self, X, y=None, sample_weight=None):
         self
         """
         random_state = check_random_state(self.random_state)
-        X = check_array(X, accept_sparse="csr", order='C',
-                        dtype=[np.float64, np.float32])
+        X = self._validate_data(X, accept_sparse="csr", order='C',
+                                dtype=[np.float64, np.float32])
         n_samples, n_features = X.shape
         if n_samples < self.n_clusters:
             raise ValueError("n_samples=%d should be >= n_clusters=%d"

diff --git a/sklearn/cluster/_mean_shift.py b/sklearn/cluster/_mean_shift.py
@@ -368,7 +368,7 @@ def fit(self, X, y=None):
         y : Ignored
 
         """
-        X = check_array(X)
+        X = self._validate_data(X)
         bandwidth = self.bandwidth
         if bandwidth is None:
             bandwidth = estimate_bandwidth(X, n_jobs=self.n_jobs)

diff --git a/sklearn/cluster/_optics.py b/sklearn/cluster/_optics.py
@@ -245,7 +245,7 @@ def fit(self, X, y=None):
         self : instance of OPTICS
             The instance.
         """
-        X = check_array(X, dtype=np.float)
+        X = self._validate_data(X, dtype=np.float)
 
         if self.cluster_method not in ['dbscan', 'xi']:
             raise ValueError("cluster_method should be one of"

diff --git a/sklearn/cluster/_spectral.py b/sklearn/cluster/_spectral.py
@@ -474,8 +474,8 @@ def fit(self, X, y=None):
         self
 
         """
-        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
-                        dtype=np.float64, ensure_min_samples=2)
+        X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
+                                dtype=np.float64, ensure_min_samples=2)
         allow_squared = self.affinity in ["precomputed",
                                           "precomputed_nearest_neighbors"]
         if X.shape[0] == X.shape[1] and not allow_squared:

diff --git a/sklearn/cluster/tests/test_bicluster.py b/sklearn/cluster/tests/test_bicluster.py
@@ -254,6 +254,17 @@ def test_wrong_shape():
         model.fit(data)
 
 
+@pytest.mark.parametrize('est',
+                         (SpectralBiclustering(), SpectralCoclustering()))
+def test_n_features_in_(est):
+
+    X, _, _ = make_biclusters((3, 3), 3, random_state=0)
+
+    assert not hasattr(est, 'n_features_in_')
+    est.fit(X)
+    assert est.n_features_in_ == 3
+
+
 @pytest.mark.parametrize("klass", [SpectralBiclustering, SpectralCoclustering])
 @pytest.mark.parametrize("n_jobs", [None, 1])
 def test_n_jobs_deprecated(klass, n_jobs):

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -513,6 +513,8 @@ def fit_transform(self, X, y=None):
         else:
             self._feature_names_in = None
         X = _check_X(X)
+        # set n_features_in_ attribute
+        self._check_n_features(X, reset=True)
         self._validate_transformers()
         self._validate_column_callables(X)
         self._validate_remainder(X)
@@ -587,6 +589,7 @@ def transform(self, X):
                                  'and for transform when using the '
                                  'remainder keyword')
 
+        # TODO: also call _check_n_features(reset=False) in 0.24
         self._validate_features(X.shape[1], X_feature_names)
         Xs = self._fit_transform(X, None, _transform_one, fitted=True)
         self._validate_output(Xs)

diff --git a/sklearn/compose/_target.py b/sklearn/compose/_target.py
@@ -10,6 +10,7 @@
 from ..utils.validation import check_is_fitted
 from ..utils import check_array, _safe_indexing
 from ..preprocessing import FunctionTransformer
+from ..exceptions import NotFittedError
 
 __all__ = ['TransformedTargetRegressor']
 
@@ -235,3 +236,17 @@ def predict(self, X):
 
     def _more_tags(self):
         return {'poor_score': True, 'no_validation': True}
+
+    @property
+    def n_features_in_(self):
+        # For consistency with other estimators we raise a AttributeError so
+        # that hasattr() returns False the estimator isn't fitted.
+        try:
+            check_is_fitted(self)
+        except NotFittedError as nfe:
+            raise AttributeError(
+                "{} object has no n_features_in_ attribute."
+                .format(self.__class__.__name__)
+            ) from nfe
+
+        return self.regressor_.n_features_in_
diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -1187,6 +1187,18 @@ def test_column_transformer_mask_indexing(array_type):
     assert X_trans.shape == (3, 2)
 
 
+def test_n_features_in():
+    # make sure n_features_in is what is passed as input to the column
+    # transformer.
+
+    X = [[1, 2], [3, 4], [5, 6]]
+    ct = ColumnTransformer([('a', DoubleTrans(), [0]),
+                            ('b', DoubleTrans(), [1])])
+    assert not hasattr(ct, 'n_features_in_')
+    ct.fit(X)
+    assert ct.n_features_in_ == 2
+
+
 @pytest.mark.parametrize('cols, pattern, include, exclude', [
     (['col_int', 'col_float'], None, np.number, None),
     (['col_int', 'col_float'], None, None, object),

diff --git a/sklearn/covariance/_empirical_covariance.py b/sklearn/covariance/_empirical_covariance.py
@@ -198,7 +198,7 @@ def fit(self, X, y=None):
         -------
         self : object
         """
-        X = check_array(X)
+        X = self._validate_data(X)
         if self.assume_centered:
             self.location_ = np.zeros(X.shape[1])
         else:

diff --git a/sklearn/covariance/_graph_lasso.py b/sklearn/covariance/_graph_lasso.py
@@ -382,8 +382,8 @@ def fit(self, X, y=None):
         self : object
         """
         # Covariance does not make sense for a single feature
-        X = check_array(X, ensure_min_features=2, ensure_min_samples=2,
-                        estimator=self)
+        X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2,
+                                estimator=self)
 
         if self.assume_centered:
             self.location_ = np.zeros(X.shape[1])
@@ -659,7 +659,7 @@ def fit(self, X, y=None):
         self : object
         """
         # Covariance does not make sense for a single feature
-        X = check_array(X, ensure_min_features=2, estimator=self)
+        X = self._validate_data(X, ensure_min_features=2, estimator=self)
         if self.assume_centered:
             self.location_ = np.zeros(X.shape[1])
         else:

diff --git a/sklearn/covariance/_robust_covariance.py b/sklearn/covariance/_robust_covariance.py
@@ -637,7 +637,7 @@ def fit(self, X, y=None):
         -------
         self : object
         """
-        X = check_array(X, ensure_min_samples=2, estimator='MinCovDet')
+        X = self._validate_data(X, ensure_min_samples=2, estimator='MinCovDet')
         random_state = check_random_state(self.random_state)
         n_samples, n_features = X.shape
         # check that the empirical covariance is full rank

diff --git a/sklearn/covariance/_shrunk_covariance.py b/sklearn/covariance/_shrunk_covariance.py
@@ -140,7 +140,7 @@ def fit(self, X, y=None):
         -------
         self : object
         """
-        X = check_array(X)
+        X = self._validate_data(X)
         # Not calling the parent object to fit, to avoid a potential
         # matrix inversion when setting the precision
         if self.assume_centered:
@@ -412,7 +412,7 @@ def fit(self, X, y=None):
         """
         # Not calling the parent object to fit, to avoid computing the
         # covariance matrix (and potentially the precision)
-        X = check_array(X)
+        X = self._validate_data(X)
         if self.assume_centered:
             self.location_ = np.zeros(X.shape[1])
         else:
@@ -562,7 +562,7 @@ def fit(self, X, y=None):
         -------
         self : object
         """
-        X = check_array(X)
+        X = self._validate_data(X)
         # Not calling the parent object to fit, to avoid computing the
         # covariance matrix (and potentially the precision)
         if self.assume_centered: