
[MRG] Add n_features_in_ attribute to BaseEstimator #13603


Status: Closed · wants to merge 50 commits

Commits
7d9dcc4
Basic validate_X and validate_X_y methods for _n_features_in attribute
NicolasHug Apr 9, 2019
f117745
created NonRectangularInputMixin
NicolasHug Apr 19, 2019
95b330c
Merge remote-tracking branch 'upstream/master' into n_features_in
NicolasHug Apr 19, 2019
e56592b
resolved conflicts
NicolasHug Apr 19, 2019
3bdcb5c
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug May 30, 2019
8ecc690
_validate** is not private
NicolasHug May 30, 2019
60e4cea
Added support for pipeline and grid search
NicolasHug May 30, 2019
ff19f22
pep8
NicolasHug May 31, 2019
a44318b
Trigger CI??
NicolasHug May 31, 2019
42249fb
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Jun 26, 2019
abdc94e
Added to decision tree for gridsearch tests to pass
NicolasHug Jun 26, 2019
a50e76f
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Aug 1, 2019
62fc42e
Added support for ColumnTransformer and FeatureUnion
NicolasHug Aug 1, 2019
6845788
pep8
NicolasHug Aug 1, 2019
3246436
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Aug 7, 2019
ee2598b
BaseSearchCV now raises AttributeError
NicolasHug Aug 12, 2019
6a14e4b
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Aug 19, 2019
3f2d44f
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Sep 2, 2019
25fda0f
Added common test + used _validate_XXX on most estimators
NicolasHug Sep 2, 2019
9bdfb65
Fixed some test
NicolasHug Sep 2, 2019
be76ef4
fixed issues for some estimators
NicolasHug Sep 4, 2019
b464f86
Merge branch 'n_features_in' of github.com:NicolasHug/scikit-learn; b…
NicolasHug Sep 5, 2019
70dc4ed
fixed tests in test_data.py
NicolasHug Sep 5, 2019
988f9c4
Fixed some tests
NicolasHug Sep 5, 2019
fd9b72c
validate twice for Kmeans and FastICA
NicolasHug Sep 5, 2019
4f3d6ff
again
NicolasHug Sep 5, 2019
08f7192
and again
NicolasHug Sep 5, 2019
5a41275
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Sep 6, 2019
f0e7b41
should fix dep warning error
NicolasHug Sep 6, 2019
193fda1
removed superfluous tests
NicolasHug Sep 8, 2019
5b20a4c
Added specific tests for vectorizers
NicolasHug Sep 8, 2019
a49e5ea
flake8
NicolasHug Sep 8, 2019
968fbff
Dummies now have n_feautures_in_ to None and raise error if not fitted
NicolasHug Sep 9, 2019
e4faf13
still don't check n_features_in_ for LDA (will be done in later PR)
NicolasHug Sep 9, 2019
908aea6
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Sep 11, 2019
a88a4c5
Added tests for some estimators
NicolasHug Sep 11, 2019
f3fb539
removed NonRectangularInputMixin and set n_features_in to SparseCoder
NicolasHug Sep 11, 2019
4b7b758
simpler logic for dummies
NicolasHug Sep 12, 2019
53027d3
comments
NicolasHug Sep 12, 2019
c5dfbbd
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Sep 15, 2019
a1aea70
pep8
NicolasHug Sep 15, 2019
9ecc396
remove print
NicolasHug Sep 15, 2019
e9c3104
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Sep 16, 2019
9292c84
avoid dep warning
NicolasHug Sep 16, 2019
e11b0bb
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Sep 18, 2019
6846bea
merged (maybe)
NicolasHug Sep 19, 2019
60c5108
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Sep 19, 2019
615140e
Merge branch 'master' of github.com:scikit-learn/scikit-learn into n_…
NicolasHug Sep 19, 2019
fe052e6
set n_features_in_ for stacking estimators
NicolasHug Sep 19, 2019
9a205dd
dont hardcode attribute in init for sparsecoder
NicolasHug Sep 25, 2019
27 changes: 27 additions & 0 deletions sklearn/base.py
@@ -14,6 +14,8 @@

from . import __version__
from .utils import _IS_32BIT
from .utils.validation import check_X_y
from .utils.validation import check_array

_DEFAULT_TAGS = {
'non_deterministic': False,
@@ -323,6 +325,31 @@ def _get_tags(self):
collected_tags.update(more_tags)
return collected_tags

def _validate_n_features(self, X, check_n_features):
if check_n_features:
if not hasattr(self, 'n_features_in_'):
raise RuntimeError(
"check_n_features is True but there is no n_features_in_ "
"attribute."
)
if X.shape[1] != self.n_features_in_:
raise ValueError(
'X has {} features, but this {} is expecting {} features '
'as input.'.format(X.shape[1], self.__class__.__name__,
self.n_features_in_)
)
else:
self.n_features_in_ = X.shape[1]

def _validate_X(self, X, check_n_features=False, **check_array_params):
Member:
Does using **kwargs make it harder to add and remove parameters to this function?
Seems potentially a bit tricky, right?
I guess the worst thing that could happen is that we add another argument here in a future version, and a user on an older version who tries to pass it will have it land in **check_array_params, where check_array will raise an error?

Is there an issue with making all parameters explicit here?

Member (Author):
I'm not sure I understand your concern.

I can definitely make all the parameters explicit. The only downside is that we have to keep the signature synchronized with that of check_array, but that might be a good thing.

Member:
That would only mean the user hasn't properly specified the minimum sklearn version requirement. I don't think that's something we should worry about.
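For reference, a minimal sketch of the explicit-signature alternative discussed in this thread (hypothetical; only a subset of check_array's parameters is shown):

    def _validate_X(self, X, check_n_features=False, accept_sparse=False,
                    dtype='numeric', copy=False, ensure_min_samples=1):
        # Mirror check_array's parameters explicitly instead of forwarding
        # **kwargs; an unknown keyword then fails at the call site rather
        # than inside check_array.
        X = check_array(X, accept_sparse=accept_sparse, dtype=dtype,
                        copy=copy, ensure_min_samples=ensure_min_samples)
        self._validate_n_features(X, check_n_features)
        return X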

X = check_array(X, **check_array_params)
self._validate_n_features(X, check_n_features)
return X

def _validate_X_y(self, X, y, check_n_features=False, **check_X_y_params):
X, y = check_X_y(X, y, **check_X_y_params)
self._validate_n_features(X, check_n_features)
return X, y

class ClassifierMixin:
"""Mixin class for all classifiers in scikit-learn."""
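To show how the two helpers above are meant to interact, here is a minimal sketch with a hypothetical ToyEstimator; the behavior follows the _validate_X and _validate_n_features code in this diff:

    import numpy as np
    from sklearn.base import BaseEstimator

    class ToyEstimator(BaseEstimator):
        def fit(self, X, y=None):
            X = self._validate_X(X)  # records X.shape[1] as n_features_in_
            return self

        def predict(self, X):
            # check_n_features=True enforces the count recorded at fit time
            X = self._validate_X(X, check_n_features=True)
            return np.zeros(X.shape[0])

    est = ToyEstimator().fit(np.ones((10, 3)))
    assert est.n_features_in_ == 3
    est.predict(np.ones((5, 4)))
    # ValueError: X has 4 features, but this ToyEstimator is expecting
    # 3 features as input.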
4 changes: 2 additions & 2 deletions sklearn/calibration.py
@@ -130,8 +130,8 @@ def fit(self, X, y, sample_weight=None):
self : object
Returns an instance of self.
"""
X, y = check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'],
force_all_finite=False, allow_nd=True)
X, y = self._validate_X_y(X, y, accept_sparse=['csc', 'csr', 'coo'],
force_all_finite=False, allow_nd=True)
X, y = indexable(X, y)
le = LabelBinarizer().fit(y)
self.classes_ = le.classes_
2 changes: 1 addition & 1 deletion sklearn/cluster/affinity_propagation_.py
@@ -372,7 +372,7 @@ def fit(self, X, y=None):
accept_sparse = False
else:
accept_sparse = 'csr'
X = check_array(X, accept_sparse=accept_sparse)
X = self._validate_X(X, accept_sparse=accept_sparse)
if self.affinity == "precomputed":
self.affinity_matrix_ = X
elif self.affinity == "euclidean":
2 changes: 1 addition & 1 deletion sklearn/cluster/bicluster.py
@@ -115,7 +115,7 @@ def fit(self, X, y=None):
y : Ignored

"""
X = check_array(X, accept_sparse='csr', dtype=np.float64)
X = self._validate_X(X, accept_sparse='csr', dtype=np.float64)
self._check_parameters()
self._fit(X)
return self
2 changes: 1 addition & 1 deletion sklearn/cluster/birch.py
@@ -445,7 +445,7 @@ def fit(self, X, y=None):
return self._fit(X)

def _fit(self, X):
X = check_array(X, accept_sparse='csr', copy=self.copy)
X = self._validate_X(X, accept_sparse='csr', copy=self.copy)
threshold = self.threshold
branching_factor = self.branching_factor

2 changes: 1 addition & 1 deletion sklearn/cluster/dbscan_.py
@@ -306,7 +306,7 @@ def fit(self, X, y=None, sample_weight=None):
self

"""
X = check_array(X, accept_sparse='csr')
X = self._validate_X(X, accept_sparse='csr')

if not self.eps > 0.0:
raise ValueError("eps must be positive.")
13 changes: 9 additions & 4 deletions sklearn/cluster/hierarchical.py
@@ -790,7 +790,7 @@ def fit(self, X, y=None):
-------
self
"""
X = check_array(X, ensure_min_samples=2, estimator=self)
X = self._validate_X(X, ensure_min_samples=2, estimator=self)
memory = check_memory(self.memory)

if self.n_clusters is not None and self.n_clusters <= 0:
@@ -1034,9 +1034,14 @@ def fit(self, X, y=None, **params):
-------
self
"""
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
ensure_min_features=2, estimator=self)
return AgglomerativeClustering.fit(self, X.T, **params)
X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'],
ensure_min_features=2, estimator=self)
n_features_in_ = self.n_features_in_
AgglomerativeClustering.fit(self, X.T, **params)
# Need to restore n_features_in_ attribute that was overridden in
# AgglomerativeClustering since we passed it X.T.
self.n_features_in_ = n_features_in_
return self

@property
def fit_predict(self):
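To illustrate why the attribute has to be restored (a sketch based on the change above): FeatureAgglomeration clusters features rather than samples, so it fits AgglomerativeClustering on X.T, which would otherwise record the sample count as n_features_in_.

    import numpy as np
    from sklearn.cluster import FeatureAgglomeration

    X = np.random.rand(5, 3)  # 5 samples, 3 features
    agglo = FeatureAgglomeration(n_clusters=2).fit(X)
    # AgglomerativeClustering.fit saw X.T (shape (3, 5)) and set
    # n_features_in_ to 5; the override above restores the correct value.
    assert agglo.n_features_in_ == 3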
9 changes: 5 additions & 4 deletions sklearn/cluster/k_means_.py
@@ -852,8 +852,9 @@ def fit(self, X, y=None, sample_weight=None):

# avoid forcing order when copy_x=False
order = "C" if self.copy_x else None
X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32],
order=order, copy=self.copy_x)
X = self._validate_X(X, accept_sparse='csr',
dtype=[np.float64, np.float32],
order=order, copy=self.copy_x)
# verify that the number of samples given is larger than k
if _num_samples(X) < self.n_clusters:
raise ValueError("n_samples=%d should be >= n_clusters=%d" % (
@@ -1497,8 +1498,8 @@ def fit(self, X, y=None, sample_weight=None):

"""
random_state = check_random_state(self.random_state)
X = check_array(X, accept_sparse="csr", order='C',
dtype=[np.float64, np.float32])
X = self._validate_X(X, accept_sparse="csr", order='C',
dtype=[np.float64, np.float32])
n_samples, n_features = X.shape
if n_samples < self.n_clusters:
raise ValueError("n_samples=%d should be >= n_clusters=%d"
2 changes: 1 addition & 1 deletion sklearn/cluster/mean_shift_.py
@@ -414,7 +414,7 @@ def fit(self, X, y=None):
y : Ignored

"""
X = check_array(X)
X = self._validate_X(X)
self.cluster_centers_, self.labels_ = \
mean_shift(X, bandwidth=self.bandwidth, seeds=self.seeds,
min_bin_freq=self.min_bin_freq,
2 changes: 1 addition & 1 deletion sklearn/cluster/optics_.py
@@ -233,7 +233,7 @@ def fit(self, X, y=None):
self : instance of OPTICS
The instance.
"""
X = check_array(X, dtype=np.float)
X = self._validate_X(X, dtype=np.float)

if self.cluster_method not in ['dbscan', 'xi']:
raise ValueError("cluster_method should be one of"
4 changes: 2 additions & 2 deletions sklearn/cluster/spectral.py
@@ -474,8 +474,8 @@ def fit(self, X, y=None):
self

"""
X = check_array(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=np.float64, ensure_min_samples=2)
X = self._validate_X(X, accept_sparse=['csr', 'csc', 'coo'],
dtype=np.float64, ensure_min_samples=2)
allow_squared = self.affinity in ["precomputed",
"precomputed_nearest_neighbors"]
if X.shape[0] == X.shape[1] and not allow_squared:
11 changes: 11 additions & 0 deletions sklearn/cluster/tests/test_bicluster.py
@@ -256,3 +256,14 @@ def test_wrong_shape():
data = np.arange(27).reshape((3, 3, 3))
with pytest.raises(ValueError):
model.fit(data)


@pytest.mark.parametrize('est',
(SpectralBiclustering(), SpectralCoclustering()))
def test_n_features_in_(est):

X, _, _ = make_biclusters((3, 3), 3, random_state=0)

assert not hasattr(est, 'n_features_in_')
est.fit(X)
assert est.n_features_in_ == 3
3 changes: 3 additions & 0 deletions sklearn/compose/_column_transformer.py
@@ -506,6 +506,8 @@ def fit_transform(self, X, y=None):
else:
self._feature_names_in = None
X = _check_X(X)
# set n_features_in_ attribute
self._validate_n_features(X, check_n_features=False)
self._validate_transformers()
self._validate_column_callables(X)
self._validate_remainder(X)
@@ -579,6 +581,7 @@ def transform(self, X):
'and for transform when using the '
'remainder keyword')

# TODO: also call _validate_n_features(check_n_features=True) in 0.24
self._validate_features(X.shape[1], X_feature_names)
Xs = self._fit_transform(X, None, _transform_one, fitted=True)
self._validate_output(Xs)
15 changes: 15 additions & 0 deletions sklearn/compose/_target.py
@@ -10,6 +10,7 @@
from ..utils.validation import check_is_fitted
from ..utils import check_array, safe_indexing
from ..preprocessing import FunctionTransformer
from ..exceptions import NotFittedError

__all__ = ['TransformedTargetRegressor']

@@ -234,3 +235,17 @@ def predict(self, X):

def _more_tags(self):
return {'poor_score': True, 'no_validation': True}

@property
def n_features_in_(self):
# For consistency with other estimators we raise an AttributeError so
# that hasattr() fails if the estimator isn't fitted.
try:
check_is_fitted(self)
except NotFittedError as nfe:
raise AttributeError(
"{} object has no n_features_in_ attribute."
.format(self.__class__.__name__)
) from nfe

return self.regressor_.n_features_in_
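A quick sketch of the resulting behavior, assuming the wrapped regressor (here LinearRegression) has adopted the new validation helpers: the raising getter makes hasattr() return False until fit, after which the value is delegated to the fitted regressor.

    from sklearn.compose import TransformedTargetRegressor
    from sklearn.linear_model import LinearRegression

    tt = TransformedTargetRegressor(regressor=LinearRegression())
    assert not hasattr(tt, 'n_features_in_')  # getter raises AttributeError

    tt.fit([[1], [2], [3]], [1, 2, 3])
    assert tt.n_features_in_ == 1  # taken from self.regressor_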
12 changes: 12 additions & 0 deletions sklearn/compose/tests/test_column_transformer.py
@@ -1180,3 +1180,15 @@ def test_column_transformer_mask_indexing(array_type):
)
X_trans = column_transformer.fit_transform(X)
assert X_trans.shape == (3, 2)


def test_n_features_in():
# make sure n_features_in_ reflects the number of features in the input
# passed to the ColumnTransformer itself.

X = [[1, 2], [3, 4], [5, 6]]
ct = ColumnTransformer([('a', DoubleTrans(), [0]),
('b', DoubleTrans(), [1])])
assert not hasattr(ct, 'n_features_in_')
ct.fit(X)
assert ct.n_features_in_ == 2
2 changes: 1 addition & 1 deletion sklearn/covariance/empirical_covariance_.py
@@ -191,7 +191,7 @@ def fit(self, X, y=None):
self : object

"""
X = check_array(X)
X = self._validate_X(X)
if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
else:
6 changes: 3 additions & 3 deletions sklearn/covariance/graph_lasso_.py
@@ -378,8 +378,8 @@ def fit(self, X, y=None):
y : (ignored)
"""
# Covariance does not make sense for a single feature
X = check_array(X, ensure_min_features=2, ensure_min_samples=2,
estimator=self)
X = self._validate_X(X, ensure_min_features=2, ensure_min_samples=2,
estimator=self)

if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
@@ -645,7 +645,7 @@ def fit(self, X, y=None):
y : (ignored)
"""
# Covariance does not make sense for a single feature
X = check_array(X, ensure_min_features=2, estimator=self)
X = self._validate_X(X, ensure_min_features=2, estimator=self)
if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
else:
2 changes: 1 addition & 1 deletion sklearn/covariance/robust_covariance.py
@@ -636,7 +636,7 @@ def fit(self, X, y=None):
self : object

"""
X = check_array(X, ensure_min_samples=2, estimator='MinCovDet')
X = self._validate_X(X, ensure_min_samples=2, estimator='MinCovDet')
random_state = check_random_state(self.random_state)
n_samples, n_features = X.shape
# check that the empirical covariance is full rank
6 changes: 3 additions & 3 deletions sklearn/covariance/shrunk_covariance_.py
@@ -143,7 +143,7 @@ def fit(self, X, y=None):
self : object

"""
X = check_array(X)
X = self._validate_X(X)
# Not calling the parent object to fit, to avoid a potential
# matrix inversion when setting the precision
if self.assume_centered:
@@ -419,7 +419,7 @@ def fit(self, X, y=None):
"""
# Not calling the parent object to fit, to avoid computing the
# covariance matrix (and potentially the precision)
X = check_array(X)
X = self._validate_X(X)
if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
else:
@@ -572,7 +572,7 @@ def fit(self, X, y=None):
self : object

"""
X = check_array(X)
X = self._validate_X(X)
# Not calling the parent object to fit, to avoid computing the
# covariance matrix (and potentially the precision)
if self.assume_centered:
8 changes: 4 additions & 4 deletions sklearn/cross_decomposition/pls_.py
@@ -252,8 +252,8 @@ def fit(self, X, Y):

# copy since this will contain the residuals (deflated) matrices
check_consistent_length(X, Y)
X = check_array(X, dtype=np.float64, copy=self.copy,
ensure_min_samples=2)
X = self._validate_X(X, dtype=np.float64, copy=self.copy,
ensure_min_samples=2)
Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
if Y.ndim == 1:
Y = Y.reshape(-1, 1)
@@ -828,8 +828,8 @@ def fit(self, X, Y):
"""
# copy since this will contain the centered data
check_consistent_length(X, Y)
X = check_array(X, dtype=np.float64, copy=self.copy,
ensure_min_samples=2)
X = self._validate_X(X, dtype=np.float64, copy=self.copy,
ensure_min_samples=2)
Y = check_array(Y, dtype=np.float64, copy=self.copy, ensure_2d=False)
if Y.ndim == 1:
Y = Y.reshape(-1, 1)
8 changes: 6 additions & 2 deletions sklearn/decomposition/dict_learning.py
@@ -1044,6 +1044,10 @@ def fit(self, X, y=None):
"""
return self

@property
def n_features_in_(self):
return self.components_.shape[1]


class DictionaryLearning(SparseCodingMixin, BaseEstimator):
"""Dictionary learning
@@ -1217,7 +1221,7 @@ def fit(self, X, y=None):
Returns the object itself
"""
random_state = check_random_state(self.random_state)
X = check_array(X)
X = self._validate_X(X)
if self.n_components is None:
n_components = X.shape[1]
else:
@@ -1423,7 +1427,7 @@ def fit(self, X, y=None):
Returns the instance itself.
"""
random_state = check_random_state(self.random_state)
X = check_array(X)
X = self._validate_X(X)

U, (A, B), self.n_iter_ = dict_learning_online(
X, self.n_components, self.alpha,
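Since SparseCoder receives its dictionary at construction (components_ is set in __init__ at this point in the codebase, per the commits above), the n_features_in_ property is available without calling fit; a minimal sketch:

    import numpy as np
    from sklearn.decomposition import SparseCoder

    D = np.eye(4)  # dictionary: 4 atoms, each with 4 features
    coder = SparseCoder(dictionary=D)
    assert coder.n_features_in_ == 4  # components_.shape[1]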
2 changes: 1 addition & 1 deletion sklearn/decomposition/factor_analysis.py
@@ -167,7 +167,7 @@ def fit(self, X, y=None):
-------
self
"""
X = check_array(X, copy=self.copy, dtype=np.float64)
X = self._validate_X(X, copy=self.copy, dtype=np.float64)

n_samples, n_features = X.shape
n_components = self.n_components
5 changes: 5 additions & 0 deletions sklearn/decomposition/fastica_.py
@@ -501,6 +501,11 @@ def _fit(self, X, compute_sources=False):
-------
X_new : array-like, shape (n_samples, n_components)
"""

# This validates twice, but there is no clean way to avoid validating
# again in fastica(). Please see issue 14897.
self._validate_X(X, copy=self.whiten, dtype=FLOAT_DTYPES,
                 ensure_min_samples=2)
fun_args = {} if self.fun_args is None else self.fun_args
whitening, unmixing, sources, X_mean, self.n_iter_ = fastica(
X=X, n_components=self.n_components, algorithm=self.algorithm,
4 changes: 2 additions & 2 deletions sklearn/decomposition/incremental_pca.py
@@ -192,8 +192,8 @@ def fit(self, X, y=None):
self.singular_values_ = None
self.noise_variance_ = None

X = check_array(X, accept_sparse=['csr', 'csc', 'lil'],
copy=self.copy, dtype=[np.float64, np.float32])
X = self._validate_X(X, accept_sparse=['csr', 'csc', 'lil'],
copy=self.copy, dtype=[np.float64, np.float32])
n_samples, n_features = X.shape

if self.batch_size is None:
2 changes: 1 addition & 1 deletion sklearn/decomposition/kernel_pca.py
@@ -271,7 +271,7 @@ def fit(self, X, y=None):
self : object
Returns the instance itself.
"""
X = check_array(X, accept_sparse='csr', copy=self.copy_X)
X = self._validate_X(X, accept_sparse='csr', copy=self.copy_X)
self._centerer = KernelCenterer()
K = self._get_kernel(X)
self._fit_transform(K)