Skip to content

MNT add n_features_in_ through the feature_extraction module #20180

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions sklearn/feature_extraction/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,15 +504,6 @@ def test_vectorizer():
with pytest.raises(ValueError):
t3.transform(counts_train)

# test idf transform with incompatible n_features
X = [[1, 1, 5],
[1, 1, 0]]
t3.fit(X)
X_incompt = [[1, 3],
[1, 3]]
with pytest.raises(ValueError):
t3.transform(X_incompt)
Comment on lines -507 to -514
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This incompatible-`n_features` check is now covered by the common estimator tests.


# L1-normalized term frequencies sum to one
assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

Expand Down
15 changes: 8 additions & 7 deletions sklearn/feature_extraction/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,11 @@ class TfidfTransformer(TransformerMixin, BaseEstimator):

.. versionadded:: 0.20

n_features_in_ : int
Number of features seen during :term:`fit`.

.. versionadded:: 1.0

Examples
--------
>>> from sklearn.feature_extraction.text import TfidfTransformer
Expand Down Expand Up @@ -1436,7 +1441,7 @@ def fit(self, X, y=None):
X : sparse matrix of shape (n_samples, n_features)
A matrix of term/token counts.
"""
X = check_array(X, accept_sparse=('csr', 'csc'))
X = self._validate_data(X, accept_sparse=('csr', 'csc'))
if not sp.issparse(X):
X = sp.csr_matrix(X)
dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64
Expand Down Expand Up @@ -1476,7 +1481,8 @@ def transform(self, X, copy=True):
-------
vectors : sparse matrix of shape (n_samples, n_features)
"""
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
X = self._validate_data(X, accept_sparse='csr',
dtype=FLOAT_DTYPES, copy=copy, reset=False)
if not sp.issparse(X):
X = sp.csr_matrix(X, dtype=np.float64)

Expand All @@ -1493,11 +1499,6 @@ def transform(self, X, copy=True):
check_is_fitted(self, attributes=["idf_"],
msg='idf vector is not fitted')

expected_n_features = self._idf_diag.shape[0]
if n_features != expected_n_features:
raise ValueError("Input has n_features=%d while the model"
" has been trained with n_features=%d" % (
n_features, expected_n_features))
# *= doesn't work
X = X * self._idf_diag

Expand Down
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,6 @@ def test_search_cv(estimator, check, request):
# check_classifiers_train would need to be updated with the error message
N_FEATURES_IN_AFTER_FIT_MODULES_TO_IGNORE = {
'compose',
'feature_extraction',
'model_selection',
'multiclass',
'multioutput',
Expand Down
3 changes: 2 additions & 1 deletion sklearn/utils/estimator_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -3121,7 +3121,8 @@ def check_n_features_in_after_fitting(name, estimator_orig):
# Make sure that n_features_in_ is checked after fitting
tags = _safe_tags(estimator_orig)

if "2darray" not in tags["X_types"] or tags["no_validation"]:
if ("2darray" not in tags["X_types"] and "sparse" not in tags["X_types"] or
tags["no_validation"]):
return

rng = np.random.RandomState(0)
Expand Down