Skip to content

DOC Ensures numpydoc validation for CountVectorizer and TfidfVectorizer/Transformer #20403

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions maint_tools/test_docstrings.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
"CalibratedClassifierCV",
"ClassifierChain",
"ColumnTransformer",
"CountVectorizer",
"DecisionTreeRegressor",
"DictVectorizer",
"DictionaryLearning",
Expand Down Expand Up @@ -140,7 +139,6 @@
"StackingClassifier",
"StackingRegressor",
"TSNE",
"TfidfVectorizer",
"TheilSenRegressor",
"TransformedTargetRegressor",
"TruncatedSVD",
Expand Down
96 changes: 60 additions & 36 deletions sklearn/feature_extraction/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,8 +407,10 @@ def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
return "error"

def build_analyzer(self):
"""Return a callable that handles preprocessing, tokenization
and n-grams generation.
"""Return a callable to process input data.

The callable handles preprocessing, tokenization, and
n-grams generation.

Returns
-------
Expand Down Expand Up @@ -862,7 +864,7 @@ def _document_frequency(X):


class CountVectorizer(_VectorizerMixin, BaseEstimator):
r"""Convert a collection of text documents to a matrix of token counts
r"""Convert a collection of text documents to a matrix of token counts.

This implementation produces a sparse representation of the counts using
scipy.sparse.csr_matrix.
Expand All @@ -886,7 +888,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
- If `'content'`, the input is expected to be a sequence of items that
can be of type string or byte.

encoding : string, default='utf-8'
encoding : str, default='utf-8'
If bytes or files are given to analyze, this encoding is used to
decode.

Expand Down Expand Up @@ -1021,6 +1023,20 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):

This is only available if no vocabulary was given.

See Also
--------
HashingVectorizer : Convert a collection of text documents to a
matrix of token occurrences.

TfidfVectorizer : Convert a collection of raw documents to a matrix
of TF-IDF features.

Notes
-----
The ``stop_words_`` attribute can get large and increase the model size
when pickling. This attribute is provided only for introspection and can
be safely removed using delattr or set to None before pickling.

Examples
--------
>>> from sklearn.feature_extraction.text import CountVectorizer
Expand Down Expand Up @@ -1050,16 +1066,6 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
[0 1 0 1 0 1 0 1 0 0 1 0 0]
[1 0 0 1 0 0 0 0 1 1 0 1 0]
[0 0 1 0 1 0 1 0 0 0 0 0 1]]

See Also
--------
HashingVectorizer, TfidfVectorizer

Notes
-----
The ``stop_words_`` attribute can get large and increase the model size
when pickling. This attribute is provided only for introspection and can
be safely removed using delattr or set to None before pickling.
"""

def __init__(
Expand Down Expand Up @@ -1246,11 +1252,15 @@ def fit(self, raw_documents, y=None):
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
An iterable which generates either str, unicode or file objects.

y : None
This parameter is ignored.

Returns
-------
self
self : object
Fitted vectorizer.
"""
self._warn_for_unused_params()
self.fit_transform(raw_documents)
Expand All @@ -1265,7 +1275,10 @@ def fit_transform(self, raw_documents, y=None):
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
An iterable which generates either str, unicode or file objects.

y : None
This parameter is ignored.

Returns
-------
Expand Down Expand Up @@ -1321,7 +1334,7 @@ def transform(self, raw_documents):
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
An iterable which generates either str, unicode or file objects.

Returns
-------
Expand Down Expand Up @@ -1442,11 +1455,12 @@ class TfidfTransformer(TransformerMixin, BaseEstimator):
----------
norm : {'l1', 'l2'}, default='l2'
Each output row will have unit norm, either:
* 'l2': Sum of squares of vector elements is 1. The cosine
similarity between two vectors is their dot product when l2 norm has
been applied.
* 'l1': Sum of absolute values of vector elements is 1.
See :func:`preprocessing.normalize`.

- 'l2': Sum of squares of vector elements is 1. The cosine
similarity between two vectors is their dot product when l2 norm has
been applied.
- 'l1': Sum of absolute values of vector elements is 1.
See :func:`preprocessing.normalize`.

use_idf : bool, default=True
Enable inverse-document-frequency reweighting.
Expand Down Expand Up @@ -1484,7 +1498,6 @@ class TfidfTransformer(TransformerMixin, BaseEstimator):

References
----------

.. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern
Information Retrieval. Addison Wesley, pp. 68-74.

Expand Down Expand Up @@ -1610,13 +1623,11 @@ def transform(self, X, copy=True):

@property
def idf_(self):
"""Return the inverse document frecuency (IDF) vector.
"""Inverse document frequency vector, only defined if `use_idf=True`.

Returns
-------
idf_ : ndarray of shape (n_features,)
The inverse document frequency (IDF) vector; only defined
if `use_idf` is True.
ndarray of shape (n_features,)
"""
# if _idf_diag is not set, this will raise an attribute error,
# which means hasattr(self, "idf_") is False
Expand Down Expand Up @@ -1772,11 +1783,12 @@ class TfidfVectorizer(CountVectorizer):

norm : {'l1', 'l2'}, default='l2'
Each output row will have unit norm, either:
* 'l2': Sum of squares of vector elements is 1. The cosine
similarity between two vectors is their dot product when l2 norm has
been applied.
* 'l1': Sum of absolute values of vector elements is 1.
See :func:`preprocessing.normalize`.

- 'l2': Sum of squares of vector elements is 1. The cosine
similarity between two vectors is their dot product when l2 norm has
been applied.
- 'l1': Sum of absolute values of vector elements is 1.
See :func:`preprocessing.normalize`.

use_idf : bool, default=True
Enable inverse-document-frequency reweighting.
Expand Down Expand Up @@ -1896,6 +1908,7 @@ def __init__(

@property
def norm(self):
"""Norm of each row output, can be either "l1" or "l2"."""
return self._tfidf.norm

@norm.setter
Expand All @@ -1904,6 +1917,7 @@ def norm(self, value):

@property
def use_idf(self):
"""Whether or not IDF re-weighting is used."""
return self._tfidf.use_idf

@use_idf.setter
Expand All @@ -1912,6 +1926,7 @@ def use_idf(self, value):

@property
def smooth_idf(self):
"""Whether or not IDF weights are smoothed."""
return self._tfidf.smooth_idf

@smooth_idf.setter
Expand All @@ -1920,6 +1935,7 @@ def smooth_idf(self, value):

@property
def sublinear_tf(self):
"""Whether or not sublinear TF scaling is applied."""
return self._tfidf.sublinear_tf

@sublinear_tf.setter
Expand All @@ -1928,6 +1944,12 @@ def sublinear_tf(self, value):

@property
def idf_(self):
"""Inverse document frequency vector, only defined if `use_idf=True`.

Returns
-------
ndarray of shape (n_features,)
"""
return self._tfidf.idf_

@idf_.setter
Expand Down Expand Up @@ -1955,7 +1977,8 @@ def fit(self, raw_documents, y=None):
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
An iterable which generates either str, unicode or file objects.

y : None
This parameter is not needed to compute tfidf.

Expand All @@ -1979,7 +2002,8 @@ def fit_transform(self, raw_documents, y=None):
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
An iterable which generates either str, unicode or file objects.

y : None
This parameter is ignored.

Expand All @@ -2004,7 +2028,7 @@ def transform(self, raw_documents):
Parameters
----------
raw_documents : iterable
An iterable which yields either str, unicode or file objects.
An iterable which generates either str, unicode or file objects.

Returns
-------
Expand Down