Commit 25a88b4

hailey0huong authored and rth committed

DOC docstrings validation in TfidfVectorizer (#15509)

1 parent 004426a commit 25a88b4

File tree

1 file changed

+84 -43 lines changed

sklearn/feature_extraction/text.py (+84 -43)
@@ -123,7 +123,7 @@ def strip_accents_unicode(s):
     s : string
         The string to strip

-    See also
+    See Also
     --------
     strip_accents_ascii
         Remove accentuated char for any unicode symbol that has a direct
@@ -150,7 +150,7 @@ def strip_accents_ascii(s):
     s : string
         The string to strip

-    See also
+    See Also
     --------
     strip_accents_unicode
         Remove accentuated char for any unicode symbol.
@@ -190,14 +190,19 @@ class _VectorizerMixin:
     _white_spaces = re.compile(r"\s\s+")

     def decode(self, doc):
-        """Decode the input into a string of unicode symbols
+        """Decode the input into a string of unicode symbols.

         The decoding strategy depends on the vectorizer parameters.

         Parameters
         ----------
-        doc : string
-            The string to decode
+        doc : str
+            The string to decode.
+
+        Returns
+        -------
+        doc: str
+            A string of unicode symbols.
         """
         if self.input == 'filename':
             with open(doc, 'rb') as fh:
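For context, a minimal doctest-style sketch of the behavior the new Returns section documents (illustrative only, not part of this diff; assumes the default input='content' and encoding='utf-8'):

>>> from sklearn.feature_extraction.text import CountVectorizer
>>> CountVectorizer().decode(b'caf\xc3\xa9')  # bytes are decoded with self.encoding
'café'
>>> CountVectorizer().decode('already unicode')  # str input passes through unchanged
'already unicode'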
@@ -298,7 +303,13 @@ def _char_wb_ngrams(self, text_document):
         return ngrams

     def build_preprocessor(self):
-        """Return a function to preprocess the text before tokenization"""
+        """Return a function to preprocess the text before tokenization.
+
+        Returns
+        -------
+        preprocessor: callable
+            A function to preprocess the text before tokenization.
+        """
         if self.preprocessor is not None:
             return self.preprocessor

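A quick sketch of what the returned callable does under default settings, i.e. lowercase=True and no accent stripping (illustrative, not part of the diff):

>>> from sklearn.feature_extraction.text import CountVectorizer
>>> preprocess = CountVectorizer().build_preprocessor()
>>> preprocess('Hello World!')  # default preprocessing only lowercases
'hello world!'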
@@ -320,14 +331,26 @@ def build_preprocessor(self):
         )

     def build_tokenizer(self):
-        """Return a function that splits a string into a sequence of tokens"""
+        """Return a function that splits a string into a sequence of tokens.
+
+        Returns
+        -------
+        tokenizer: callable
+            A function to split a string into a sequence of tokens.
+        """
         if self.tokenizer is not None:
             return self.tokenizer
         token_pattern = re.compile(self.token_pattern)
         return token_pattern.findall

     def get_stop_words(self):
-        """Build or fetch the effective stop words list"""
+        """Build or fetch the effective stop words list.
+
+        Returns
+        -------
+        stop_words: list or None
+            A list of stop words.
+        """
         return _check_stop_list(self.stop_words)

     def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
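A sketch of the tokenizer callable documented above, under the default token_pattern, which keeps tokens of two or more alphanumeric characters (illustrative, not part of the diff):

>>> from sklearn.feature_extraction.text import CountVectorizer
>>> tokenize = CountVectorizer().build_tokenizer()
>>> tokenize('a sequence of tokens')  # single-character 'a' is dropped
['sequence', 'of', 'tokens']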
@@ -391,8 +414,13 @@ def _validate_custom_analyzer(self):

     def build_analyzer(self):
         """Return a callable that handles preprocessing, tokenization
-
         and n-grams generation.
+
+        Returns
+        -------
+        analyzer: callable
+            A function to handle preprocessing, tokenization
+            and n-grams generation.
         """

         if callable(self.analyzer):
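The analyzer callable combines all three steps named in the docstring; a small sketch with word bigrams enabled (illustrative, not part of the diff):

>>> from sklearn.feature_extraction.text import CountVectorizer
>>> analyze = CountVectorizer(ngram_range=(1, 2)).build_analyzer()
>>> analyze('tf idf counts')  # unigrams followed by bigrams
['tf', 'idf', 'counts', 'tf idf', 'idf counts']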
@@ -667,11 +695,12 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator):
     >>> print(X.shape)
     (4, 16)

-    See also
+    See Also
     --------
     CountVectorizer, TfidfVectorizer

     """
+
     def __init__(self, input='content', encoding='utf-8',
                  decode_error='strict', strip_accents=None,
                  lowercase=True, preprocessor=None, tokenizer=None,
@@ -982,7 +1011,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
     [1 0 0 1 0 0 0 0 1 1 0 1 0]
     [0 0 1 0 1 0 1 0 0 0 0 0 1]]

-    See also
+    See Also
     --------
     HashingVectorizer, TfidfVectorizer

@@ -1249,6 +1278,7 @@ def inverse_transform(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Document-term matrix.

         Returns
         -------
@@ -1274,7 +1304,13 @@ def inverse_transform(self, X):
                 for i in range(n_samples)]

     def get_feature_names(self):
-        """Array mapping from feature integer indices to feature name"""
+        """Array mapping from feature integer indices to feature name.
+
+        Returns
+        -------
+        feature_names : list
+            A list of feature names.
+        """

         self._check_vocabulary()

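The new Returns section can be seen in action with a tiny fitted vocabulary; a doctest-style sketch (corpus invented for illustration, not part of the diff):

>>> from sklearn.feature_extraction.text import CountVectorizer
>>> vectorizer = CountVectorizer().fit(['the cat sat', 'the dog ran'])
>>> vectorizer.get_feature_names()  # index i holds the term for feature column i
['cat', 'dog', 'ran', 'sat', 'the']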
@@ -1504,7 +1540,7 @@ class TfidfVectorizer(CountVectorizer):

     Parameters
     ----------
-    input : string {'filename', 'file', 'content'}
+    input : str {'filename', 'file', 'content'}
         If 'filename', the sequence passed as an argument to fit is
         expected to be a list of filenames that need reading to fetch
         the raw content to analyze.
@@ -1515,7 +1551,7 @@ class TfidfVectorizer(CountVectorizer):
         Otherwise the input is expected to be a sequence of items that
         can be of type string or byte.

-    encoding : string, 'utf-8' by default.
+    encoding : str, default='utf-8'
         If bytes or files are given to analyze, this encoding is used to
         decode.

@@ -1536,7 +1572,7 @@ class TfidfVectorizer(CountVectorizer):
         Both 'ascii' and 'unicode' use NFKD normalization from
         :func:`unicodedata.normalize`.

-    lowercase : boolean (default=True)
+    lowercase : bool (default=True)
         Convert all characters to lowercase before tokenizing.

     preprocessor : callable or None (default=None)
@@ -1549,7 +1585,7 @@ class TfidfVectorizer(CountVectorizer):
         preprocessing and n-grams generation steps.
         Only applies if ``analyzer == 'word'``.

-    analyzer : string, {'word', 'char', 'char_wb'} or callable
+    analyzer : str, {'word', 'char', 'char_wb'} or callable
         Whether the feature should be made of word or character n-grams.
         Option 'char_wb' creates character n-grams only from text inside
         word boundaries; n-grams at the edges of words are padded with space.
@@ -1563,7 +1599,7 @@ class TfidfVectorizer(CountVectorizer):
         first read from the file and then passed to the given callable
         analyzer.

-    stop_words : string {'english'}, list, or None (default=None)
+    stop_words : str {'english'}, list, or None (default=None)
         If a string, it is passed to _check_stop_list and the appropriate stop
         list is returned. 'english' is currently the only supported string
         value.
@@ -1578,7 +1614,7 @@ class TfidfVectorizer(CountVectorizer):
         in the range [0.7, 1.0) to automatically detect and filter stop
         words based on intra corpus document frequency of terms.

-    token_pattern : string
+    token_pattern : str
         Regular expression denoting what constitutes a "token", only used
         if ``analyzer == 'word'``. The default regexp selects tokens of 2
         or more alphanumeric characters (punctuation is completely ignored
@@ -1619,10 +1655,10 @@ class TfidfVectorizer(CountVectorizer):
         indices in the feature matrix, or an iterable over terms. If not
         given, a vocabulary is determined from the input documents.

-    binary : boolean (default=False)
+    binary : bool (default=False)
         If True, all non-zero term counts are set to 1. This does not mean
         outputs will have only 0/1 values, only that the tf term in tf-idf
-        is binary. (Set idf and normalization to False to get 0/1 outputs.)
+        is binary. (Set idf and normalization to False to get 0/1 outputs).

     dtype : type, optional (default=float64)
         Type of the matrix returned by fit_transform() or transform().
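The 0/1-outputs parenthetical above can be checked directly; a small doctest-style sketch, not part of the diff (corpus invented for illustration):

>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> binary_vec = TfidfVectorizer(binary=True, use_idf=False, norm=None)
>>> binary_vec.fit_transform(['the cat', 'the the the']).toarray()
array([[1., 1.],
       [0., 1.]])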
@@ -1633,25 +1669,25 @@ class TfidfVectorizer(CountVectorizer):
         similarity between two vectors is their dot product when l2 norm has
         been applied.
         * 'l1': Sum of absolute values of vector elements is 1.
-        See :func:`preprocessing.normalize`
+        See :func:`preprocessing.normalize`.

-    use_idf : boolean (default=True)
+    use_idf : bool (default=True)
         Enable inverse-document-frequency reweighting.

-    smooth_idf : boolean (default=True)
+    smooth_idf : bool (default=True)
         Smooth idf weights by adding one to document frequencies, as if an
         extra document was seen containing every term in the collection
         exactly once. Prevents zero divisions.

-    sublinear_tf : boolean (default=False)
+    sublinear_tf : bool (default=False)
         Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

     Attributes
     ----------
     vocabulary_ : dict
         A mapping of terms to feature indices.

-    fixed_vocabulary_: boolean
+    fixed_vocabulary_: bool
         True if a fixed vocabulary of term to indices mapping
         is provided by the user

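As context for the smooth_idf wording above: with smoothing enabled, scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1. A minimal sketch of that formula (the function name is ours, not the library's):

import numpy as np

def smoothed_idf(n_documents, document_frequency):
    # "Adding one to document frequencies, as if an extra document was
    # seen containing every term exactly once" prevents zero divisions;
    # the trailing +1 keeps terms occurring in every document from
    # being ignored entirely.
    return np.log((1 + n_documents) / (1 + document_frequency)) + 1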
@@ -1668,6 +1704,19 @@ class TfidfVectorizer(CountVectorizer):

     This is only available if no vocabulary was given.

+    See Also
+    --------
+    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
+
+    TfidfTransformer : Performs the TF-IDF transformation from a provided
+        matrix of counts.
+
+    Notes
+    -----
+    The ``stop_words_`` attribute can get large and increase the model size
+    when pickling. This attribute is provided only for introspection and can
+    be safely removed using delattr or set to None before pickling.
+
     Examples
     --------
     >>> from sklearn.feature_extraction.text import TfidfVectorizer
@@ -1683,19 +1732,6 @@ class TfidfVectorizer(CountVectorizer):
     ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
     >>> print(X.shape)
     (4, 9)
-
-    See also
-    --------
-    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
-
-    TfidfTransformer : Performs the TF-IDF transformation from a provided
-        matrix of counts.
-
-    Notes
-    -----
-    The ``stop_words_`` attribute can get large and increase the model size
-    when pickling. This attribute is provided only for introspection and can
-    be safely removed using delattr or set to None before pickling.
     """

     def __init__(self, input='content', encoding='utf-8',
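The relocated Notes section can be acted on directly; a hedged sketch of shrinking a fitted vectorizer before pickling (variable names are ours, corpus invented):

>>> import pickle
>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> fitted = TfidfVectorizer(max_df=0.7).fit(['one doc', 'another doc'])
>>> delattr(fitted, 'stop_words_')  # introspection-only attribute, safe to drop
>>> payload = pickle.dumps(fitted)  # transform() still works after unpickling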
@@ -1782,11 +1818,14 @@ def fit(self, raw_documents, y=None):
         Parameters
         ----------
         raw_documents : iterable
-            an iterable which yields either str, unicode or file objects
+            An iterable which yields either str, unicode or file objects.
+        y : None
+            This parameter is not needed to compute tfidf.

         Returns
         -------
-        self : TfidfVectorizer
+        self : object
+            Fitted vectorizer.
         """
         self._check_params()
         self._warn_for_unused_params()
@@ -1803,7 +1842,9 @@ def fit_transform(self, raw_documents, y=None):
         Parameters
         ----------
         raw_documents : iterable
-            an iterable which yields either str, unicode or file objects
+            An iterable which yields either str, unicode or file objects.
+        y : None
+            This parameter is ignored.

         Returns
         -------
@@ -1826,9 +1867,9 @@ def transform(self, raw_documents, copy="deprecated"):
         Parameters
         ----------
         raw_documents : iterable
-            an iterable which yields either str, unicode or file objects
+            An iterable which yields either str, unicode or file objects.

-        copy : boolean, default True
+        copy : bool, default True
             Whether to copy X and operate on the copy or perform in-place
             operations.

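Tying the three updated method docstrings together, a short usage sketch (corpus invented for illustration, not part of the diff):

>>> from sklearn.feature_extraction.text import TfidfVectorizer
>>> corpus = ['This is the first document.', 'This is the second one.']
>>> vectorizer = TfidfVectorizer()
>>> X = vectorizer.fit_transform(corpus)  # equivalent to fit(corpus) then transform(corpus)
>>> X.shape == vectorizer.transform(corpus).shape
True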