@@ -123,7 +123,7 @@ def strip_accents_unicode(s):
     s : string
         The string to strip

-    See also
+    See Also
     --------
     strip_accents_ascii
         Remove accentuated char for any unicode symbol that has a direct
@@ -150,7 +150,7 @@ def strip_accents_ascii(s):
     s : string
         The string to strip

-    See also
+    See Also
     --------
     strip_accents_unicode
         Remove accentuated char for any unicode symbol.
@@ -190,14 +190,19 @@ class _VectorizerMixin:
     _white_spaces = re.compile(r"\s\s+")

     def decode(self, doc):
-        """Decode the input into a string of unicode symbols
+        """Decode the input into a string of unicode symbols.

         The decoding strategy depends on the vectorizer parameters.

         Parameters
         ----------
-        doc : string
-            The string to decode
+        doc : str
+            The string to decode.
+
+        Returns
+        -------
+        doc: str
+            A string of unicode symbols.
         """
         if self.input == 'filename':
             with open(doc, 'rb') as fh:
@@ -298,7 +303,13 @@ def _char_wb_ngrams(self, text_document):
         return ngrams

     def build_preprocessor(self):
-        """Return a function to preprocess the text before tokenization"""
+        """Return a function to preprocess the text before tokenization.
+
+        Returns
+        -------
+        preprocessor: callable
+            A function to preprocess the text before tokenization.
+        """
         if self.preprocessor is not None:
             return self.preprocessor

@@ -320,14 +331,26 @@ def build_preprocessor(self):
         )

     def build_tokenizer(self):
-        """Return a function that splits a string into a sequence of tokens"""
+        """Return a function that splits a string into a sequence of tokens.
+
+        Returns
+        -------
+        tokenizer: callable
+            A function to split a string into a sequence of tokens.
+        """
         if self.tokenizer is not None:
             return self.tokenizer
         token_pattern = re.compile(self.token_pattern)
         return token_pattern.findall

     def get_stop_words(self):
-        """Build or fetch the effective stop words list"""
+        """Build or fetch the effective stop words list.
+
+        Returns
+        -------
+        stop_words: list or None
+            A list of stop words.
+        """
         return _check_stop_list(self.stop_words)

     def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
@@ -391,8 +414,13 @@ def _validate_custom_analyzer(self):

     def build_analyzer(self):
         """Return a callable that handles preprocessing, tokenization
-
         and n-grams generation.
+
+        Returns
+        -------
+        analyzer: callable
+            A function to handle preprocessing, tokenization
+            and n-grams generation.
         """

         if callable(self.analyzer):
@@ -667,11 +695,12 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator):
     >>> print(X.shape)
     (4, 16)

-    See also
+    See Also
     --------
     CountVectorizer, TfidfVectorizer

     """
+
     def __init__(self, input='content', encoding='utf-8',
                  decode_error='strict', strip_accents=None,
                  lowercase=True, preprocessor=None, tokenizer=None,
@@ -982,7 +1011,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
      [1 0 0 1 0 0 0 0 1 1 0 1 0]
      [0 0 1 0 1 0 1 0 0 0 0 0 1]]

-    See also
+    See Also
     --------
     HashingVectorizer, TfidfVectorizer

@@ -1249,6 +1278,7 @@ def inverse_transform(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Document-term matrix.

         Returns
         -------
@@ -1274,7 +1304,13 @@ def inverse_transform(self, X):
                 for i in range(n_samples)]

     def get_feature_names(self):
-        """Array mapping from feature integer indices to feature name"""
+        """Array mapping from feature integer indices to feature name.
+
+        Returns
+        -------
+        feature_names : list
+            A list of feature names.
+        """

         self._check_vocabulary()

@@ -1504,7 +1540,7 @@ class TfidfVectorizer(CountVectorizer):

     Parameters
     ----------
-    input : string {'filename', 'file', 'content'}
+    input : str {'filename', 'file', 'content'}
         If 'filename', the sequence passed as an argument to fit is
         expected to be a list of filenames that need reading to fetch
         the raw content to analyze.
@@ -1515,7 +1551,7 @@ class TfidfVectorizer(CountVectorizer):
         Otherwise the input is expected to be a sequence of items that
         can be of type string or byte.

-    encoding : string, 'utf-8' by default.
+    encoding : str, default='utf-8'
         If bytes or files are given to analyze, this encoding is used to
         decode.

@@ -1536,7 +1572,7 @@ class TfidfVectorizer(CountVectorizer):
         Both 'ascii' and 'unicode' use NFKD normalization from
         :func:`unicodedata.normalize`.

-    lowercase : boolean (default=True)
+    lowercase : bool (default=True)
         Convert all characters to lowercase before tokenizing.

     preprocessor : callable or None (default=None)
@@ -1549,7 +1585,7 @@ class TfidfVectorizer(CountVectorizer):
         preprocessing and n-grams generation steps.
         Only applies if ``analyzer == 'word'``.

-    analyzer : string, {'word', 'char', 'char_wb'} or callable
+    analyzer : str, {'word', 'char', 'char_wb'} or callable
         Whether the feature should be made of word or character n-grams.
         Option 'char_wb' creates character n-grams only from text inside
         word boundaries; n-grams at the edges of words are padded with space.
@@ -1563,7 +1599,7 @@ class TfidfVectorizer(CountVectorizer):
         first read from the file and then passed to the given callable
         analyzer.

-    stop_words : string {'english'}, list, or None (default=None)
+    stop_words : str {'english'}, list, or None (default=None)
         If a string, it is passed to _check_stop_list and the appropriate stop
         list is returned. 'english' is currently the only supported string
         value.
@@ -1578,7 +1614,7 @@ class TfidfVectorizer(CountVectorizer):
         in the range [0.7, 1.0) to automatically detect and filter stop
         words based on intra corpus document frequency of terms.

-    token_pattern : string
+    token_pattern : str
         Regular expression denoting what constitutes a "token", only used
         if ``analyzer == 'word'``. The default regexp selects tokens of 2
         or more alphanumeric characters (punctuation is completely ignored
@@ -1619,10 +1655,10 @@ class TfidfVectorizer(CountVectorizer):
         indices in the feature matrix, or an iterable over terms. If not
         given, a vocabulary is determined from the input documents.

-    binary : boolean (default=False)
+    binary : bool (default=False)
         If True, all non-zero term counts are set to 1. This does not mean
         outputs will have only 0/1 values, only that the tf term in tf-idf
-        is binary. (Set idf and normalization to False to get 0/1 outputs.)
+        is binary. (Set idf and normalization to False to get 0/1 outputs).

     dtype : type, optional (default=float64)
         Type of the matrix returned by fit_transform() or transform().
@@ -1633,25 +1669,25 @@ class TfidfVectorizer(CountVectorizer):
         similarity between two vectors is their dot product when l2 norm has
         been applied.
         * 'l1': Sum of absolute values of vector elements is 1.
-        See :func:`preprocessing.normalize`
+        See :func:`preprocessing.normalize`.

-    use_idf : boolean (default=True)
+    use_idf : bool (default=True)
         Enable inverse-document-frequency reweighting.

-    smooth_idf : boolean (default=True)
+    smooth_idf : bool (default=True)
         Smooth idf weights by adding one to document frequencies, as if an
         extra document was seen containing every term in the collection
         exactly once. Prevents zero divisions.

-    sublinear_tf : boolean (default=False)
+    sublinear_tf : bool (default=False)
         Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

     Attributes
     ----------
     vocabulary_ : dict
         A mapping of terms to feature indices.

-    fixed_vocabulary_: boolean
+    fixed_vocabulary_: bool
         True if a fixed vocabulary of term to indices mapping
         is provided by the user

@@ -1668,6 +1704,19 @@ class TfidfVectorizer(CountVectorizer):

         This is only available if no vocabulary was given.

+    See Also
+    --------
+    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
+
+    TfidfTransformer : Performs the TF-IDF transformation from a provided
+        matrix of counts.
+
+    Notes
+    -----
+    The ``stop_words_`` attribute can get large and increase the model size
+    when pickling. This attribute is provided only for introspection and can
+    be safely removed using delattr or set to None before pickling.
+
     Examples
     --------
     >>> from sklearn.feature_extraction.text import TfidfVectorizer
@@ -1683,19 +1732,6 @@ class TfidfVectorizer(CountVectorizer):
     ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
     >>> print(X.shape)
     (4, 9)
-
-    See also
-    --------
-    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
-
-    TfidfTransformer : Performs the TF-IDF transformation from a provided
-        matrix of counts.
-
-    Notes
-    -----
-    The ``stop_words_`` attribute can get large and increase the model size
-    when pickling. This attribute is provided only for introspection and can
-    be safely removed using delattr or set to None before pickling.
     """

     def __init__(self, input='content', encoding='utf-8',
@@ -1782,11 +1818,14 @@ def fit(self, raw_documents, y=None):
         Parameters
         ----------
         raw_documents : iterable
-            an iterable which yields either str, unicode or file objects
+            An iterable which yields either str, unicode or file objects.
+        y : None
+            This parameter is not needed to compute tfidf.

         Returns
         -------
-        self : TfidfVectorizer
+        self : object
+            Fitted vectorizer.
         """
         self._check_params()
         self._warn_for_unused_params()
@@ -1803,7 +1842,9 @@ def fit_transform(self, raw_documents, y=None):
         Parameters
         ----------
         raw_documents : iterable
-            an iterable which yields either str, unicode or file objects
+            An iterable which yields either str, unicode or file objects.
+        y : None
+            This parameter is ignored.

         Returns
         -------
@@ -1826,9 +1867,9 @@ def transform(self, raw_documents, copy="deprecated"):
         Parameters
         ----------
         raw_documents : iterable
-            an iterable which yields either str, unicode or file objects
+            An iterable which yields either str, unicode or file objects.

-        copy : boolean, default True
+        copy : bool, default True
            Whether to copy X and operate on the copy or perform in-place
            operations.
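As a quick illustration of the behaviour the new "Returns" sections document (a sketch, not part of this commit; it assumes a scikit-learn installation with these vectorizers available), the build_* helpers hand back plain callables and fit returns the fitted vectorizer itself:

    >>> from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    >>> vectorizer = CountVectorizer()
    >>> analyzer = vectorizer.build_analyzer()
    >>> analyzer("Bi-grams are cool!")  # preprocessing + tokenization in one callable
    ['bi', 'grams', 'are', 'cool']
    >>> tokenizer = vectorizer.build_tokenizer()
    >>> tokenizer("The string to split.")  # token_pattern.findall under the hood
    ['The', 'string', 'to', 'split']
    >>> tfidf = TfidfVectorizer()
    >>> tfidf.fit(["first document", "second document"]) is tfidf  # fit returns self
    True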