diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index dd293531184c3..5f43d78f48c82 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -123,7 +123,7 @@ def strip_accents_unicode(s):
     s : string
         The string to strip

-    See also
+    See Also
     --------
     strip_accents_ascii
         Remove accentuated char for any unicode symbol that has a direct
@@ -150,7 +150,7 @@ def strip_accents_ascii(s):
     s : string
         The string to strip

-    See also
+    See Also
     --------
     strip_accents_unicode
         Remove accentuated char for any unicode symbol.
@@ -190,14 +190,19 @@ class _VectorizerMixin:
     _white_spaces = re.compile(r"\s\s+")

     def decode(self, doc):
-        """Decode the input into a string of unicode symbols
+        """Decode the input into a string of unicode symbols.

         The decoding strategy depends on the vectorizer parameters.

         Parameters
         ----------
-        doc : string
-            The string to decode
+        doc : str
+            The string to decode.
+
+        Returns
+        -------
+        doc : str
+            A string of unicode symbols.
         """
         if self.input == 'filename':
             with open(doc, 'rb') as fh:
@@ -298,7 +303,13 @@ def _char_wb_ngrams(self, text_document):
         return ngrams

     def build_preprocessor(self):
-        """Return a function to preprocess the text before tokenization"""
+        """Return a function to preprocess the text before tokenization.
+
+        Returns
+        -------
+        preprocessor : callable
+            A function to preprocess the text before tokenization.
+        """
         if self.preprocessor is not None:
             return self.preprocessor

@@ -320,14 +331,26 @@ def build_preprocessor(self):
         )

     def build_tokenizer(self):
-        """Return a function that splits a string into a sequence of tokens"""
+        """Return a function that splits a string into a sequence of tokens.
+
+        Returns
+        -------
+        tokenizer : callable
+            A function to split a string into a sequence of tokens.
+        """
         if self.tokenizer is not None:
             return self.tokenizer
         token_pattern = re.compile(self.token_pattern)
         return token_pattern.findall

     def get_stop_words(self):
-        """Build or fetch the effective stop words list"""
+        """Build or fetch the effective stop words list.
+
+        Returns
+        -------
+        stop_words : list or None
+            A list of stop words.
+        """
         return _check_stop_list(self.stop_words)

     def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
@@ -391,8 +414,13 @@ def _validate_custom_analyzer(self):

     def build_analyzer(self):
         """Return a callable that handles preprocessing, tokenization
-        and n-grams generation.
+
+        Returns
+        -------
+        analyzer : callable
+            A function to handle preprocessing, tokenization
+            and n-grams generation.
         """

         if callable(self.analyzer):
@@ -667,11 +695,12 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator):
     >>> print(X.shape)
     (4, 16)

-    See also
+    See Also
     --------
     CountVectorizer, TfidfVectorizer

     """
+
     def __init__(self, input='content', encoding='utf-8',
                  decode_error='strict', strip_accents=None,
                  lowercase=True, preprocessor=None, tokenizer=None,
@@ -970,7 +999,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator):
     [1 0 0 1 1 0 1 1 1]
     [0 1 1 1 0 0 1 0 1]]

-    See also
+    See Also
     --------
     HashingVectorizer, TfidfVectorizer

@@ -1237,6 +1266,7 @@ def inverse_transform(self, X):
         Parameters
         ----------
         X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            Document-term matrix.

         Returns
         -------
@@ -1262,7 +1292,13 @@ def inverse_transform(self, X):
                 for i in range(n_samples)]

     def get_feature_names(self):
-        """Array mapping from feature integer indices to feature name"""
+        """Array mapping from feature integer indices to feature name.
+
+        Returns
+        -------
+        feature_names : list
+            A list of feature names.
+        """

         self._check_vocabulary()

@@ -1492,7 +1528,7 @@ class TfidfVectorizer(CountVectorizer):

     Parameters
     ----------
-    input : string {'filename', 'file', 'content'}
+    input : str {'filename', 'file', 'content'}
         If 'filename', the sequence passed as an argument to fit is
         expected to be a list of filenames that need reading to fetch
         the raw content to analyze.
@@ -1503,7 +1539,7 @@ class TfidfVectorizer(CountVectorizer):
         Otherwise the input is expected to be a sequence of items that
         can be of type string or byte.

-    encoding : string, 'utf-8' by default.
+    encoding : str, default='utf-8'
         If bytes or files are given to analyze, this encoding is used to
         decode.

@@ -1524,7 +1560,7 @@ class TfidfVectorizer(CountVectorizer):
         Both 'ascii' and 'unicode' use NFKD normalization from
         :func:`unicodedata.normalize`.

-    lowercase : boolean (default=True)
+    lowercase : bool (default=True)
         Convert all characters to lowercase before tokenizing.

     preprocessor : callable or None (default=None)
@@ -1537,7 +1573,7 @@ class TfidfVectorizer(CountVectorizer):
         preprocessing and n-grams generation steps.
         Only applies if ``analyzer == 'word'``.

-    analyzer : string, {'word', 'char', 'char_wb'} or callable
+    analyzer : str, {'word', 'char', 'char_wb'} or callable
         Whether the feature should be made of word or character n-grams.
         Option 'char_wb' creates character n-grams only from text inside
         word boundaries; n-grams at the edges of words are padded with space.
@@ -1551,7 +1587,7 @@ class TfidfVectorizer(CountVectorizer):
         first read from the file and then passed to the given callable
         analyzer.

-    stop_words : string {'english'}, list, or None (default=None)
+    stop_words : str {'english'}, list, or None (default=None)
         If a string, it is passed to _check_stop_list and the appropriate stop
         list is returned. 'english' is currently the only supported string
         value.
@@ -1566,7 +1602,7 @@ class TfidfVectorizer(CountVectorizer):
         in the range [0.7, 1.0) to automatically detect and filter stop
         words based on intra corpus document frequency of terms.

-    token_pattern : string
+    token_pattern : str
         Regular expression denoting what constitutes a "token", only used
         if ``analyzer == 'word'``. The default regexp selects tokens of 2
         or more alphanumeric characters (punctuation is completely ignored
@@ -1607,10 +1643,10 @@ class TfidfVectorizer(CountVectorizer):
         indices in the feature matrix, or an iterable over terms. If not
         given, a vocabulary is determined from the input documents.

-    binary : boolean (default=False)
+    binary : bool (default=False)
         If True, all non-zero term counts are set to 1. This does not mean
         outputs will have only 0/1 values, only that the tf term in tf-idf
-        is binary. (Set idf and normalization to False to get 0/1 outputs.)
+        is binary. (Set idf and normalization to False to get 0/1 outputs).

     dtype : type, optional (default=float64)
         Type of the matrix returned by fit_transform() or transform().

@@ -1621,17 +1657,17 @@ class TfidfVectorizer(CountVectorizer):
         similarity between two vectors is their dot product when l2 norm has
         been applied.
         * 'l1': Sum of absolute values of vector elements is 1.
-        See :func:`preprocessing.normalize`
+        See :func:`preprocessing.normalize`.

-    use_idf : boolean (default=True)
+    use_idf : bool (default=True)
         Enable inverse-document-frequency reweighting.

-    smooth_idf : boolean (default=True)
+    smooth_idf : bool (default=True)
         Smooth idf weights by adding one to document frequencies, as if an
         extra document was seen containing every term in the collection
         exactly once. Prevents zero divisions.

-    sublinear_tf : boolean (default=False)
+    sublinear_tf : bool (default=False)
         Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

     Attributes
@@ -1639,7 +1675,7 @@ class TfidfVectorizer(CountVectorizer):
     vocabulary_ : dict
         A mapping of terms to feature indices.

-    fixed_vocabulary_: boolean
+    fixed_vocabulary_ : bool
         True if a fixed vocabulary of term to indices mapping
         is provided by the user

@@ -1656,6 +1692,19 @@ class TfidfVectorizer(CountVectorizer):

         This is only available if no vocabulary was given.

+    See Also
+    --------
+    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
+
+    TfidfTransformer : Performs the TF-IDF transformation from a provided
+        matrix of counts.
+
+    Notes
+    -----
+    The ``stop_words_`` attribute can get large and increase the model size
+    when pickling. This attribute is provided only for introspection and can
+    be safely removed using delattr or set to None before pickling.
+
     Examples
     --------
     >>> from sklearn.feature_extraction.text import TfidfVectorizer
@@ -1671,19 +1720,6 @@ class TfidfVectorizer(CountVectorizer):
     ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
     >>> print(X.shape)
     (4, 9)
-
-    See also
-    --------
-    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.
-
-    TfidfTransformer : Performs the TF-IDF transformation from a provided
-        matrix of counts.
-
-    Notes
-    -----
-    The ``stop_words_`` attribute can get large and increase the model size
-    when pickling. This attribute is provided only for introspection and can
-    be safely removed using delattr or set to None before pickling.
     """

     def __init__(self, input='content', encoding='utf-8',
@@ -1770,11 +1806,14 @@ def fit(self, raw_documents, y=None):
         Parameters
         ----------
         raw_documents : iterable
-            an iterable which yields either str, unicode or file objects
+            An iterable which yields either str, unicode or file objects.
+        y : None
+            This parameter is not needed to compute tfidf.

         Returns
         -------
-        self : TfidfVectorizer
+        self : object
+            Fitted vectorizer.
         """
         self._check_params()
         self._warn_for_unused_params()
@@ -1791,7 +1830,9 @@ def fit_transform(self, raw_documents, y=None):
         Parameters
         ----------
         raw_documents : iterable
-            an iterable which yields either str, unicode or file objects
+            An iterable which yields either str, unicode or file objects.
+        y : None
+            This parameter is ignored.

         Returns
         -------
@@ -1814,9 +1855,9 @@ def transform(self, raw_documents, copy="deprecated"):
         Parameters
         ----------
         raw_documents : iterable
-            an iterable which yields either str, unicode or file objects
+            An iterable which yields either str, unicode or file objects.

-        copy : boolean, default True
+        copy : bool, default=True
            Whether to copy X and operate on the copy or perform in-place
            operations.
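
A few illustrative notes on the behaviour these docstrings describe follow; the sketches use only the public API and are not part of the patch itself.

The new `Returns` sections for `build_preprocessor`, `build_tokenizer`, and `build_analyzer` document three callables that compose: the analyzer applies preprocessing, tokenization, and n-gram generation in one step. A minimal sketch (the output comments assume the default `token_pattern` and `lowercase=True`):

```python
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2))

preprocess = vectorizer.build_preprocessor()  # callable: str -> str
tokenize = vectorizer.build_tokenizer()       # callable: str -> list of str
analyze = vectorizer.build_analyzer()         # callable: doc -> list of n-grams

print(preprocess("Hello World"))  # 'hello world'
print(tokenize("hello world"))    # ['hello', 'world']
print(analyze("Hello World"))     # ['hello', 'world', 'hello world']
```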
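Similarly, the documented return values of `get_feature_names` (the accessor as of this version of the API) and `inverse_transform` can be checked on a toy corpus:

```python
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["the first document", "the second document"]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)  # document-term matrix, shape (2, 4)

print(vectorizer.get_feature_names())
# ['document', 'first', 'second', 'the']

# inverse_transform maps rows of the document-term matrix back to the
# terms with non-zero entries, returning one array per document.
print(vectorizer.inverse_transform(X))
# [array(['document', 'first', 'the'], ...),
#  array(['document', 'second', 'the'], ...)]
```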
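The `smooth_idf` description corresponds to a concrete formula: with smoothing enabled, scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t. A quick check of that reading (the corpus and expected value are illustrative only):

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["the cat sat", "the dog sat", "the cat ran"]
vect = TfidfVectorizer(smooth_idf=True, norm=None)  # raw tf-idf, no normalization
vect.fit(corpus)

n = len(corpus)  # 3 documents
df_the = 3       # 'the' appears in every document
expected = np.log((1 + n) / (1 + df_the)) + 1  # = 1.0 for a term in every doc

assert np.isclose(vect.idf_[vect.vocabulary_['the']], expected)
```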
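Finally, the new `y : None` entries in `fit` and `fit_transform` document the scikit-learn convention that `y` is accepted only for pipeline compatibility and ignored; `fit(...)` followed by `transform(...)` on the same documents is equivalent to `fit_transform(...)`. A sketch:

```python
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["first document", "second document"]

v = TfidfVectorizer().fit(docs)   # fit returns the fitted vectorizer (self)
X1 = v.transform(docs)
X2 = TfidfVectorizer().fit_transform(docs)

assert (X1 != X2).nnz == 0        # identical sparse matrices
```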