From fa8555815107cb8b0171b548e70e9a052a44f19b Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Sat, 2 Nov 2019 14:49:05 -0700 Subject: [PATCH 01/10] update docstrings for TfidfVectorizer functions --- sklearn/feature_extraction/text.py | 129 +++++++++++++++++++---------- 1 file changed, 84 insertions(+), 45 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index dd293531184c3..4a333dbfdf558 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -123,7 +123,7 @@ def strip_accents_unicode(s): s : string The string to strip - See also + See Also -------- strip_accents_ascii Remove accentuated char for any unicode symbol that has a direct @@ -150,7 +150,7 @@ def strip_accents_ascii(s): s : string The string to strip - See also + See Also -------- strip_accents_unicode Remove accentuated char for any unicode symbol. @@ -190,14 +190,19 @@ class _VectorizerMixin: _white_spaces = re.compile(r"\s\s+") def decode(self, doc): - """Decode the input into a string of unicode symbols + """Decode the input into a string of unicode symbols. The decoding strategy depends on the vectorizer parameters. Parameters ---------- - doc : string - The string to decode + doc : str + The string to decode. + + Returns + ------- + doc: str + A string of unicode symbols. """ if self.input == 'filename': with open(doc, 'rb') as fh: @@ -298,7 +303,13 @@ def _char_wb_ngrams(self, text_document): return ngrams def build_preprocessor(self): - """Return a function to preprocess the text before tokenization""" + """Return a function to preprocess the text before tokenization. + + Returns + ------- + _preprocess: object + A function to preprocess the text before tokenization. + """ if self.preprocessor is not None: return self.preprocessor @@ -320,14 +331,26 @@ def build_preprocessor(self): ) def build_tokenizer(self): - """Return a function that splits a string into a sequence of tokens""" + """Return a function that splits a string into a sequence of tokens. + + Returns + ------- + tokenizer: object + A function to split a string into a sequence of tokens. + """ if self.tokenizer is not None: return self.tokenizer token_pattern = re.compile(self.token_pattern) return token_pattern.findall def get_stop_words(self): - """Build or fetch the effective stop words list""" + """Build or fetch the effective stop words list. + + Returns + ------- + stop_words: list or None + A list of stop words. + """ return _check_stop_list(self.stop_words) def _check_stop_words_consistency(self, stop_words, preprocess, tokenize): @@ -390,9 +413,13 @@ def _validate_custom_analyzer(self): pass def build_analyzer(self): - """Return a callable that handles preprocessing, tokenization + """ + Return a callable that handles preprocessing, tokenization and n-grams generation. - and n-grams generation. + Returns + ------- + _analyze: object + A function to handle preprocessing, tokenization and n-grams generation. """ if callable(self.analyzer): @@ -667,7 +694,7 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): >>> print(X.shape) (4, 16) - See also + See Also -------- CountVectorizer, TfidfVectorizer @@ -970,7 +997,7 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): [1 0 0 1 1 0 1 1 1] [0 1 1 1 0 0 1 0 1]] - See also + See Also -------- HashingVectorizer, TfidfVectorizer @@ -1237,6 +1264,7 @@ def inverse_transform(self, X): Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) + Document-term matrix. 
Returns ------- @@ -1262,7 +1290,13 @@ def inverse_transform(self, X): for i in range(n_samples)] def get_feature_names(self): - """Array mapping from feature integer indices to feature name""" + """Array mapping from feature integer indices to feature name. + + Returns + ------- + feature_names : list + A list of feature name. + """ self._check_vocabulary() @@ -1492,7 +1526,7 @@ class TfidfVectorizer(CountVectorizer): Parameters ---------- - input : string {'filename', 'file', 'content'} + input : str {'filename', 'file', 'content'} If 'filename', the sequence passed as an argument to fit is expected to be a list of filenames that need reading to fetch the raw content to analyze. @@ -1503,7 +1537,7 @@ class TfidfVectorizer(CountVectorizer): Otherwise the input is expected to be a sequence of items that can be of type string or byte. - encoding : string, 'utf-8' by default. + encoding : str, 'utf-8' by default If bytes or files are given to analyze, this encoding is used to decode. @@ -1524,7 +1558,7 @@ class TfidfVectorizer(CountVectorizer): Both 'ascii' and 'unicode' use NFKD normalization from :func:`unicodedata.normalize`. - lowercase : boolean (default=True) + lowercase : bool (default=True) Convert all characters to lowercase before tokenizing. preprocessor : callable or None (default=None) @@ -1537,7 +1571,7 @@ class TfidfVectorizer(CountVectorizer): preprocessing and n-grams generation steps. Only applies if ``analyzer == 'word'``. - analyzer : string, {'word', 'char', 'char_wb'} or callable + analyzer : str, {'word', 'char', 'char_wb'} or callable Whether the feature should be made of word or character n-grams. Option 'char_wb' creates character n-grams only from text inside word boundaries; n-grams at the edges of words are padded with space. @@ -1551,7 +1585,7 @@ class TfidfVectorizer(CountVectorizer): first read from the file and then passed to the given callable analyzer. - stop_words : string {'english'}, list, or None (default=None) + stop_words : str {'english'}, list, or None (default=None) If a string, it is passed to _check_stop_list and the appropriate stop list is returned. 'english' is currently the only supported string value. @@ -1566,7 +1600,7 @@ class TfidfVectorizer(CountVectorizer): in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms. - token_pattern : string + token_pattern : str Regular expression denoting what constitutes a "token", only used if ``analyzer == 'word'``. The default regexp selects tokens of 2 or more alphanumeric characters (punctuation is completely ignored @@ -1607,10 +1641,10 @@ class TfidfVectorizer(CountVectorizer): indices in the feature matrix, or an iterable over terms. If not given, a vocabulary is determined from the input documents. - binary : boolean (default=False) + binary : bool (default=False) If True, all non-zero term counts are set to 1. This does not mean outputs will have only 0/1 values, only that the tf term in tf-idf - is binary. (Set idf and normalization to False to get 0/1 outputs.) + is binary. (Set idf and normalization to False to get 0/1 outputs). dtype : type, optional (default=float64) Type of the matrix returned by fit_transform() or transform(). @@ -1621,17 +1655,17 @@ class TfidfVectorizer(CountVectorizer): similarity between two vectors is their dot product when l2 norm has been applied. * 'l1': Sum of absolute values of vector elements is 1. - See :func:`preprocessing.normalize` + See :func:`preprocessing.normalize`. 
- use_idf : boolean (default=True) + use_idf : bool (default=True) Enable inverse-document-frequency reweighting. - smooth_idf : boolean (default=True) + smooth_idf : bool (default=True) Smooth idf weights by adding one to document frequencies, as if an extra document was seen containing every term in the collection exactly once. Prevents zero divisions. - sublinear_tf : boolean (default=False) + sublinear_tf : bool (default=False) Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf). Attributes @@ -1639,7 +1673,7 @@ class TfidfVectorizer(CountVectorizer): vocabulary_ : dict A mapping of terms to feature indices. - fixed_vocabulary_: boolean + fixed_vocabulary_: bool True if a fixed vocabulary of term to indices mapping is provided by the user @@ -1655,7 +1689,20 @@ class TfidfVectorizer(CountVectorizer): - were cut off by feature selection (`max_features`). This is only available if no vocabulary was given. + + See Also + -------- + CountVectorizer : Transforms text into a sparse matrix of n-gram counts. + TfidfTransformer : Performs the TF-IDF transformation from a provided + matrix of counts. + + Notes + ----- + The ``stop_words_`` attribute can get large and increase the model size + when pickling. This attribute is provided only for introspection and can + be safely removed using delattr or set to None before pickling. + Examples -------- >>> from sklearn.feature_extraction.text import TfidfVectorizer @@ -1670,20 +1717,7 @@ class TfidfVectorizer(CountVectorizer): >>> print(vectorizer.get_feature_names()) ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] >>> print(X.shape) - (4, 9) - - See also - -------- - CountVectorizer : Transforms text into a sparse matrix of n-gram counts. - - TfidfTransformer : Performs the TF-IDF transformation from a provided - matrix of counts. - - Notes - ----- - The ``stop_words_`` attribute can get large and increase the model size - when pickling. This attribute is provided only for introspection and can - be safely removed using delattr or set to None before pickling. + (4, 9) """ def __init__(self, input='content', encoding='utf-8', @@ -1770,11 +1804,14 @@ def fit(self, raw_documents, y=None): Parameters ---------- raw_documents : iterable - an iterable which yields either str, unicode or file objects + An iterable which has either str, unicode or file objects. + y : None + This parameter is not needed to compute tfidf. Returns ------- - self : TfidfVectorizer + self : object + TfidfVectorizer. """ self._check_params() self._warn_for_unused_params() @@ -1791,7 +1828,9 @@ def fit_transform(self, raw_documents, y=None): Parameters ---------- raw_documents : iterable - an iterable which yields either str, unicode or file objects + An iterable which has either str, unicode or file objects. + y : None + This parameter is not needed to compute tfidf. Returns ------- @@ -1814,9 +1853,9 @@ def transform(self, raw_documents, copy="deprecated"): Parameters ---------- raw_documents : iterable - an iterable which yields either str, unicode or file objects + An iterable which has either str, unicode or file objects. - copy : boolean, default True + copy : bool, default True Whether to copy X and operate on the copy or perform in-place operations. 
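
A quick sketch of the behaviour the docstrings in the patch above describe, reusing the four-document corpus from the TfidfVectorizer docstring example. It assumes the scikit-learn API of this era (``get_feature_names`` was later superseded by ``get_feature_names_out``) and the default word analyzer with lowercasing:

    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus = [
        'This is the first document.',
        'This document is the second document.',
        'And this is the third one.',
        'Is this the first document?',
    ]

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)    # sparse document-term matrix
    print(vectorizer.get_feature_names())   # list of feature names
    print(X.shape)                          # (4, 9)

    # build_analyzer() returns a callable, as the new Returns sections state.
    analyze = vectorizer.build_analyzer()
    print(analyze('This is the first document.'))
    # ['this', 'is', 'the', 'first', 'document']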
From 1a5ed9cd4144178a1a7ffedcbf37908a0178fae9 Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Sat, 2 Nov 2019 15:32:10 -0700 Subject: [PATCH 02/10] fixed some formatting errors --- sklearn/feature_extraction/text.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 4a333dbfdf558..58960be54eb29 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -304,7 +304,7 @@ def _char_wb_ngrams(self, text_document): def build_preprocessor(self): """Return a function to preprocess the text before tokenization. - + Returns ------- _preprocess: object @@ -332,7 +332,7 @@ def build_preprocessor(self): def build_tokenizer(self): """Return a function that splits a string into a sequence of tokens. - + Returns ------- tokenizer: object @@ -345,11 +345,11 @@ def build_tokenizer(self): def get_stop_words(self): """Build or fetch the effective stop words list. - + Returns ------- stop_words: list or None - A list of stop words. + A list of stop words. """ return _check_stop_list(self.stop_words) @@ -414,12 +414,14 @@ def _validate_custom_analyzer(self): def build_analyzer(self): """ - Return a callable that handles preprocessing, tokenization and n-grams generation. + Return a callable that handles preprocessing, tokenization + and n-grams generation. Returns ------- _analyze: object - A function to handle preprocessing, tokenization and n-grams generation. + A function to handle preprocessing, tokenization + and n-grams generation. """ if callable(self.analyzer): @@ -699,6 +701,7 @@ class HashingVectorizer(TransformerMixin, _VectorizerMixin, BaseEstimator): CountVectorizer, TfidfVectorizer """ + def __init__(self, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, @@ -1291,11 +1294,11 @@ def inverse_transform(self, X): def get_feature_names(self): """Array mapping from feature integer indices to feature name. - + Returns ------- feature_names : list - A list of feature name. + A list of feature name. """ self._check_vocabulary() @@ -1689,20 +1692,20 @@ class TfidfVectorizer(CountVectorizer): - were cut off by feature selection (`max_features`). This is only available if no vocabulary was given. - + See Also -------- CountVectorizer : Transforms text into a sparse matrix of n-gram counts. TfidfTransformer : Performs the TF-IDF transformation from a provided matrix of counts. - + Notes ----- The ``stop_words_`` attribute can get large and increase the model size when pickling. This attribute is provided only for introspection and can be safely removed using delattr or set to None before pickling. 
- + Examples -------- >>> from sklearn.feature_extraction.text import TfidfVectorizer @@ -1717,7 +1720,7 @@ class TfidfVectorizer(CountVectorizer): >>> print(vectorizer.get_feature_names()) ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] >>> print(X.shape) - (4, 9) + (4, 9) """ def __init__(self, input='content', encoding='utf-8', From 47aa30dc2355d429a2d5ed64e59362641db0c451 Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Sun, 3 Nov 2019 19:41:19 -0800 Subject: [PATCH 03/10] Update sklearn/feature_extraction/text.py Co-Authored-By: Roman Yurchak --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 58960be54eb29..538df85257d36 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -307,7 +307,7 @@ def build_preprocessor(self): Returns ------- - _preprocess: object + preprocessor: callable A function to preprocess the text before tokenization. """ if self.preprocessor is not None: From fb41ad4a62deeddedc38b6cf180afb5a1d184bba Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Sun, 3 Nov 2019 19:41:31 -0800 Subject: [PATCH 04/10] Update sklearn/feature_extraction/text.py Co-Authored-By: Roman Yurchak --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 538df85257d36..c2aeef2f2dfa6 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -335,7 +335,7 @@ def build_tokenizer(self): Returns ------- - tokenizer: object + tokenizer: callable A function to split a string into a sequence of tokens. """ if self.tokenizer is not None: From 0eb2d55138f9d8dab49d8b1f17d2277c10dda34c Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Sun, 3 Nov 2019 19:41:39 -0800 Subject: [PATCH 05/10] Update sklearn/feature_extraction/text.py Co-Authored-By: Roman Yurchak --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index c2aeef2f2dfa6..c344b22a2bf07 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -419,7 +419,7 @@ def build_analyzer(self): Returns ------- - _analyze: object + analyzer: callable A function to handle preprocessing, tokenization and n-grams generation. """ From caf3628d1eb1c2a6e64f7566e3141ab6fe66c258 Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Sun, 3 Nov 2019 19:42:04 -0800 Subject: [PATCH 06/10] Update sklearn/feature_extraction/text.py Co-Authored-By: Roman Yurchak --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index c344b22a2bf07..20890c5df27b1 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1540,7 +1540,7 @@ class TfidfVectorizer(CountVectorizer): Otherwise the input is expected to be a sequence of items that can be of type string or byte. - encoding : str, 'utf-8' by default + encoding : str, default='utf-8' If bytes or files are given to analyze, this encoding is used to decode. 
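
Patches 03-05 above document that build_preprocessor, build_tokenizer and build_analyzer each return a callable rather than a bare object. A minimal sketch of what two of those callables do under default settings (lowercase=True, analyzer='word'); the sample strings are illustrative only:

    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer()
    preprocess = vec.build_preprocessor()   # callable: preprocessing (lowercasing)
    tokenize = vec.build_tokenizer()        # callable: regex-based token splitting

    print(preprocess('The QUICK Brown Fox'))
    # 'the quick brown fox'
    print(tokenize('the quick brown fox'))
    # ['the', 'quick', 'brown', 'fox']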
From a53f2d8f3b13d125c928d087afb5177395a6e27a Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Sun, 3 Nov 2019 19:42:34 -0800 Subject: [PATCH 07/10] Update sklearn/feature_extraction/text.py Co-Authored-By: Roman Yurchak --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 20890c5df27b1..c030a0f6a1f9b 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1814,7 +1814,7 @@ def fit(self, raw_documents, y=None): Returns ------- self : object - TfidfVectorizer. + Fitted vectorizer. """ self._check_params() self._warn_for_unused_params() From 931b7ef5897f51d3253e017ef68194dfc070f9cb Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Sun, 3 Nov 2019 19:42:47 -0800 Subject: [PATCH 08/10] Update sklearn/feature_extraction/text.py Co-Authored-By: Roman Yurchak --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index c030a0f6a1f9b..e04e232f431bf 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1833,7 +1833,7 @@ def fit_transform(self, raw_documents, y=None): raw_documents : iterable An iterable which has either str, unicode or file objects. y : None - This parameter is not needed to compute tfidf. + This parameter is ignored. Returns ------- From 8783f1bbfe8bd749c2c415060b5e17a489484a7b Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Sun, 3 Nov 2019 19:47:34 -0800 Subject: [PATCH 09/10] Updated some formatting issues --- sklearn/feature_extraction/text.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index e04e232f431bf..c1edf3eb1ed71 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -413,8 +413,7 @@ def _validate_custom_analyzer(self): pass def build_analyzer(self): - """ - Return a callable that handles preprocessing, tokenization + """Return a callable that handles preprocessing, tokenization and n-grams generation. Returns @@ -1298,7 +1297,7 @@ def get_feature_names(self): Returns ------- feature_names : list - A list of feature name. + A list of feature name. """ self._check_vocabulary() @@ -1807,7 +1806,7 @@ def fit(self, raw_documents, y=None): Parameters ---------- raw_documents : iterable - An iterable which has either str, unicode or file objects. + An iterable which yields either str, unicode or file objects. y : None This parameter is not needed to compute tfidf. @@ -1831,7 +1830,7 @@ def fit_transform(self, raw_documents, y=None): Parameters ---------- raw_documents : iterable - An iterable which has either str, unicode or file objects. + An iterable which yields either str, unicode or file objects. y : None This parameter is ignored. @@ -1856,7 +1855,7 @@ def transform(self, raw_documents, copy="deprecated"): Parameters ---------- raw_documents : iterable - An iterable which has either str, unicode or file objects. + An iterable which yields either str, unicode or file objects. 
copy : bool, default True Whether to copy X and operate on the copy or perform in-place From eb01c8b7ddf86a04bfe5339069b26d363bc6e8ef Mon Sep 17 00:00:00 2001 From: Hailey Nguyen Date: Tue, 5 Nov 2019 22:15:54 -0800 Subject: [PATCH 10/10] Update sklearn/feature_extraction/text.py Co-Authored-By: Roman Yurchak --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index c1edf3eb1ed71..5f43d78f48c82 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1297,7 +1297,7 @@ def get_feature_names(self): Returns ------- feature_names : list - A list of feature name. + A list of feature names. """ self._check_vocabulary()
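
The Notes section introduced in patch 01 says the fitted ``stop_words_`` attribute exists only for introspection and can be safely removed with delattr (or set to None) before pickling. A hedged sketch of that advice; the two-document corpus is illustrative, and transform() should keep working because it relies on ``vocabulary_`` rather than ``stop_words_``:

    import pickle
    from sklearn.feature_extraction.text import TfidfVectorizer

    vec = TfidfVectorizer(stop_words='english')
    vec.fit(['python pickles the model', 'python transforms the text'])

    delattr(vec, 'stop_words_')             # introspection-only attribute
    payload = pickle.dumps(vec)             # smaller pickle without stop_words_
    restored = pickle.loads(payload)
    print(restored.transform(['another text']).shape)
    # (1, n_features) -- transform() is unaffected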