From f6c67e16e4edf26d073e04b93880d04eaf0d3079 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Moreyra?= Date: Sat, 26 Jun 2021 15:15:00 -0300 Subject: [PATCH 1/6] Remove TfidfTransformer from DOCSTRING_IGNORE_LIST. --- maint_tools/test_docstrings.py | 1 - 1 file changed, 1 deletion(-) diff --git a/maint_tools/test_docstrings.py b/maint_tools/test_docstrings.py index cba5e8dfd2900..a8ceb36e9aa7e 100644 --- a/maint_tools/test_docstrings.py +++ b/maint_tools/test_docstrings.py @@ -202,7 +202,6 @@ "StackingRegressor", "StandardScaler", "TSNE", - "TfidfTransformer", "TfidfVectorizer", "TheilSenRegressor", "TransformedTargetRegressor", From e187f3c465df5737120f551c6cdbc5d146070db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Moreyra?= Date: Sat, 26 Jun 2021 15:16:05 -0300 Subject: [PATCH 2/6] Fix numpydoc errors in TfidfTransformer. --- sklearn/feature_extraction/text.py | 55 ++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 14 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 0ae1956bef555..16a47a79e0426 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1395,7 +1395,7 @@ def _make_int_array(): class TfidfTransformer(TransformerMixin, BaseEstimator): - """Transform a count matrix to a normalized tf or tf-idf representation + """Transform a count matrix to a normalized tf or tf-idf representation. Tf means term-frequency while tf-idf means term-frequency times inverse document-frequency. This is a common term weighting scheme in information @@ -1445,7 +1445,7 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): similarity between two vectors is their dot product when l2 norm has been applied. * 'l1': Sum of absolute values of vector elements is 1. - See :func:`preprocessing.normalize` + See :func:`preprocessing.normalize`. use_idf : bool, default=True Enable inverse-document-frequency reweighting. 
@@ -1471,6 +1471,26 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): .. versionadded:: 1.0 + See Also + -------- + CountVectorizer : Transforms text into a sparse matrix of n-gram counts. + + TfidfVectorizer : Convert a collection of raw documents to a matrix of + TF-IDF features. + + HashingVectorizer : Convert a collection of text documents to a matrix + of token occurrences. + + References + ---------- + + .. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern + Information Retrieval. Addison Wesley, pp. 68-74. + + .. [MRS2008] C.D. Manning, P. Raghavan and H. Schütze (2008). + Introduction to Information Retrieval. Cambridge University + Press, pp. 118-120. + Examples -------- >>> from sklearn.feature_extraction.text import TfidfTransformer @@ -1495,16 +1515,6 @@ class TfidfTransformer(TransformerMixin, BaseEstimator): 1. , 1.91629073, 1.91629073]) >>> pipe.transform(corpus).shape (4, 8) - - References - ---------- - - .. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern - Information Retrieval. Addison Wesley, pp. 68-74. - - .. [MRS2008] C.D. Manning, P. Raghavan and H. Schütze (2008). - Introduction to Information Retrieval. Cambridge University - Press, pp. 118-120. """ def __init__(self, *, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False): @@ -1520,6 +1530,14 @@ def fit(self, X, y=None): ---------- X : sparse matrix of shape n_samples, n_features) A matrix of term/token counts. + + y : None + This parameter is not needed to compute tfidf. + + Returns + ------- + self : object + Fitted transformer. """ X = self._validate_data(X, accept_sparse=("csr", "csc")) if not sp.issparse(X): @@ -1549,12 +1567,12 @@ def fit(self, X, y=None): return self def transform(self, X, copy=True): - """Transform a count matrix to a tf or tf-idf representation + """Transform a count matrix to a tf or tf-idf representation. 
Parameters ---------- X : sparse matrix of (n_samples, n_features) - a matrix of term/token counts + A matrix of term/token counts. copy : bool, default=True Whether to copy X and operate on the copy or perform in-place @@ -1563,6 +1581,7 @@ def transform(self, X, copy=True): Returns ------- vectors : sparse matrix of shape (n_samples, n_features) + Tf-idf-weighted document-term matrix. """ X = self._validate_data( X, accept_sparse="csr", dtype=FLOAT_DTYPES, copy=copy, reset=False @@ -1590,6 +1609,14 @@ def transform(self, X, copy=True): @property def idf_(self): + """Returns the inverse document frequency (IDF) vector. + + Returns + ------- + idf_ : array of shape (n_features) + The inverse document frequency (IDF) vector; only defined + if ``use_idf`` is True. + """ # if _idf_diag is not set, this will raise an attribute error, # which means hasattr(self, "idf_") is False return np.ravel(self._idf_diag.sum(axis=0)) From 1e6f8a7da070f6d59c7ffd371179d0ccb89fc944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Moreyra?= Date: Sat, 26 Jun 2021 15:34:18 -0300 Subject: [PATCH 3/6] Update sklearn/feature_extraction/text.py Co-authored-by: Guillaume Lemaitre --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 16a47a79e0426..2ceb96c3fc2a0 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1609,7 +1609,7 @@ def transform(self, X, copy=True): @property def idf_(self): - """Returns the inverse document frequency (IDF) vector. + """Return the inverse document frequency (IDF) vector. 
Returns ------- From 54e1fa3ca43c9f51dc50321033d7cb39260ee68d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Moreyra?= Date: Sat, 26 Jun 2021 15:34:30 -0300 Subject: [PATCH 4/6] Update sklearn/feature_extraction/text.py Co-authored-by: Guillaume Lemaitre --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2ceb96c3fc2a0..5c43821a7288c 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1613,7 +1613,7 @@ def idf_(self): Returns ------- - idf_ : array of shape (n_features) + idf_ : ndarray of shape (n_features,) The inverse document frequency (IDF) vector; only defined if ``use_idf`` is True. """ From cfa95b4ba026e8997a36e01ebb7d118798ffa37c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Moreyra?= Date: Sat, 26 Jun 2021 15:42:10 -0300 Subject: [PATCH 5/6] Update sklearn/feature_extraction/text.py Co-authored-by: Guillaume Lemaitre --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 5c43821a7288c..25a8ddb0dc6aa 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1615,7 +1615,7 @@ def idf_(self): ------- idf_ : ndarray of shape (n_features,) The inverse document frequency (IDF) vector; only defined - if ``use_idf`` is True. + if `use_idf` is True. 
""" # if _idf_diag is not set, this will raise an attribute error, # which means hasattr(self, "idf_") is False From 4cffd6670036fe133c5989d619ab339ad66d9b15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1s=20Moreyra?= Date: Sat, 26 Jun 2021 15:42:31 -0300 Subject: [PATCH 6/6] Update sklearn/feature_extraction/text.py Co-authored-by: Guillaume Lemaitre --- sklearn/feature_extraction/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 25a8ddb0dc6aa..63013d05a2f9d 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1532,7 +1532,7 @@ def fit(self, X, y=None): A matrix of term/token counts. y : None - This parameter is not needed to compute tfidf. + This parameter is not needed to compute tf-idf. Returns -------