From f0314ae3bdef704edda58c129b7baca7306a79f9 Mon Sep 17 00:00:00 2001 From: Yulan Lin Date: Sat, 15 Jul 2017 15:26:28 -0500 Subject: [PATCH 1/4] edited tfidfvectorizer and tfidftransformer docstrings --- sklearn/feature_extraction/text.py | 70 +++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index fa7306ab9def5..266d4a6474cc7 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -971,24 +971,40 @@ def _make_int_array(): class TfidfTransformer(BaseEstimator, TransformerMixin): """Transform a count matrix to a normalized tf or tf-idf representation - Tf means term-frequency while tf-idf means term-frequency times inverse - document-frequency. This is a common term weighting scheme in information - retrieval, that has also found good use in document classification. + Definitions: - The goal of using tf-idf instead of the raw frequencies of occurrence of a - token in a given document is to scale down the impact of tokens that occur - very frequently in a given corpus and that are hence empirically less - informative than features that occur in a small fraction of the training - corpus. + Term-frequency (tf) = the frequency of a term in a document. + + Inverse-document-frequency (idf) = 1 / (# of docs a term appears in) + + This is a common term weighting scheme in information + retrieval, also commonly used in document classification. + + Using the tf-idf transformed frequencies allows the raw term counts of + a term occurring in a document to be weighted by the frequency of the + term occurring in other documents in the corpus. Thus, terms that occur in + many documents in a corpus will be down-weighted, and terms that occur in + few documents in a corpus will be up-weighted. Frequently occurring terms + are empirically less informative than features that occur in a small + fraction of the training corpus. 
The formula that is used to compute the tf-idf of term t is - tf-idf(d, t) = tf(t) * idf(d, t), and the idf is computed as - idf(d, t) = log [ n / df(d, t) ] + 1 (if ``smooth_idf=False``), - where n is the total number of documents and df(d, t) is the - document frequency; the document frequency is the number of documents d - that contain term t. The effect of adding "1" to the idf in the equation - above is that terms with zero idf, i.e., terms that occur in all documents + + tf-idf(d, t, D) = tf(t, d) * idf(t, D) + + Where `d` represents a document, `t` represents a term, `D` is the + collection of documents, `tf(t, d)` is the frequency of term `t` in + document `d`, and `idf(t, D)` is the inverse of the frequency of term `t` + in the corpus `D`. + + The idf is computed as: + + idf(d, t) = log(n/df(D, t)) + 1 + + The effect of adding "1" to the idf in the equation + above is that terms with an idf=1, i.e., terms that occur in all documents in a training set, will not be entirely ignored. + (Note that the idf formula above differs from the standard textbook notation that defines the idf as idf(d, t) = log [ n / (df(d, t) + 1) ]). @@ -996,7 +1012,9 @@ class TfidfTransformer(BaseEstimator, TransformerMixin): If ``smooth_idf=True`` (the default), the constant "1" is added to the numerator and denominator of the idf as if an extra document was seen containing every term in the collection exactly once, which prevents - zero divisions: idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1. + zero divisions: + + idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1. Furthermore, the formulas used to compute tf and idf depend on parameter settings that correspond to the SMART notation used in IR @@ -1008,6 +1026,8 @@ class TfidfTransformer(BaseEstimator, TransformerMixin): Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) when ``norm=None``. + By default, the l2 norm is used. + Read more in the :ref:`User Guide `. 
Parameters @@ -1126,8 +1146,26 @@ class TfidfVectorizer(CountVectorizer): """Convert a collection of raw documents to a matrix of TF-IDF features. Equivalent to CountVectorizer followed by TfidfTransformer. + CountVectorizer takes a collection of raw documents + and transforms it into a term-document matrix with the shape + [n_samples, n_features], or more intuitively, the shape [n_docs, n_terms], + and each item of the matrix is the count of a word in a given document. - Read more in the :ref:`User Guide `. + Using the tf-idf transformed frequencies allows the raw term counts of + a term occurring in a document to be weighted by the frequency of the + term occurring in other documents in the corpus. Thus, terms that occur in + many documents in a corpus will be down-weighted, and terms that occur in + few documents in a corpus will be up-weighted. Frequently occurring terms + are empirically less informative than features that occur in a small + fraction of the training corpus. + + Definitions: + + * Term-frequency (tf): the frequency of a term in a document. + * Inverse-document-frequency (idf): 1 / (# of docs a term appears in) + + + Read more in the :py:class:`TfidfTransformer` docs. Parameters ---------- From c9428f44f90792f1e38135005125a77651a124a7 Mon Sep 17 00:00:00 2001 From: Yulan Lin Date: Sat, 15 Jul 2017 16:24:55 -0500 Subject: [PATCH 2/4] Added a brief explanation of the normed parameter, expanded TfidfVectorizer docstring --- sklearn/feature_extraction/text.py | 40 ++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 266d4a6474cc7..c7abc850188f4 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1023,11 +1023,12 @@ class TfidfTransformer(BaseEstimator, TransformerMixin): Tf is "n" (natural) by default, "l" (logarithmic) when ``sublinear_tf=True``.
Idf is "t" when use_idf is given, "n" (none) otherwise. + The "norm" parameter provides the "how" of how each input vector is + scaled during normalization: the default value is "l2." + Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) when ``norm=None``. - By default, the l2 norm is used. - Read more in the :ref:`User Guide `. Parameters @@ -1165,8 +1166,43 @@ class TfidfVectorizer(CountVectorizer): * Inverse-document-frequency (idf): 1 / (# of docs a term appears in) + The formula that is used to compute the tf-idf of term t is + + tf-idf(d, t, D) = tf(t, d) * idf(t, D) + + Where `d` represents a document, `t` represents a term, `D` is the + collection of documents, `tf(t, d)` is the frequency of term `t` in + document `d`, and `idf(t, D)` is the inverse of the frequency of term `t` + in the corpus `D`. + + The idf is computed as: + + idf(d, t) = log(n/df(D, t)) + 1 + + If ``smooth_idf=True`` (the default), the constant "1" is added to the + numerator and denominator of the idf as if an extra document was seen + containing every term in the collection exactly once, which prevents + zero divisions: + + idf(d, t) = log [ (1 + n) / (1 + df(d, t)) ] + 1. + + Furthermore, the formulas used to compute tf and idf depend + on parameter settings that correspond to the SMART notation used in IR + as follows: + + Tf is "n" (natural) by default, "l" (logarithmic) when + ``sublinear_tf=True``. + Idf is "t" when use_idf is given, "n" (none) otherwise. + + The "norm" parameter provides the "how" of how each input vector is + scaled during normalization: the default value is "l2." + + Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) + when ``norm=None``. + Read more in the :py:class:`TfidfTransformer` docs. 
+ Parameters ---------- input : string {'filename', 'file', 'content'} From e3fae5fd37907e9e52444c7e7bcee0356c775091 Mon Sep 17 00:00:00 2001 From: Yulan Lin Date: Sat, 15 Jul 2017 16:35:09 -0500 Subject: [PATCH 3/4] moved norm default comment to the parameters section of the docstring --- sklearn/feature_extraction/text.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index c7abc850188f4..914f9d3b375f7 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1023,8 +1023,6 @@ class TfidfTransformer(BaseEstimator, TransformerMixin): Tf is "n" (natural) by default, "l" (logarithmic) when ``sublinear_tf=True``. Idf is "t" when use_idf is given, "n" (none) otherwise. - The "norm" parameter provides the "how" of how each input vector is - scaled during normalization: the default value is "l2." Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) when ``norm=None``. @@ -1033,7 +1031,7 @@ class TfidfTransformer(BaseEstimator, TransformerMixin): Parameters ---------- - norm : 'l1', 'l2' or None, optional + norm : 'l1', 'l2' or None, optional, default = 'l2' Norm used to normalize term vectors. None for no normalization. use_idf : boolean, default=True @@ -1194,9 +1192,6 @@ class TfidfVectorizer(CountVectorizer): ``sublinear_tf=True``. Idf is "t" when use_idf is given, "n" (none) otherwise. - The "norm" parameter provides the "how" of how each input vector is - scaled during normalization: the default value is "l2." - Normalization is "c" (cosine) when ``norm='l2'``, "n" (none) when ``norm=None``. @@ -1310,7 +1305,7 @@ class TfidfVectorizer(CountVectorizer): dtype : type, optional Type of the matrix returned by fit_transform() or transform(). - norm : 'l1', 'l2' or None, optional + norm : 'l1', 'l2' or None, optional, default = 'l2' Norm used to normalize term vectors. None for no normalization. 
use_idf : boolean, default=True From 1d224a5de8a37dd1dbc163d1128c73e999244539 Mon Sep 17 00:00:00 2001 From: Yulan Lin Date: Sun, 23 Jul 2017 15:38:00 -0500 Subject: [PATCH 4/4] minor edit to default value in docstring --- sklearn/feature_extraction/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 914f9d3b375f7..72f889a79665a 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1031,7 +1031,7 @@ class TfidfTransformer(BaseEstimator, TransformerMixin): Parameters ---------- - norm : 'l1', 'l2' or None, optional, default = 'l2' + norm : 'l1', 'l2' or None, optional, default='l2' Norm used to normalize term vectors. None for no normalization. use_idf : boolean, default=True @@ -1305,7 +1305,7 @@ class TfidfVectorizer(CountVectorizer): dtype : type, optional Type of the matrix returned by fit_transform() or transform(). - norm : 'l1', 'l2' or None, optional, default = 'l2' + norm : 'l1', 'l2' or None, optional, default='l2' Norm used to normalize term vectors. None for no normalization. use_idf : boolean, default=True