From 60635b9385049ed30faf19a6c31a3525a1992511 Mon Sep 17 00:00:00 2001
From: Vishaal Kapoor
Date: Sun, 27 Jan 2019 14:02:59 -0800
Subject: [PATCH 1/4] Correct TF-IDF formula in TfidfTransformer comments.

The existing formula has some typos involving the arguments of tf and idf.
Corrected the formula as per the Wikipedia article for TFIDF:

https://en.wikipedia.org/wiki/Tf%E2%80%93idf
---
 sklearn/feature_extraction/text.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index d705a060e7588..cdd0027074a0b 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1146,17 +1146,17 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
     informative than features that occur in a small fraction of the training
     corpus.
 
-    The formula that is used to compute the tf-idf of term t is
-    tf-idf(d, t) = tf(t) * idf(d, t), and the idf is computed as
-    idf(d, t) = log [ n / df(d, t) ] + 1 (if ``smooth_idf=False``),
-    where n is the total number of documents and df(d, t) is the
-    document frequency; the document frequency is the number of documents d
-    that contain term t. The effect of adding "1" to the idf in the equation
-    above is that terms with zero idf, i.e., terms that occur in all documents
-    in a training set, will not be entirely ignored.
-    (Note that the idf formula above differs from the standard
-    textbook notation that defines the idf as
-    idf(d, t) = log [ n / (df(d, t) + 1) ]).
+    The formula that is used to compute the tf-idf for a term t of a document d
+    in a document set D is tf-idf(t, d, D) = tf(t, d) * idf(t, D), and the idf
+    is computed as idf(t, D) = log [ |D| / df(t, D) ] + 1 (if
+    ``smooth_idf=False``), where df(t, D) is the document frequency; the
+    document frequency is the number of documents in D that contain the term t.
+    The effect of adding "1" to the idf in the equation above is that terms
+    with zero idf, i.e., terms that occur in all documents in a training set,
+    will not be entirely ignored.
+    (Note that the idf formula above differs from the standard textbook
+    notation that defines the idf as
+    idf(t, D) = log [ |D| / (df(t, D) + 1) ]).
 
     If ``smooth_idf=True`` (the default), the constant "1" is added to the
     numerator and denominator of the idf as if an extra document was seen

From 9424115cd8a0ed19a414818495247953ebe57882 Mon Sep 17 00:00:00 2001
From: Vishaal Kapoor
Date: Mon, 28 Jan 2019 11:11:32 -0800
Subject: [PATCH 2/4] Remove document set D as it is pedantic.

---
 sklearn/feature_extraction/text.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index cdd0027074a0b..0f5545b94306c 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1147,16 +1147,16 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
     corpus.
 
     The formula that is used to compute the tf-idf for a term t of a document d
-    in a document set D is tf-idf(t, d, D) = tf(t, d) * idf(t, D), and the idf
-    is computed as idf(t, D) = log [ |D| / df(t, D) ] + 1 (if
-    ``smooth_idf=False``), where df(t, D) is the document frequency; the
-    document frequency is the number of documents in D that contain the term t.
-    The effect of adding "1" to the idf in the equation above is that terms
-    with zero idf, i.e., terms that occur in all documents in a training set,
-    will not be entirely ignored.
+    tf-idf(t, d) = tf(t, d) * idf(t), and the idf is computed as idf(t) = log [
+    n / df(t) ] + 1 (if ``smooth_idf=False``), where n is the total number of
+    documents and df(t) is the document frequency of t; the document frequency
+    is the number of documents that contain the term t. The effect of adding
+    "1" to the idf in the equation above is that terms with zero idf, i.e.,
+    terms that occur in all documents in a training set, will not be entirely
+    ignored.
     (Note that the idf formula above differs from the standard textbook
     notation that defines the idf as
-    idf(t, D) = log [ |D| / (df(t, D) + 1) ]).
+    idf(t) = log [ n / (df(t) + 1) ]).
 
     If ``smooth_idf=True`` (the default), the constant "1" is added to the
     numerator and denominator of the idf as if an extra document was seen

From b3aa841c59b37421e23da23bb4d7d708c8c28b29 Mon Sep 17 00:00:00 2001
From: Vishaal Kapoor
Date: Mon, 28 Jan 2019 16:56:44 -0800
Subject: [PATCH 3/4] Language to introduce document set as the context.

---
 sklearn/feature_extraction/text.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 0f5545b94306c..d06e4c7fd483e 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1147,12 +1147,13 @@ class TfidfTransformer(BaseEstimator, TransformerMixin):
     corpus.
 
     The formula that is used to compute the tf-idf for a term t of a document d
-    tf-idf(t, d) = tf(t, d) * idf(t), and the idf is computed as idf(t) = log [
-    n / df(t) ] + 1 (if ``smooth_idf=False``), where n is the total number of
-    documents and df(t) is the document frequency of t; the document frequency
-    is the number of documents that contain the term t. The effect of adding
-    "1" to the idf in the equation above is that terms with zero idf, i.e.,
-    terms that occur in all documents in a training set, will not be entirely
+    in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is
+    computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where
+    n is the total number of documents in the document set and df(t) is the
+    document frequency of t; the document frequency is the number of documents
+    in the document set that contain the term t. The effect of adding "1" to
+    the idf in the equation above is that terms with zero idf, i.e., terms
+    that occur in all documents in a training set, will not be entirely
     ignored.
     (Note that the idf formula above differs from the standard textbook
     notation that defines the idf as

From 566cf77f65978069807f02d423a042435777ba4c Mon Sep 17 00:00:00 2001
From: Vishaal Kapoor
Date: Mon, 28 Jan 2019 17:46:34 -0800
Subject: [PATCH 4/4] Corrections to the feature extraction documentation.
---
 doc/modules/feature_extraction.rst | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/doc/modules/feature_extraction.rst b/doc/modules/feature_extraction.rst
index 2f506dcf7be07..9dfcaa4549f08 100644
--- a/doc/modules/feature_extraction.rst
+++ b/doc/modules/feature_extraction.rst
@@ -436,11 +436,12 @@ Using the ``TfidfTransformer``'s default settings,
 the term frequency, the number of times a term occurs in a given document,
 is multiplied with idf component, which is computed as
 
-:math:`\text{idf}(t) = log{\frac{1 + n_d}{1+\text{df}(d,t)}} + 1`,
+:math:`\text{idf}(t) = \log{\frac{1 + n}{1+\text{df}(t)}} + 1`,
 
-where :math:`n_d` is the total number of documents, and :math:`\text{df}(d,t)`
-is the number of documents that contain term :math:`t`. The resulting tf-idf
-vectors are then normalized by the Euclidean norm:
+where :math:`n` is the total number of documents in the document set, and
+:math:`\text{df}(t)` is the number of documents in the document set that
+contain term :math:`t`. The resulting tf-idf vectors are then normalized by the
+Euclidean norm:
 
 :math:`v_{norm} = \frac{v}{||v||_2} = \frac{v}{\sqrt{v{_1}^2 +
 v{_2}^2 + \dots + v{_n}^2}}`.
@@ -455,14 +456,14 @@ computed in scikit-learn's :class:`TfidfTransformer` and
 :class:`TfidfVectorizer` differ slightly from the standard textbook
 notation that defines the idf as
 
-:math:`\text{idf}(t) = log{\frac{n_d}{1+\text{df}(d,t)}}.`
+:math:`\text{idf}(t) = \log{\frac{n}{1+\text{df}(t)}}.`
 
 In the :class:`TfidfTransformer` and :class:`TfidfVectorizer`
 with ``smooth_idf=False``, the
 "1" count is added to the idf instead of the idf's
 denominator:
 
-:math:`\text{idf}(t) = log{\frac{n_d}{\text{df}(d,t)}} + 1`
+:math:`\text{idf}(t) = \log{\frac{n}{\text{df}(t)}} + 1`
 
 This normalization is implemented by the
 :class:`TfidfTransformer` class::
@@ -509,21 +510,21 @@ v{_2}^2 + \dots + v{_n}^2}}`
 For example, we can compute the tf-idf of the first term in the first
 document in the `counts` array as follows:
 
-:math:`n_{d} = 6`
+:math:`n = 6`
 
-:math:`\text{df}(d, t)_{\text{term1}} = 6`
+:math:`\text{df}(t)_{\text{term1}} = 6`
 
-:math:`\text{idf}(d, t)_{\text{term1}} =
-log \frac{n_d}{\text{df}(d, t)} + 1 = log(1)+1 = 1`
+:math:`\text{idf}(t)_{\text{term1}} =
+\log \frac{n}{\text{df}(t)} + 1 = \log(1)+1 = 1`
 
 :math:`\text{tf-idf}_{\text{term1}} =
 \text{tf} \times \text{idf} = 3 \times 1 = 3`
 
 Now, if we repeat this computation for the remaining 2 terms in the document,
 we get
 
-:math:`\text{tf-idf}_{\text{term2}} = 0 \times (log(6/1)+1) = 0`
+:math:`\text{tf-idf}_{\text{term2}} = 0 \times (\log(6/1)+1) = 0`
 
-:math:`\text{tf-idf}_{\text{term3}} = 1 \times (log(6/2)+1) \approx 2.0986`
+:math:`\text{tf-idf}_{\text{term3}} = 1 \times (\log(6/2)+1) \approx 2.0986`
 
 and the vector of raw tf-idfs:
@@ -540,12 +541,12 @@ Furthermore, the default parameter ``smooth_idf=True`` adds "1" to the
 numerator and denominator as if an extra document was seen containing every
 term in the collection exactly once, which prevents zero divisions:
 
-:math:`\text{idf}(t) = log{\frac{1 + n_d}{1+\text{df}(d,t)}} + 1`
+:math:`\text{idf}(t) = \log{\frac{1 + n}{1+\text{df}(t)}} + 1`
 
 Using this modification, the tf-idf of the third term in document 1 changes to
 1.8473:
 
-:math:`\text{tf-idf}_{\text{term3}} = 1 \times log(7/3)+1 \approx 1.8473`
+:math:`\text{tf-idf}_{\text{term3}} = 1 \times \log(7/3)+1 \approx 1.8473`
 
 And the L2-normalized tf-idf changes to
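
As a sanity check on the corrected formulas, the worked example above can be
reproduced by hand with NumPy. This is a sketch, not part of the patches; the
`counts` array is the six-document, three-term example that the patched
section of feature_extraction.rst walks through:

    import numpy as np

    # Six documents, three terms: the `counts` example from the user guide.
    counts = np.array([[3, 0, 1],
                       [2, 0, 0],
                       [3, 0, 0],
                       [4, 0, 0],
                       [3, 2, 0],
                       [3, 0, 2]])

    n = counts.shape[0]            # total number of documents: n = 6
    df = (counts > 0).sum(axis=0)  # document frequency per term: [6, 1, 2]

    # idf(t) = log [ n / df(t) ] + 1, i.e. the ``smooth_idf=False`` variant.
    idf = np.log(n / df) + 1

    # tf-idf(t, d) = tf(t, d) * idf(t), for the first document.
    print(counts[0] * idf)         # approx. [3.  0.  2.0986]

    # With ``smooth_idf=True`` (the default), "1" is added to the idf's
    # numerator and denominator, as if one extra document contained every term.
    idf_smooth = np.log((1 + n) / (1 + df)) + 1
    print(counts[0] * idf_smooth)  # approx. [3.  0.  1.8473]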
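
The same numbers fall out of :class:`TfidfTransformer` directly. A minimal
round trip on the same array, with ``norm=None`` so the raw (unnormalized)
tf-idfs described above are visible, and the Euclidean normalization from the
guide applied by hand at the end:

    import numpy as np
    from sklearn.feature_extraction.text import TfidfTransformer

    counts = np.array([[3, 0, 1], [2, 0, 0], [3, 0, 0],
                       [4, 0, 0], [3, 2, 0], [3, 0, 2]])

    # Raw tf-idfs under the non-smoothed idf, idf(t) = log [ n / df(t) ] + 1.
    raw = TfidfTransformer(smooth_idf=False, norm=None).fit_transform(counts)
    print(raw.toarray()[0])        # approx. [3.  0.  2.0986]

    # The default ``smooth_idf=True`` lowers the third term's weight.
    smooth = TfidfTransformer(smooth_idf=True, norm=None).fit_transform(counts)
    v = smooth.toarray()[0]
    print(v)                       # approx. [3.  0.  1.8473]

    # v_norm = v / ||v||_2, the Euclidean normalization applied by default.
    print(v / np.linalg.norm(v))   # approx. [0.8515  0.  0.5243]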