From 937666a4fd4bda4fc0b119b7446d30d28f57c9c7 Mon Sep 17 00:00:00 2001
From: Alek Lefebvre <info@aleklefebvre.ca>
Date: Sun, 15 Aug 2021 14:05:00 -0400
Subject: [PATCH 1/6] Throw error when min_idf or max_idf is float and > 1

---
 sklearn/feature_extraction/tests/test_text.py | 22 +++++++++++++++++++
 sklearn/feature_extraction/text.py            | 10 +++++++++
 2 files changed, 32 insertions(+)
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 5b09325c61031..3cb20d6bb20f7 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -805,6 +805,28 @@ def test_vectorizer_min_df():
     assert len(vect.stop_words_) == 5
 
 
+def test_vectorizer_max_df_unwanted_float():
+    message = (
+        "max_df is out of range for a float value. Use an int or a float between 0.0"
+        " and 1.0."
+    )
+    with pytest.raises(ValueError, match=message):
+        test_data = ["abc", "dea", "eat"]
+        vect = CountVectorizer(analyzer="char", max_df=2.0)
+        vect.fit(test_data)
+
+
+def test_vectorizer_min_df_unwanted_float():
+    message = (
+        "min_df is out of range for a float value. Use an int or a float between 0.0"
+        " and 1.0."
+    )
+    with pytest.raises(ValueError, match=message):
+        test_data = ["abc", "dea", "eat"]
+        vect = CountVectorizer(analyzer="char", min_df=1.5)
+        vect.fit(test_data)
+
+
 def test_count_binary_occurrences():
     # by default multiple occurrences are counted as longs
     test_data = ["aaabc", "abbde"]
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 24d9b07023fd3..d98386d03e209 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1103,6 +1103,16 @@ def __init__(
         self.min_df = min_df
         if max_df < 0 or min_df < 0:
             raise ValueError("negative value for max_df or min_df")
+        if not isinstance(min_df, numbers.Integral) and min_df > 1.0:
+            raise ValueError(
+                "min_df is out of range for a float value. Use an int or a float"
+                " between 0.0 and 1.0."
+            )
+        if not isinstance(max_df, numbers.Integral) and max_df > 1.0:
+            raise ValueError(
+                "max_df is out of range for a float value. Use an int or a float"
+                " between 0.0 and 1.0."
+            )
         self.max_features = max_features
         if max_features is not None:
             if not isinstance(max_features, numbers.Integral) or max_features <= 0:

From 5b004f5eb61f99df4a61e308a6955ccf244febf2 Mon Sep 17 00:00:00 2001
From: Alek Lefebvre <info@aleklefebvre.ca>
Date: Sun, 15 Aug 2021 14:21:18 -0400
Subject: [PATCH 2/6] changelog

---
 doc/whats_new/v1.0.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst
index b4fcbee992383..3a6d6124feff4 100644
--- a/doc/whats_new/v1.0.rst
+++ b/doc/whats_new/v1.0.rst
@@ -366,6 +366,10 @@ Changelog
   error with unsupported value type.
   :pr:`19520` by :user:`Jeff Zhao <kamiyaa>`.
 
+- |Fix| Fixed a bug in :class:`feature_extraction.CountVectorizer` by raising an
+  error when min_idf or max_idf is a float and > 1.0.
+  :pr:`20752` by :user:`Alek Lefebvre <AlekLefebvre>`.
+
 :mod:`sklearn.feature_selection`
 ................................
 

From 06f70e9c927b186fdcde265e476aa5751bbce7e7 Mon Sep 17 00:00:00 2001
From: Alek Lefebvre <info@aleklefebvre.ca>
Date: Sat, 11 Sep 2021 06:57:39 -0400
Subject: [PATCH 3/6] removed debug file

---
 debug.py | 18 ------------------
 1 file changed, 18 deletions(-)
 delete mode 100644 debug.py

diff --git a/debug.py b/debug.py
deleted file mode 100644
index 52c2515bd3018..0000000000000
--- a/debug.py
+++ /dev/null
@@ -1,18 +0,0 @@
-from sklearn.feature_extraction.text import CountVectorizer
-
-corpus = [
-    "This is the first document.",
-    "This document is the second document.",
-    "And this is the third one.",
-    "Is this the first document?",
-]
-
-vectorizer = CountVectorizer(analyzer="word", min_df=2, max_df=3)
-X = vectorizer.fit_transform(corpus)
-print(vectorizer.get_feature_names())
-# result ['document', 'first']
-
-vectorizer = CountVectorizer(analyzer="word", min_df=2, max_df=3.0)
-X = vectorizer.fit_transform(corpus)
-print(vectorizer.get_feature_names())
-# result ['document', 'first', 'is', 'the', 'this']

From 0a9640c4c71ea450f2a28e688cf937108f437525 Mon Sep 17 00:00:00 2001
From: Alek Lefebvre <info@aleklefebvre.ca>
Date: Sun, 19 Sep 2021 11:03:13 -0400
Subject: [PATCH 4/6] refactor validation and parametrize tests

---
 doc/whats_new/v1.1.rst                        |  2 +-
 sklearn/feature_extraction/tests/test_text.py | 26 +++++--------
 sklearn/feature_extraction/text.py            | 39 ++++++++++---------
 3 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index ed1f7f42b43e0..e28f055f75db6 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -43,7 +43,7 @@ Changelog
 
 - |Fix| Fixed a bug in :class:`feature_extraction.CountVectorizer` and
   :class:`feature_extraction.TfidfVectorizer` by raising an
-  error when min_idf or max_idf is a float and > 1.0.
+  error when 'min_idf' or 'max_idf' are floating-point numbers greater than 1.
   :pr:`20752` by :user:`Alek Lefebvre <AlekLefebvre>`.
 
 :mod:`sklearn.utils`
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index dd651bb0174b1..e1d42592f0c03 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -832,25 +832,17 @@ def test_vectorizer_min_df():
     assert len(vect.stop_words_) == 5
 
 
-def test_vectorizer_max_df_unwanted_float():
-    message = (
-        "max_df is out of range for a float value. Use an int or a float between 0.0"
-        " and 1.0."
-    )
-    with pytest.raises(ValueError, match=message):
-        test_data = ["abc", "dea", "eat"]
-        vect = CountVectorizer(analyzer="char", max_df=2.0)
-        vect.fit(test_data)
-
-
-def test_vectorizer_min_df_unwanted_float():
-    message = (
-        "min_df is out of range for a float value. Use an int or a float between 0.0"
-        " and 1.0."
-    )
+@pytest.mark.parametrize(
+    "min_df, max_df, message",
+    (
+        (None, 2.0, "max_df == 2.0, must be <= 1.0."),
+        (1.5, None, "min_df == 1.5, must be <= 1.0."),
+    ),
+)
+def test_vectorizer_max_df_unwanted_float(min_df, max_df, message):
     with pytest.raises(ValueError, match=message):
         test_data = ["abc", "dea", "eat"]
-        vect = CountVectorizer(analyzer="char", min_df=1.5)
+        vect = CountVectorizer(analyzer="char", min_df=min_df, max_df=max_df)
         vect.fit(test_data)
 
 
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 02f248ea2704f..557280acb9111 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -29,7 +29,7 @@
 from ..preprocessing import normalize
 from ._hash import FeatureHasher
 from ._stop_words import ENGLISH_STOP_WORDS
-from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES
+from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES, check_scalar
 from ..utils.deprecation import deprecated
 from ..utils import _IS_32BIT
 from ..utils.fixes import _astype_copy_false
@@ -1103,25 +1103,7 @@ def __init__(
         self.stop_words = stop_words
         self.max_df = max_df
         self.min_df = min_df
-        if max_df < 0 or min_df < 0:
-            raise ValueError("negative value for max_df or min_df")
-        if not isinstance(min_df, numbers.Integral) and min_df > 1.0:
-            raise ValueError(
-                "min_df is out of range for a float value. Use an int or a float"
-                " between 0.0 and 1.0."
-            )
-        if not isinstance(max_df, numbers.Integral) and max_df > 1.0:
-            raise ValueError(
-                "max_df is out of range for a float value. Use an int or a float"
-                " between 0.0 and 1.0."
-            )
         self.max_features = max_features
-        if max_features is not None:
-            if not isinstance(max_features, numbers.Integral) or max_features <= 0:
-                raise ValueError(
-                    "max_features=%r, neither a positive integer nor None"
-                    % max_features
-                )
         self.ngram_range = ngram_range
         self.vocabulary = vocabulary
         self.binary = binary
@@ -1258,6 +1240,25 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         X.sort_indices()
         return vocabulary, X
 
+    def _validate_params(self):
+        """Validation of min_df, max_df and max_features"""
+        super()._validate_params()
+
+        if self.max_features is not None:
+            check_scalar(self.max_features, "max_features", int, min_val=0)
+
+        if self.min_df is not None:
+            try:
+                check_scalar(self.min_df, "min_df", int, min_val=0)
+            except TypeError:
+                check_scalar(self.min_df, "min_df", float, min_val=0.0, max_val=1.0)
+
+        if self.max_df is not None:
+            try:
+                check_scalar(self.max_df, "max_df", int, min_val=0)
+            except TypeError:
+                check_scalar(self.max_df, "max_df", float, min_val=0.0, max_val=1.0)
+
     def fit(self, raw_documents, y=None):
         """Learn a vocabulary dictionary of all tokens in the raw documents.
 

From 83233c14e21e97c7ffd3c63270dfcfac870e489d Mon Sep 17 00:00:00 2001
From: Alek Lefebvre <info@aleklefebvre.ca>
Date: Mon, 20 Sep 2021 10:49:26 -0400
Subject: [PATCH 5/6] resolve conflicts

---
 doc/whats_new/v1.1.rst | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst
index 15c75b79db924..b871ad768dfdc 100644
--- a/doc/whats_new/v1.1.rst
+++ b/doc/whats_new/v1.1.rst
@@ -61,8 +61,6 @@ Changelog
   :pr:`20880` by :user:`Guillaume Lemaitre <glemaitre>`
   and :user:`András Simon <simonandras>`.
 
-<<<<<<< HEAD
-=======
 :mod:`sklearn.pipeline`
 .......................
 
@@ -70,7 +68,6 @@ Changelog
   Setting a transformer to "passthrough" will pass the features unchanged.
   :pr:`20860` by :user:`Shubhraneel Pal <shubhraneel>`.
 
->>>>>>> main
 Code and Documentation Contributors
 -----------------------------------
 

From caa022b2c01fb56c116a8bcd0b0e045e8aeea535 Mon Sep 17 00:00:00 2001
From: Alek Lefebvre <info@aleklefebvre.ca>
Date: Fri, 1 Oct 2021 11:06:53 -0400
Subject: [PATCH 6/6] improve validation and test coverage

---
 sklearn/feature_extraction/tests/test_text.py | 23 +++++++++++++-----
 sklearn/feature_extraction/text.py            | 24 +++++++++----------
 2 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
index 34c40a5a20ec5..6abd731b4559a 100644
--- a/sklearn/feature_extraction/tests/test_text.py
+++ b/sklearn/feature_extraction/tests/test_text.py
@@ -833,16 +833,27 @@ def test_vectorizer_min_df():
 
 
 @pytest.mark.parametrize(
-    "min_df, max_df, message",
+    "params, err_type, message",
     (
-        (None, 2.0, "max_df == 2.0, must be <= 1.0."),
-        (1.5, None, "min_df == 1.5, must be <= 1.0."),
+        ({"max_df": 2.0}, ValueError, "max_df == 2.0, must be <= 1.0."),
+        ({"min_df": 1.5}, ValueError, "min_df == 1.5, must be <= 1.0."),
+        ({"max_df": -2}, ValueError, "max_df == -2, must be >= 0."),
+        ({"min_df": -10}, ValueError, "min_df == -10, must be >= 0."),
+        ({"min_df": 3, "max_df": 2.0}, ValueError, "max_df == 2.0, must be <= 1.0."),
+        ({"min_df": 1.5, "max_df": 50}, ValueError, "min_df == 1.5, must be <= 1.0."),
+        ({"max_features": -10}, ValueError, "max_features == -10, must be >= 0."),
+        (
+            {"max_features": 3.5},
+            TypeError,
+            "max_features must be an instance of <class 'numbers.Integral'>, not <class"
+            " 'float'>",
+        ),
     ),
 )
-def test_vectorizer_max_df_unwanted_float(min_df, max_df, message):
-    with pytest.raises(ValueError, match=message):
+def test_vectorizer_params_validation(params, err_type, message):
+    with pytest.raises(err_type, match=message):
         test_data = ["abc", "dea", "eat"]
-        vect = CountVectorizer(analyzer="char", min_df=min_df, max_df=max_df)
+        vect = CountVectorizer(**params, analyzer="char")
         vect.fit(test_data)
 
 
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 644fc050aabcb..2163a2c7fc77a 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -1262,19 +1262,17 @@ def _validate_params(self):
         super()._validate_params()
 
         if self.max_features is not None:
-            check_scalar(self.max_features, "max_features", int, min_val=0)
-
-        if self.min_df is not None:
-            try:
-                check_scalar(self.min_df, "min_df", int, min_val=0)
-            except TypeError:
-                check_scalar(self.min_df, "min_df", float, min_val=0.0, max_val=1.0)
-
-        if self.max_df is not None:
-            try:
-                check_scalar(self.max_df, "max_df", int, min_val=0)
-            except TypeError:
-                check_scalar(self.max_df, "max_df", float, min_val=0.0, max_val=1.0)
+            check_scalar(self.max_features, "max_features", numbers.Integral, min_val=0)
+
+        if isinstance(self.min_df, numbers.Integral):
+            check_scalar(self.min_df, "min_df", numbers.Integral, min_val=0)
+        else:
+            check_scalar(self.min_df, "min_df", numbers.Real, min_val=0.0, max_val=1.0)
+
+        if isinstance(self.max_df, numbers.Integral):
+            check_scalar(self.max_df, "max_df", numbers.Integral, min_val=0)
+        else:
+            check_scalar(self.max_df, "max_df", numbers.Real, min_val=0.0, max_val=1.0)
 
     def fit(self, raw_documents, y=None):
         """Learn a vocabulary dictionary of all tokens in the raw documents.