From 937666a4fd4bda4fc0b119b7446d30d28f57c9c7 Mon Sep 17 00:00:00 2001 From: Alek Lefebvre Date: Sun, 15 Aug 2021 14:05:00 -0400 Subject: [PATCH 1/6] Throw error when min_idf or max_idf is float and > 1 --- sklearn/feature_extraction/tests/test_text.py | 22 +++++++++++++++++++ sklearn/feature_extraction/text.py | 10 +++++++++ 2 files changed, 32 insertions(+) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 5b09325c61031..3cb20d6bb20f7 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -805,6 +805,28 @@ def test_vectorizer_min_df(): assert len(vect.stop_words_) == 5 +def test_vectorizer_max_df_unwanted_float(): + message = ( + "max_df is out of range for a float value. Use an int or a float between 0.0" + " and 1.0." + ) + with pytest.raises(ValueError, match=message): + test_data = ["abc", "dea", "eat"] + vect = CountVectorizer(analyzer="char", max_df=2.0) + vect.fit(test_data) + + +def test_vectorizer_min_df_unwanted_float(): + message = ( + "min_df is out of range for a float value. Use an int or a float between 0.0" + " and 1.0." + ) + with pytest.raises(ValueError, match=message): + test_data = ["abc", "dea", "eat"] + vect = CountVectorizer(analyzer="char", min_df=1.5) + vect.fit(test_data) + + def test_count_binary_occurrences(): # by default multiple occurrences are counted as longs test_data = ["aaabc", "abbde"] diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 24d9b07023fd3..d98386d03e209 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1103,6 +1103,16 @@ def __init__( self.min_df = min_df if max_df < 0 or min_df < 0: raise ValueError("negative value for max_df or min_df") + if not isinstance(min_df, numbers.Integral) and min_df > 1.0: + raise ValueError( + "min_df is out of range for a float value. Use an int or a float" + " between 0.0 and 1.0." + ) + if not isinstance(max_df, numbers.Integral) and max_df > 1.0: + raise ValueError( + "max_df is out of range for a float value. Use an int or a float" + " between 0.0 and 1.0." + ) self.max_features = max_features if max_features is not None: if not isinstance(max_features, numbers.Integral) or max_features <= 0: From 5b004f5eb61f99df4a61e308a6955ccf244febf2 Mon Sep 17 00:00:00 2001 From: Alek Lefebvre Date: Sun, 15 Aug 2021 14:21:18 -0400 Subject: [PATCH 2/6] changelog --- doc/whats_new/v1.0.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b4fcbee992383..3a6d6124feff4 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -366,6 +366,10 @@ Changelog error with unsupported value type. :pr:`19520` by :user:`Jeff Zhao `. +- |Fix| Fixed a bug in :class:`feature_extraction.CountVectorizer` by raising an + error when min_idf or max_idf is a float and > 1.0. + :pr:`20752` by :user:`Alek Lefebvre `. + :mod:`sklearn.feature_selection` ................................ From 06f70e9c927b186fdcde265e476aa5751bbce7e7 Mon Sep 17 00:00:00 2001 From: Alek Lefebvre Date: Sat, 11 Sep 2021 06:57:39 -0400 Subject: [PATCH 3/6] removed debug file --- debug.py | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 debug.py diff --git a/debug.py b/debug.py deleted file mode 100644 index 52c2515bd3018..0000000000000 --- a/debug.py +++ /dev/null @@ -1,18 +0,0 @@ -from sklearn.feature_extraction.text import CountVectorizer - -corpus = [ - "This is the first document.", - "This document is the second document.", - "And this is the third one.", - "Is this the first document?", -] - -vectorizer = CountVectorizer(analyzer="word", min_df=2, max_df=3) -X = vectorizer.fit_transform(corpus) -print(vectorizer.get_feature_names()) -# result ['document', 'first'] - -vectorizer = CountVectorizer(analyzer="word", min_df=2, max_df=3.0) -X = vectorizer.fit_transform(corpus) -print(vectorizer.get_feature_names()) -# result ['document', 'first', 'is', 'the', 'this'] From 0a9640c4c71ea450f2a28e688cf937108f437525 Mon Sep 17 00:00:00 2001 From: Alek Lefebvre Date: Sun, 19 Sep 2021 11:03:13 -0400 Subject: [PATCH 4/6] refactor validation and parametrize tests --- doc/whats_new/v1.1.rst | 2 +- sklearn/feature_extraction/tests/test_text.py | 26 +++++-------- sklearn/feature_extraction/text.py | 39 ++++++++++--------- 3 files changed, 30 insertions(+), 37 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index ed1f7f42b43e0..e28f055f75db6 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -43,7 +43,7 @@ Changelog - |Fix| Fixed a bug in :class:`feature_extraction.CountVectorizer` and :class:`feature_extraction.TfidfVectorizer` by raising an - error when min_idf or max_idf is a float and > 1.0. + error when 'min_idf' or 'max_idf' are floating-point numbers greater than 1. :pr:`20752` by :user:`Alek Lefebvre `. :mod:`sklearn.utils` diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index dd651bb0174b1..e1d42592f0c03 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -832,25 +832,17 @@ def test_vectorizer_min_df(): assert len(vect.stop_words_) == 5 -def test_vectorizer_max_df_unwanted_float(): - message = ( - "max_df is out of range for a float value. Use an int or a float between 0.0" - " and 1.0." - ) - with pytest.raises(ValueError, match=message): - test_data = ["abc", "dea", "eat"] - vect = CountVectorizer(analyzer="char", max_df=2.0) - vect.fit(test_data) - - -def test_vectorizer_min_df_unwanted_float(): - message = ( - "min_df is out of range for a float value. Use an int or a float between 0.0" - " and 1.0." - ) +@pytest.mark.parametrize( + "min_df, max_df, message", + ( + (None, 2.0, "max_df == 2.0, must be <= 1.0."), + (1.5, None, "min_df == 1.5, must be <= 1.0."), + ), +) +def test_vectorizer_max_df_unwanted_float(min_df, max_df, message): with pytest.raises(ValueError, match=message): test_data = ["abc", "dea", "eat"] - vect = CountVectorizer(analyzer="char", min_df=1.5) + vect = CountVectorizer(analyzer="char", min_df=min_df, max_df=max_df) vect.fit(test_data) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 02f248ea2704f..557280acb9111 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -29,7 +29,7 @@ from ..preprocessing import normalize from ._hash import FeatureHasher from ._stop_words import ENGLISH_STOP_WORDS -from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES +from ..utils.validation import check_is_fitted, check_array, FLOAT_DTYPES, check_scalar from ..utils.deprecation import deprecated from ..utils import _IS_32BIT from ..utils.fixes import _astype_copy_false @@ -1103,25 +1103,7 @@ def __init__( self.stop_words = stop_words self.max_df = max_df self.min_df = min_df - if max_df < 0 or min_df < 0: - raise ValueError("negative value for max_df or min_df") - if not isinstance(min_df, numbers.Integral) and min_df > 1.0: - raise ValueError( - "min_df is out of range for a float value. Use an int or a float" - " between 0.0 and 1.0." - ) - if not isinstance(max_df, numbers.Integral) and max_df > 1.0: - raise ValueError( - "max_df is out of range for a float value. Use an int or a float" - " between 0.0 and 1.0." - ) self.max_features = max_features - if max_features is not None: - if not isinstance(max_features, numbers.Integral) or max_features <= 0: - raise ValueError( - "max_features=%r, neither a positive integer nor None" - % max_features - ) self.ngram_range = ngram_range self.vocabulary = vocabulary self.binary = binary @@ -1258,6 +1240,25 @@ def _count_vocab(self, raw_documents, fixed_vocab): X.sort_indices() return vocabulary, X + def _validate_params(self): + """Validation of min_df, max_df and max_features""" + super()._validate_params() + + if self.max_features is not None: + check_scalar(self.max_features, "max_features", int, min_val=0) + + if self.min_df is not None: + try: + check_scalar(self.min_df, "min_df", int, min_val=0) + except TypeError: + check_scalar(self.min_df, "min_df", float, min_val=0.0, max_val=1.0) + + if self.max_df is not None: + try: + check_scalar(self.max_df, "max_df", int, min_val=0) + except TypeError: + check_scalar(self.max_df, "max_df", float, min_val=0.0, max_val=1.0) + def fit(self, raw_documents, y=None): """Learn a vocabulary dictionary of all tokens in the raw documents. From 83233c14e21e97c7ffd3c63270dfcfac870e489d Mon Sep 17 00:00:00 2001 From: Alek Lefebvre Date: Mon, 20 Sep 2021 10:49:26 -0400 Subject: [PATCH 5/6] resolve conflicts --- doc/whats_new/v1.1.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 15c75b79db924..b871ad768dfdc 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -61,8 +61,6 @@ Changelog :pr:`20880` by :user:`Guillaume Lemaitre ` and :user:`AndrĂ¡s Simon `. -<<<<<<< HEAD -======= :mod:`sklearn.pipeline` ....................... @@ -70,7 +68,6 @@ Changelog Setting a transformer to "passthrough" will pass the features unchanged. :pr:`20860` by :user:`Shubhraneel Pal `. ->>>>>>> main Code and Documentation Contributors ----------------------------------- From caa022b2c01fb56c116a8bcd0b0e045e8aeea535 Mon Sep 17 00:00:00 2001 From: Alek Lefebvre Date: Fri, 1 Oct 2021 11:06:53 -0400 Subject: [PATCH 6/6] improve validation and test coverage --- sklearn/feature_extraction/tests/test_text.py | 23 +++++++++++++----- sklearn/feature_extraction/text.py | 24 +++++++++---------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 34c40a5a20ec5..6abd731b4559a 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -833,16 +833,27 @@ def test_vectorizer_min_df(): @pytest.mark.parametrize( - "min_df, max_df, message", + "params, err_type, message", ( - (None, 2.0, "max_df == 2.0, must be <= 1.0."), - (1.5, None, "min_df == 1.5, must be <= 1.0."), + ({"max_df": 2.0}, ValueError, "max_df == 2.0, must be <= 1.0."), + ({"min_df": 1.5}, ValueError, "min_df == 1.5, must be <= 1.0."), + ({"max_df": -2}, ValueError, "max_df == -2, must be >= 0."), + ({"min_df": -10}, ValueError, "min_df == -10, must be >= 0."), + ({"min_df": 3, "max_df": 2.0}, ValueError, "max_df == 2.0, must be <= 1.0."), + ({"min_df": 1.5, "max_df": 50}, ValueError, "min_df == 1.5, must be <= 1.0."), + ({"max_features": -10}, ValueError, "max_features == -10, must be >= 0."), + ( + {"max_features": 3.5}, + TypeError, + "max_features must be an instance of , not ", + ), ), ) -def test_vectorizer_max_df_unwanted_float(min_df, max_df, message): - with pytest.raises(ValueError, match=message): +def test_vectorizer_params_validation(params, err_type, message): + with pytest.raises(err_type, match=message): test_data = ["abc", "dea", "eat"] - vect = CountVectorizer(analyzer="char", min_df=min_df, max_df=max_df) + vect = CountVectorizer(**params, analyzer="char") vect.fit(test_data) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 644fc050aabcb..2163a2c7fc77a 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -1262,19 +1262,17 @@ def _validate_params(self): super()._validate_params() if self.max_features is not None: - check_scalar(self.max_features, "max_features", int, min_val=0) - - if self.min_df is not None: - try: - check_scalar(self.min_df, "min_df", int, min_val=0) - except TypeError: - check_scalar(self.min_df, "min_df", float, min_val=0.0, max_val=1.0) - - if self.max_df is not None: - try: - check_scalar(self.max_df, "max_df", int, min_val=0) - except TypeError: - check_scalar(self.max_df, "max_df", float, min_val=0.0, max_val=1.0) + check_scalar(self.max_features, "max_features", numbers.Integral, min_val=0) + + if isinstance(self.min_df, numbers.Integral): + check_scalar(self.min_df, "min_df", numbers.Integral, min_val=0) + else: + check_scalar(self.min_df, "min_df", numbers.Real, min_val=0.0, max_val=1.0) + + if isinstance(self.max_df, numbers.Integral): + check_scalar(self.max_df, "max_df", numbers.Integral, min_val=0) + else: + check_scalar(self.max_df, "max_df", numbers.Real, min_val=0.0, max_val=1.0) def fit(self, raw_documents, y=None): """Learn a vocabulary dictionary of all tokens in the raw documents.