From 44238a8e1e31969d65e2759f8088b025dbb40c44 Mon Sep 17 00:00:00 2001 From: rprkh Date: Thu, 26 Jan 2023 20:47:19 +0530 Subject: [PATCH 1/2] max_df range --- sklearn/feature_extraction/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index c03a4767a3330..ad83a134e28c8 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -996,8 +996,8 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. - If None, no stop words will be used. max_df can be set to a value - in the range [0.7, 1.0) to automatically detect and filter stop + If None, no stop words will be used. ``max_df`` can be set to a value + in the range [0.0, 1.0] to automatically detect and filter stop words based on intra corpus document frequency of terms. token_pattern : str or None, default=r"(?u)\\b\\w\\w+\\b" Regular expression denoting what constitutes a "token", only used From a96b911105af5ae45276b60dd814091b51e2da5b Mon Sep 17 00:00:00 2001 From: Rahil Parikh Date: Thu, 9 Feb 2023 18:01:34 +0530 Subject: [PATCH 2/2] add suggestions --- sklearn/feature_extraction/text.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index ad83a134e28c8..72f836c25839f 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -996,9 +996,9 @@ class CountVectorizer(_VectorizerMixin, BaseEstimator): will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. - If None, no stop words will be used. ``max_df`` can be set to a value - in the range [0.0, 1.0] to automatically detect and filter stop - words based on intra corpus document frequency of terms. + If None, no stop words will be used. In this case, setting ``max_df`` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms. token_pattern : str or None, default=r"(?u)\\b\\w\\w+\\b" Regular expression denoting what constitutes a "token", only used @@ -1833,9 +1833,9 @@ class TfidfVectorizer(CountVectorizer): will be removed from the resulting tokens. Only applies if ``analyzer == 'word'``. - If None, no stop words will be used. max_df can be set to a value - in the range [0.7, 1.0) to automatically detect and filter stop - words based on intra corpus document frequency of terms. + If None, no stop words will be used. In this case, setting ``max_df`` to a higher value, such as in the range (0.7, 1.0), can automatically detect and filter stop words based on intra corpus document frequency of terms. token_pattern : str, default=r"(?u)\\b\\w\\w+\\b" Regular expression denoting what constitutes a "token", only used