From 282a7ddb0fde8a7aa60df3ebf6c0b6f3c936cbad Mon Sep 17 00:00:00 2001 From: arka204 Date: Sun, 22 Mar 2020 11:06:32 -0700 Subject: [PATCH 01/20] Adding variable alphaCorrection to classes in naive_bayes.py. --- sklearn/naive_bayes.py | 47 ++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 6ef3895ffdb60..e69751f81ab2c 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -504,9 +504,13 @@ def _check_alpha(self): raise ValueError("alpha should be a scalar or a numpy array " "with shape [n_features]") if np.min(self.alpha) < _ALPHA_MIN: - warnings.warn('alpha too small will result in numeric errors, ' - 'setting alpha = %.1e' % _ALPHA_MIN) - return np.maximum(self.alpha, _ALPHA_MIN) + if self.alphaCorrection: + warnings.warn('alpha too small will result in numeric errors, ' + 'setting alpha = %.1e' % _ALPHA_MIN) + return np.maximum(self.alpha, _ALPHA_MIN) + else: + warnings.warn('alpha too small will result in numeric errors, ' + 'alphaCorrection was set to False, proceeding without changing alpha.') return self.alpha def partial_fit(self, X, y, classes=None, sample_weight=None): @@ -677,7 +681,11 @@ class MultinomialNB(_BaseDiscreteNB): ---------- alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter - (0 for no smoothing). + (set alpha=0 and alphaCorrection=False, for no smoothing). + + alphaCorrection : bool, default=True + In case alpha is too close to 0, it will set alpha to _ALPHA_MIN. + If false, warn user about potential errors and proceed with alpha unchanged. fit_prior : bool, default=True Whether to learn class prior probabilities or not. @@ -745,8 +753,9 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None): + def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, class_prior=None): self.alpha = alpha + self.alphaCorrection = alphaCorrection self.fit_prior = fit_prior self.class_prior = class_prior @@ -785,7 +794,12 @@ class ComplementNB(_BaseDiscreteNB): Parameters ---------- alpha : float, default=1.0 - Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing). + Additive (Laplace/Lidstone) smoothing parameter + (set alpha=0 and alphaCorrection=False, for no smoothing). + + alphaCorrection : bool, default=True + In case alpha is too close to 0, it will set alpha to _ALPHA_MIN. + If false, warn user about potential errors and proceed with alpha unchanged. fit_prior : bool, default=True Only used in edge case with a single class in the training set. @@ -847,9 +861,10 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None, + def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, class_prior=None, norm=False): self.alpha = alpha + self.alphaCorrection = alphaCorrection self.fit_prior = fit_prior self.class_prior = class_prior self.norm = norm @@ -897,7 +912,11 @@ class BernoulliNB(_BaseDiscreteNB): ---------- alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter - (0 for no smoothing). + (set alpha=0 and alphaCorrection=False, for no smoothing). + + alphaCorrection : bool, default=True + In case alpha is too close to 0, it will set alpha to _ALPHA_MIN. + If false, warn user about potential errors and proceed with alpha unchanged. 
binarize : float or None, default=0.0 Threshold for binarizing (mapping to booleans) of sample features. @@ -961,9 +980,10 @@ class BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ - def __init__(self, alpha=1.0, binarize=.0, fit_prior=True, + def __init__(self, alpha=1.0, alphaCorrection=True, binarize=.0, fit_prior=True, class_prior=None): self.alpha = alpha + self.alphaCorrection = alphaCorrection self.binarize = binarize self.fit_prior = fit_prior self.class_prior = class_prior @@ -1023,7 +1043,11 @@ class CategoricalNB(_BaseDiscreteNB): ---------- alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter - (0 for no smoothing). + (set alpha=0 and alphaCorrection=False, for no smoothing). + + alphaCorrection : bool, default=True + In case alpha is too close to 0, it will set alpha to _ALPHA_MIN. + If false, warn user about potential errors and proceed with alpha unchanged. fit_prior : bool, default=True Whether to learn class prior probabilities or not. @@ -1072,8 +1096,9 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - def __init__(self, alpha=1.0, fit_prior=True, class_prior=None): + def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, class_prior=None): self.alpha = alpha + self.alphaCorrection = alphaCorrection self.fit_prior = fit_prior self.class_prior = class_prior From d78e17bec15a71d88e0579c31611b80fcc46f3b8 Mon Sep 17 00:00:00 2001 From: arka204 Date: Sun, 22 Mar 2020 12:18:31 -0700 Subject: [PATCH 02/20] Splitting a few lines of code. --- sklearn/naive_bayes.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index e69751f81ab2c..440e8fcb6eb02 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -753,7 +753,8 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, class_prior=None): + def __init__(self, alpha=1.0, alphaCorrection=True, + fit_prior=True, class_prior=None): self.alpha = alpha self.alphaCorrection = alphaCorrection self.fit_prior = fit_prior @@ -861,8 +862,8 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, class_prior=None, - norm=False): + def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, + class_prior=None, norm=False): self.alpha = alpha self.alphaCorrection = alphaCorrection self.fit_prior = fit_prior self.class_prior = class_prior self.norm = norm @@ -980,8 +981,8 @@ class BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS).
""" - def __init__(self, alpha=1.0, alphaCorrection=True, binarize=.0, fit_prior=True, - class_prior=None): + def __init__(self, alpha=1.0, alphaCorrection=True, binarize=.0, + fit_prior=True, class_prior=None): self.alpha = alpha self.alphaCorrection = alphaCorrection self.binarize = binarize @@ -1096,7 +1097,8 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, class_prior=None): + def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, + class_prior=None): self.alpha = alpha self.alphaCorrection = alphaCorrection self.fit_prior = fit_prior From a4429bfedad92d518fc38f27d2a51814530c068a Mon Sep 17 00:00:00 2001 From: arka204 Date: Thu, 21 May 2020 21:11:40 +0200 Subject: [PATCH 03/20] Fixing problems and adding tests. --- sklearn/naive_bayes.py | 63 +++++++++++++++++-------------- sklearn/tests/test_naive_bayes.py | 11 ++++++ 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 440e8fcb6eb02..52a0e531685ad 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -504,13 +504,14 @@ def _check_alpha(self): raise ValueError("alpha should be a scalar or a numpy array " "with shape [n_features]") if np.min(self.alpha) < _ALPHA_MIN: - if self.alphaCorrection: + if self.force_alpha: warnings.warn('alpha too small will result in numeric errors, ' - 'setting alpha = %.1e' % _ALPHA_MIN) - return np.maximum(self.alpha, _ALPHA_MIN) + 'force_alpha was set to True, ' + 'proceeding without changing alpha.') else: warnings.warn('alpha too small will result in numeric errors, ' - 'alphaCorrection was set to False, proceeding without changing alpha.') + 'setting alpha = %.1e' % _ALPHA_MIN) + return np.maximum(self.alpha, _ALPHA_MIN) return self.alpha def partial_fit(self, X, y, classes=None, sample_weight=None): @@ -681,11 +682,12 @@ class MultinomialNB(_BaseDiscreteNB): ---------- alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter - (set alpha=0 and alphaCorrection=False, for no smoothing). - - alphaCorrection : bool, default=True - In case alpha is too close to 0, it will set alpha to _ALPHA_MIN. - If false, warn user about potential errors and proceed with alpha unchanged. + (set alpha=0 and force_alpha=True, for no smoothing). + + force_alpha : bool, default=False + If false and alpha is too close to 0, it will set alpha to _ALPHA_MIN. + If true, warn user about potential numeric errors + and proceed with alpha unchanged. fit_prior : bool, default=True Whether to learn class prior probabilities or not. @@ -753,10 +755,10 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - def __init__(self, alpha=1.0, alphaCorrection=True, + def __init__(self, alpha=1.0, force_alpha=False, fit_prior=True, class_prior=None): self.alpha = alpha - self.alphaCorrection = alphaCorrection + self.force_alpha = force_alpha self.fit_prior = fit_prior self.class_prior = class_prior @@ -796,11 +798,12 @@ class ComplementNB(_BaseDiscreteNB): ---------- alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter - (set alpha=0 and alphaCorrection=False, for no smoothing). + (set alpha=0 and force_alpha=True, for no smoothing). - alphaCorrection : bool, default=True - In case alpha is too close to 0, it will set alpha to _ALPHA_MIN. - If false, warn user about potential errors and proceed with alpha unchanged. 
+ force_alpha : bool, default=False + If false and alpha is too close to 0, it will set alpha to _ALPHA_MIN. + If true, warn user about potential numeric errors + and proceed with alpha unchanged. fit_prior : bool, default=True Only used in edge case with a single class in the training set. @@ -862,10 +865,10 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, + def __init__(self, alpha=1.0, force_alpha=False, fit_prior=True, class_prior=None, norm=False): self.alpha = alpha - self.alphaCorrection = alphaCorrection + self.force_alpha = force_alpha self.fit_prior = fit_prior self.class_prior = class_prior self.norm = norm @@ -913,11 +916,12 @@ class BernoulliNB(_BaseDiscreteNB): ---------- alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter - (set alpha=0 and alphaCorrection=False, for no smoothing). + (set alpha=0 and force_alpha=True, for no smoothing). - alphaCorrection : bool, default=True - In case alpha is too close to 0, it will set alpha to _ALPHA_MIN. - If false, warn user about potential errors and proceed with alpha unchanged. + force_alpha : bool, default=False + If false and alpha is too close to 0, it will set alpha to _ALPHA_MIN. + If true, warn user about potential numeric errors + and proceed with alpha unchanged. binarize : float or None, default=0.0 Threshold for binarizing (mapping to booleans) of sample features. @@ -981,10 +985,10 @@ class BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ - def __init__(self, alpha=1.0, alphaCorrection=True, binarize=.0, + def __init__(self, alpha=1.0, force_alpha=False, binarize=.0, fit_prior=True, class_prior=None): self.alpha = alpha - self.alphaCorrection = alphaCorrection + self.force_alpha = force_alpha self.binarize = binarize self.fit_prior = fit_prior self.class_prior = class_prior @@ -1044,11 +1048,12 @@ class CategoricalNB(_BaseDiscreteNB): ---------- alpha : float, default=1.0 Additive (Laplace/Lidstone) smoothing parameter - (set alpha=0 and alphaCorrection=False, for no smoothing). + (set alpha=0 and force_alpha=True, for no smoothing). - alphaCorrection : bool, default=True - In case alpha is too close to 0, it will set alpha to _ALPHA_MIN. - If false, warn user about potential errors and proceed with alpha unchanged. + force_alpha : bool, default=False + If false and alpha is too close to 0, it will set alpha to _ALPHA_MIN. + If true, warn user about potential numeric errors + and proceed with alpha unchanged. fit_prior : bool, default=True Whether to learn class prior probabilities or not. 
@@ -1097,10 +1102,10 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - def __init__(self, alpha=1.0, alphaCorrection=True, fit_prior=True, + def __init__(self, alpha=1.0, force_alpha=False, fit_prior=True, class_prior=None): self.alpha = alpha - self.alphaCorrection = alphaCorrection + self.force_alpha = force_alpha self.fit_prior = fit_prior self.class_prior = class_prior diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 1f0f9347a188c..f15e1c62ddcf5 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -753,6 +753,17 @@ def test_alpha(): X, y, classes=[0, 1]) +def test_check_alpha(): + # Test force_alpha if alpha < _ALPHA_MIN + _ALPHA_MIN = 1e-10 # const + b = BernoulliNB(alpha=0, force_alpha=True) + assert b._check_alpha() == 0 + b = BernoulliNB(alpha=0, force_alpha=False) + assert b._check_alpha() == _ALPHA_MIN + b = BernoulliNB(alpha=0) + assert b._check_alpha() == _ALPHA_MIN + + def test_alpha_vector(): X = np.array([[1, 0], [1, 1]]) y = np.array([0, 1]) From cf35eb1da2e29538b8400657b9777ba7a28b347c Mon Sep 17 00:00:00 2001 From: arka204 Date: Thu, 21 May 2020 21:52:45 +0200 Subject: [PATCH 04/20] Updating naive_bayes.py. --- sklearn/naive_bayes.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 52a0e531685ad..25fdb03c84e21 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -32,6 +32,7 @@ from .utils.multiclass import _check_partial_fit_first_call from .utils.validation import check_is_fitted, check_non_negative, column_or_1d from .utils.validation import _check_sample_weight +from .utils.validation import _deprecate_positional_args __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', 'CategoricalNB'] @@ -755,7 +756,8 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - def __init__(self, alpha=1.0, force_alpha=False, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, force_alpha=False, fit_prior=True, class_prior=None): self.alpha = alpha self.force_alpha = force_alpha @@ -865,7 +867,8 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - def __init__(self, alpha=1.0, force_alpha=False, fit_prior=True, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, force_alpha=False, fit_prior=True, class_prior=None, norm=False): self.alpha = alpha self.force_alpha = force_alpha @@ -985,7 +988,8 @@ class BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). """ - def __init__(self, alpha=1.0, force_alpha=False, binarize=.0, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, force_alpha=False, binarize=.0, fit_prior=True, class_prior=None): self.alpha = alpha self.force_alpha = force_alpha @@ -1102,7 +1106,8 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - def __init__(self, alpha=1.0, force_alpha=False, fit_prior=True, + @_deprecate_positional_args + def __init__(self, *, alpha=1.0, force_alpha=False, fit_prior=True, class_prior=None): self.alpha = alpha self.force_alpha = force_alpha self.fit_prior = fit_prior self.class_prior = class_prior From dcce4a8c2eaf69cc842f2e8ea6c9aa3436058d53 Mon Sep 17 00:00:00 2001 From: arka204 Date: Sun, 31 May 2020 20:56:49 +0200 Subject: [PATCH 05/20] Checking warnings in tests.
--- sklearn/tests/test_naive_bayes.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index f15e1c62ddcf5..95121aaf4015a 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -756,12 +756,20 @@ def test_alpha(): def test_check_alpha(): # Test force_alpha if alpha < _ALPHA_MIN _ALPHA_MIN = 1e-10 # const + msg1 = ('alpha too small will result in numeric errors, ' + 'force_alpha was set to True, ' + 'proceeding without changing alpha.') + msg2 = ('alpha too small will result in numeric errors, ' + 'setting alpha = %.1e' % _ALPHA_MIN) b = BernoulliNB(alpha=0, force_alpha=True) - assert b._check_alpha() == 0 + with pytest.warns(UserWarning, match=msg1): + assert b._check_alpha() == 0 b = BernoulliNB(alpha=0, force_alpha=False) - assert b._check_alpha() == _ALPHA_MIN + with pytest.warns(UserWarning, match=msg2): + assert b._check_alpha() == _ALPHA_MIN b = BernoulliNB(alpha=0) - assert b._check_alpha() == _ALPHA_MIN + with pytest.warns(UserWarning, match=msg2): + assert b._check_alpha() == _ALPHA_MIN def test_alpha_vector(): From be0ebfe5f3e5054b57ee1426f58ba3b7934b0e28 Mon Sep 17 00:00:00 2001 From: arka204 Date: Mon, 8 Jun 2020 23:59:45 +0200 Subject: [PATCH 06/20] Update v0.24.rst --- doc/whats_new/v0.24.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index db6959fcc164f..e503e28dd1ff8 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -76,6 +76,14 @@ Changelog attribute name/path or a `callable` for extracting feature importance from the estimator. :pr:`15361` by :user:`Venkatachalam N ` +:mod:`sklearn.naive_bayes` .......................... + +- |Fix| A new parameter `force_alpha` was added to :class:`BernoulliNB` and + :class:`MultinomialNB`, allowing users to set the parameter alpha to a very + small number, greater than or equal to 0, which was previously changed + to `_ALPHA_MIN` automatically. + :pr:`16747` by :user:`arka204` :mod:`sklearn.tree` ................... From 2d1609112023d728d6d3cfd1f565042838cd2f79 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Tue, 10 Nov 2020 17:14:32 +0800 Subject: [PATCH 07/20] Fix merge --- doc/whats_new/v0.24.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 81471d792cf2c..5875c03ceca3c 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -555,7 +555,8 @@ Changelog :class:`model_selection.RandomizedSearchCV` results in non-finite scores. :pr:`18266` by :user:`Subrat Sahu `, :user:`Nirvan ` and :user:`Arthur Book `. - - |Enhancement| :func:`model_selection.learning_curve` now accept fit_params + +- |Enhancement| :func:`model_selection.learning_curve` now accept fit_params to pass additional estimator parameters. :pr:`18595` by :user:`Amanda Dsouza `. @@ -607,7 +608,7 @@ Changelog :class:`MultinomialNB`, allowing users to set the parameter alpha to a very small number, greater than or equal to 0, which was previously changed to `_ALPHA_MIN` automatically. - :pr:`16747` by :user:`arka204` + :pr:`16747`, :pr:`18805` by :user:`arka204` and :user:`hongshaoyang`. :mod:`sklearn.neighbors` ........................
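At this point in the series the semantics are settled: a smoothing value below _ALPHA_MIN is clamped with a warning unless force_alpha=True, in which case only the warning is emitted. A minimal standalone sketch of that logic follows; check_alpha is a hypothetical free function mirroring _BaseDiscreteNB._check_alpha for a scalar alpha, not part of the patch series, and _ALPHA_MIN = 1e-10 is the value assumed in test_check_alpha above.

import warnings

import numpy as np

_ALPHA_MIN = 1e-10  # same constant test_check_alpha above assumes


def check_alpha(alpha, force_alpha=False):
    # Sketch of the patched _check_alpha: clamp a too-small alpha
    # unless the caller explicitly opts out with force_alpha=True.
    if np.min(alpha) < _ALPHA_MIN:
        if force_alpha:
            warnings.warn('alpha too small will result in numeric errors, '
                          'force_alpha was set to True, '
                          'proceeding without changing alpha.')
        else:
            warnings.warn('alpha too small will result in numeric errors, '
                          'setting alpha = %.1e' % _ALPHA_MIN)
            return np.maximum(alpha, _ALPHA_MIN)
    return alpha

Here check_alpha(0, force_alpha=True) warns and returns 0, while check_alpha(0) warns and returns 1e-10, matching the assertions in test_check_alpha.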
From a8209e00eeb5ef99be0f630e9800000666e986db Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Sun, 20 Dec 2020 16:58:41 +0800 Subject: [PATCH 08/20] Move whatsnew --- doc/whats_new/v0.24.rst | 6 ------ doc/whats_new/v1.0.rst | 9 ++++++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v0.24.rst b/doc/whats_new/v0.24.rst index 2b5817367c73b..7197b74b94faa 100644 --- a/doc/whats_new/v0.24.rst +++ b/doc/whats_new/v0.24.rst @@ -655,12 +655,6 @@ Changelog and will be removed in v1.1 (renaming of 0.26). :pr:`17427` by :user:`Juan Carlos Alfaro Jiménez `. -- |Fix| A new parameter `force_alpha` was added to :class:`BernoulliNB` and - :class:`MultinomialNB`, allowing users to set the parameter alpha to a very - small number, greater than or equal to 0, which was previously changed - to `_ALPHA_MIN` automatically. - :pr:`16747`, :pr:`18805` by :user:`arka204` and :user:`hongshaoyang`. :mod:`sklearn.neighbors` ........................ diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 211f1e4049d65..26b0358ffe4a0 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -44,7 +44,14 @@ Changelog :pr:`123456` by :user:`Joe Bloggs `. where 123456 is the *pull request* number, not the issue number. - +:mod:`sklearn.naive_bayes` .......................... + +- |Fix| A new parameter `force_alpha` was added to :class:`BernoulliNB` and + :class:`MultinomialNB`, allowing users to set the parameter alpha to a very + small number, greater than or equal to 0, which was previously changed + to `_ALPHA_MIN` automatically. + :pr:`16747`, :pr:`18805` by :user:`arka204` and :user:`hongshaoyang`. Code and Documentation Contributors ----------------------------------- From aa1d8de2431d15a66af0b751956680f529216498 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Mon, 8 Feb 2021 21:05:43 +0800 Subject: [PATCH 09/20] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- sklearn/naive_bayes.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index dd6433c9567f6..e388fcf4ba25f 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -711,9 +711,11 @@ class MultinomialNB(_BaseDiscreteNB): (set alpha=0 and force_alpha=True, for no smoothing). force_alpha : bool, default=False - If false and alpha is too close to 0, it will set alpha to _ALPHA_MIN. - If true, warn user about potential numeric errors + If False and alpha is too close to 0, it will set alpha to _ALPHA_MIN. + If True, warn user about potential numeric errors and proceed with alpha unchanged. + + .. versionadded:: 1.0 fit_prior : bool, default=True Whether to learn class prior probabilities or not. @@ -838,10 +840,12 @@ class ComplementNB(_BaseDiscreteNB): (set alpha=0 and force_alpha=True, for no smoothing). force_alpha : bool, default=False - If false and alpha is too close to 0, it will set alpha to _ALPHA_MIN. - If true, warn user about potential numeric errors + If False and alpha is too close to 0, it will set alpha to `_ALPHA_MIN`. + If True, warn user about potential numeric errors and proceed with alpha unchanged. + .. versionadded:: 1.0 + fit_prior : bool, default=True Only used in edge case with a single class in the training set. @@ -973,10 +977,12 @@ class BernoulliNB(_BaseDiscreteNB): (set alpha=0 and force_alpha=True, for no smoothing). force_alpha : bool, default=False - If false and alpha is too close to 0, it will set alpha to _ALPHA_MIN.
- If true, warn user about potential numeric errors + If False and alpha is too close to 0, it will set alpha to `_ALPHA_MIN`. + If True, warn user about potential numeric errors and proceed with alpha unchanged. + .. versionadded:: 1.0 + binarize : float or None, default=0.0 Threshold for binarizing (mapping to booleans) of sample features. If None, input is presumed to already consist of binary vectors. @@ -1114,8 +1120,12 @@ class CategoricalNB(_BaseDiscreteNB): (set alpha=0 and force_alpha=True, for no smoothing). force_alpha : bool, default=False - If false and alpha is too close to 0, it will set alpha to _ALPHA_MIN. - If true, warn user about potential numeric errors + force_alpha : bool, default=False + If False and alpha is too close to 0, it will set alpha to `_ALPHA_MIN`. + If True, warn user about potential numeric errors + and proceed with alpha unchanged. + + .. versionadded:: 1.0 and proceed with alpha unchanged. fit_prior : bool, default=True From d8f784ebe33a111264ab2e9572d38999264b2258 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Tue, 9 Feb 2021 21:24:42 +0800 Subject: [PATCH 10/20] Remove extra line --- sklearn/naive_bayes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index e388fcf4ba25f..27c6faf327ed7 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -1119,7 +1119,6 @@ class CategoricalNB(_BaseDiscreteNB): Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing). - force_alpha : bool, default=False force_alpha : bool, default=False If False and alpha is too close to 0, it will set alpha to `_ALPHA_MIN`. If True, warn user about potential numeric errors From a3897f709142a23b4c42d1c5a016664c85160f57 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Tue, 9 Feb 2021 22:04:17 +0800 Subject: [PATCH 11/20] Flake8 --- sklearn/naive_bayes.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 27c6faf327ed7..cc2c7768f8c99 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -714,7 +714,7 @@ class MultinomialNB(_BaseDiscreteNB): If False and alpha is too close to 0, it will set alpha to _ALPHA_MIN. If True, warn user about potential numeric errors and proceed with alpha unchanged. - + .. versionadded:: 1.0 fit_prior : bool, default=True @@ -840,8 +840,8 @@ class ComplementNB(_BaseDiscreteNB): (set alpha=0 and force_alpha=True, for no smoothing). force_alpha : bool, default=False - If False and alpha is too close to 0, it will set alpha to `_ALPHA_MIN`. - If True, warn user about potential numeric errors + If False and alpha is too close to 0, it will set alpha to + `_ALPHA_MIN`. If True, warn user about potential numeric errors and proceed with alpha unchanged. .. versionadded:: 1.0 @@ -977,8 +977,8 @@ class BernoulliNB(_BaseDiscreteNB): (set alpha=0 and force_alpha=True, for no smoothing). force_alpha : bool, default=False - If False and alpha is too close to 0, it will set alpha to `_ALPHA_MIN`. - If True, warn user about potential numeric errors + If False and alpha is too close to 0, it will set alpha to + `_ALPHA_MIN`. If True, warn user about potential numeric errors and proceed with alpha unchanged. .. versionadded:: 1.0 @@ -1120,8 +1120,8 @@ class CategoricalNB(_BaseDiscreteNB): (set alpha=0 and force_alpha=True, for no smoothing). force_alpha : bool, default=False - If False and alpha is too close to 0, it will set alpha to `_ALPHA_MIN`. 
- If True, warn user about potential numeric errors + If False and alpha is too close to 0, it will set alpha to + `_ALPHA_MIN`. If True, warn user about potential numeric errors and proceed with alpha unchanged. .. versionadded:: 1.0 From 23c68dd4e13f417d2c32142fa1b2ecc9168cfb73 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Tue, 9 Feb 2021 22:11:39 +0800 Subject: [PATCH 12/20] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- sklearn/naive_bayes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index cc2c7768f8c99..8555dcbb1ff98 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -711,8 +711,8 @@ class MultinomialNB(_BaseDiscreteNB): (set alpha=0 and force_alpha=True, for no smoothing). force_alpha : bool, default=False - If False and alpha is too close to 0, it will set alpha to _ALPHA_MIN. - If True, warn user about potential numeric errors + If False and alpha is too close to 0, it will set alpha to + `_ALPHA_MIN`. If True, warn user about potential numeric errors and proceed with alpha unchanged. .. versionadded:: 1.0 From 1d01c6cbfe38dc9b2203484cf9a89be20386d653 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Sat, 29 May 2021 20:31:17 +0800 Subject: [PATCH 13/20] Fix merge --- sklearn/naive_bayes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index ecd7c50280efa..b61328caef7cf 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -1145,7 +1145,6 @@ class CategoricalNB(_BaseDiscreteNB): and proceed with alpha unchanged. .. versionadded:: 1.0 - and proceed with alpha unchanged. fit_prior : bool, default=True Whether to learn class prior probabilities or not. From aa1d8de2431d15a66af0b751956680f529216498 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Sat, 29 May 2021 20:47:08 +0800 Subject: [PATCH 14/20] use assert_warns_message --- sklearn/tests/test_naive_bayes.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index b8c12f9037328..ad994869c18e8 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -12,6 +12,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal +from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings from sklearn.naive_bayes import GaussianNB, BernoulliNB @@ -886,14 +887,14 @@ def test_check_alpha(): msg2 = ('alpha too small will result in numeric errors, ' 'setting alpha = %.1e' % _ALPHA_MIN) b = BernoulliNB(alpha=0, force_alpha=True) - with pytest.warns(UserWarning, match=msg1): - assert b._check_alpha() == 0 + assert_warns_message(UserWarning, msg1, b._check_alpha) + assert b._check_alpha() == 0 b = BernoulliNB(alpha=0, force_alpha=False) - with pytest.warns(UserWarning, match=msg2): - assert b._check_alpha() == _ALPHA_MIN + assert_warns_message(UserWarning, msg2, b._check_alpha) + assert b._check_alpha() == _ALPHA_MIN b = BernoulliNB(alpha=0) - with pytest.warns(UserWarning, match=msg2): - assert b._check_alpha() == _ALPHA_MIN + assert_warns_message(UserWarning, msg2, b._check_alpha) + assert b._check_alpha() == _ALPHA_MIN def test_alpha_vector(): From 203af9e2247952d0006abbe8c2952a5d0619b478 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Wed, 9 Jun 2021 16:23:59 
+0800 Subject: [PATCH 15/20] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- sklearn/naive_bayes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index b61328caef7cf..ce92367458027 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -527,9 +527,9 @@ def _check_alpha(self): "with shape [n_features]") if np.min(self.alpha) < _ALPHA_MIN: if self.force_alpha: - warnings.warn('alpha too small will result in numeric errors, ' - 'force_alpha was set to True, ' - 'proceeding without changing alpha.') + warnings.warn('alpha too small will result in numeric errors. ' + 'Proceeding with alpha = %.1e, as ' + 'force_alpha was set to True.' % alpha) else: warnings.warn('alpha too small will result in numeric errors, ' 'setting alpha = %.1e' % _ALPHA_MIN) From 91127bcdd3193a87c821ae3b865abeb8da9176b6 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Wed, 9 Jun 2021 16:28:19 +0800 Subject: [PATCH 16/20] Fix wrong variable name --- sklearn/naive_bayes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index ce92367458027..c0765d8f18413 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -529,7 +529,7 @@ def _check_alpha(self): if self.force_alpha: warnings.warn('alpha too small will result in numeric errors. ' 'Proceeding with alpha = %.1e, as ' - 'force_alpha was set to True.' % alpha) + 'force_alpha was set to True.' % self.alpha) else: warnings.warn('alpha too small will result in numeric errors, ' 'setting alpha = %.1e' % _ALPHA_MIN) From c4d073658e01acbd953fccc28538d4e01eaa9112 Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Wed, 9 Jun 2021 19:22:26 +0800 Subject: [PATCH 17/20] Fix test to use "with pytest.warns" instead of assert_warns_message --- sklearn/tests/test_naive_bayes.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index ad994869c18e8..6fc96c3b9fae5 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -12,7 +12,6 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_almost_equal -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings from sklearn.naive_bayes import GaussianNB, BernoulliNB @@ -881,20 +880,20 @@ def test_alpha(): def test_check_alpha(): # Test force_alpha if alpha < _ALPHA_MIN _ALPHA_MIN = 1e-10 # const - msg1 = ('alpha too small will result in numeric errors, ' - 'force_alpha was set to True, ' - 'proceeding without changing alpha.') + msg1 = ('alpha too small will result in numeric errors. 
' + 'Proceeding with alpha = .+, as ' + 'force_alpha was set to True.') msg2 = ('alpha too small will result in numeric errors, ' 'setting alpha = %.1e' % _ALPHA_MIN) b = BernoulliNB(alpha=0, force_alpha=True) - assert_warns_message(UserWarning, msg1, b._check_alpha) - assert b._check_alpha() == 0 + with pytest.warns(UserWarning, match=msg1): + assert b._check_alpha() == 0 b = BernoulliNB(alpha=0, force_alpha=False) - assert_warns_message(UserWarning, msg2, b._check_alpha) - assert b._check_alpha() == _ALPHA_MIN + with pytest.warns(UserWarning, match=msg2): + assert b._check_alpha() == _ALPHA_MIN b = BernoulliNB(alpha=0) - assert_warns_message(UserWarning, msg2, b._check_alpha) - assert b._check_alpha() == _ALPHA_MIN + with pytest.warns(UserWarning, match=msg2): + assert b._check_alpha() == _ALPHA_MIN def test_alpha_vector(): From e7a5f37906e8f9d4f0589684147c99b1687c83e4 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 17 Jun 2021 15:48:26 -0400 Subject: [PATCH 18/20] MAINT Adds target_version to black config (#20293) --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 613d53e25d295..b312612236080 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ requires = [ [tool.black] line-length = 88 +target_version = ['py37', 'py38', 'py39'] exclude = ''' /( \.eggs # exclude a few common directories in the From 98c0c126081a290cbbdb8290375f48a50f85856d Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Wed, 23 Jun 2021 18:04:56 +0800 Subject: [PATCH 19/20] Black formatting Following https://github.com/scikit-learn/scikit-learn/issues/20301#issue-925041705 --- sklearn/naive_bayes.py | 223 ++++++++++-------- sklearn/tests/test_naive_bayes.py | 364 ++++++++++++++++-------------- 2 files changed, 323 insertions(+), 264 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 996419bda9a5b..dd30914df9cfd 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -34,8 +34,13 @@ from .utils.validation import _check_sample_weight -__all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', - 'CategoricalNB'] +__all__ = [ + "BernoulliNB", + "GaussianNB", + "MultinomialNB", + "ComplementNB", + "CategoricalNB", +] class _BaseNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): @@ -220,8 +225,9 @@ def fit(self, X, y, sample_weight=None): self : object """ X, y = self._validate_data(X, y) - return self._partial_fit(X, y, np.unique(y), _refit=True, - sample_weight=sample_weight) + return self._partial_fit( + X, y, np.unique(y), _refit=True, sample_weight=sample_weight + ) def _check_X(self, X): """Validate X, used only in predict* methods.""" @@ -274,8 +280,7 @@ def _update_mean_variance(n_past, mu, var, X, sample_weight=None): if sample_weight is not None: n_new = float(sample_weight.sum()) new_mu = np.average(X, axis=0, weights=sample_weight) - new_var = np.average((X - new_mu) ** 2, axis=0, - weights=sample_weight) + new_var = np.average((X - new_mu) ** 2, axis=0, weights=sample_weight) else: n_new = X.shape[0] new_var = np.var(X, axis=0) @@ -295,8 +300,7 @@ def _update_mean_variance(n_past, mu, var, X, sample_weight=None): # the sum-of-squared-differences (ssd) old_ssd = n_past * var new_ssd = n_new * new_var - total_ssd = (old_ssd + new_ssd + - (n_new * n_past / n_total) * (mu - new_mu) ** 2) + total_ssd = old_ssd + new_ssd + (n_new * n_past / n_total) * (mu - new_mu) ** 2 total_var = total_ssd / n_total return total_mu, total_var @@ -340,11 +344,11 @@ def partial_fit(self, X, y, 
classes=None, sample_weight=None): ------- self : object """ - return self._partial_fit(X, y, classes, _refit=False, - sample_weight=sample_weight) + return self._partial_fit( + X, y, classes, _refit=False, sample_weight=sample_weight + ) - def _partial_fit(self, X, y, classes=None, _refit=False, - sample_weight=None): + def _partial_fit(self, X, y, classes=None, _refit=False, sample_weight=None): """Actual implementation of Gaussian NB fitting. Parameters @@ -403,19 +407,19 @@ def _partial_fit(self, X, y, classes=None, _refit=False, priors = np.asarray(self.priors) # Check that the provide prior match the number of classes if len(priors) != n_classes: - raise ValueError('Number of priors must match number of' - ' classes.') + raise ValueError( + "Number of priors must match number of" " classes." + ) # Check that the sum is 1 if not np.isclose(priors.sum(), 1.0): - raise ValueError('The sum of the priors should be 1.') + raise ValueError("The sum of the priors should be 1.") # Check that the prior are non-negative if (priors < 0).any(): - raise ValueError('Priors must be non-negative.') + raise ValueError("Priors must be non-negative.") self.class_prior_ = priors else: # Initialize the priors to zeros for each class - self.class_prior_ = np.zeros(len(self.classes_), - dtype=np.float64) + self.class_prior_ = np.zeros(len(self.classes_), dtype=np.float64) else: if X.shape[1] != self.theta_.shape[1]: msg = "Number of features %d does not match previous data %d." @@ -429,9 +433,10 @@ def _partial_fit(self, X, y, classes=None, _refit=False, unique_y_in_classes = np.in1d(unique_y, classes) if not np.all(unique_y_in_classes): - raise ValueError("The target label(s) %s in y do not exist in the " - "initial classes %s" % - (unique_y[~unique_y_in_classes], classes)) + raise ValueError( + "The target label(s) %s in y do not exist in the " + "initial classes %s" % (unique_y[~unique_y_in_classes], classes) + ) for y_i in unique_y: i = classes.searchsorted(y_i) @@ -445,8 +450,8 @@ def _partial_fit(self, X, y, classes=None, _refit=False, N_i = X_i.shape[0] new_theta, new_sigma = self._update_mean_variance( - self.class_count_[i], self.theta_[i, :], self.var_[i, :], - X_i, sw_i) + self.class_count_[i], self.theta_[i, :], self.var_[i, :], X_i, sw_i + ) self.theta_[i, :] = new_theta self.var_[i, :] = new_sigma @@ -465,9 +470,8 @@ def _joint_log_likelihood(self, X): joint_log_likelihood = [] for i in range(np.size(self.classes_)): jointi = np.log(self.class_prior_[i]) - n_ij = - 0.5 * np.sum(np.log(2. 
* np.pi * self.var_[i, :])) - n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / - (self.var_[i, :]), 1) + n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :])) + n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1) joint_log_likelihood.append(jointi + n_ij) joint_log_likelihood = np.array(joint_log_likelihood).T @@ -496,18 +500,17 @@ class _BaseDiscreteNB(_BaseNB): def _check_X(self, X): """Validate X, used only in predict* methods.""" - return self._validate_data(X, accept_sparse='csr', reset=False) + return self._validate_data(X, accept_sparse="csr", reset=False) def _check_X_y(self, X, y, reset=True): """Validate X and y in fit methods.""" - return self._validate_data(X, y, accept_sparse='csr', reset=reset) + return self._validate_data(X, y, accept_sparse="csr", reset=reset) def _update_class_log_prior(self, class_prior=None): n_classes = len(self.classes_) if class_prior is not None: if len(class_prior) != n_classes: - raise ValueError("Number of priors must match number of" - " classes.") + raise ValueError("Number of priors must match number of" " classes.") self.class_log_prior_ = np.log(class_prior) elif self.fit_prior: with warnings.catch_warnings(): @@ -517,27 +520,34 @@ def _update_class_log_prior(self, class_prior=None): log_class_count = np.log(self.class_count_) # empirical prior, with sample_weight taken into account - self.class_log_prior_ = (log_class_count - - np.log(self.class_count_.sum())) + self.class_log_prior_ = log_class_count - np.log(self.class_count_.sum()) else: self.class_log_prior_ = np.full(n_classes, -np.log(n_classes)) def _check_alpha(self): if np.min(self.alpha) < 0: - raise ValueError('Smoothing parameter alpha = %.1e. ' - 'alpha should be > 0.' % np.min(self.alpha)) + raise ValueError( + "Smoothing parameter alpha = %.1e. " + "alpha should be > 0." % np.min(self.alpha) + ) if isinstance(self.alpha, np.ndarray): if not self.alpha.shape[0] == self.n_features_in_: - raise ValueError("alpha should be a scalar or a numpy array " - "with shape [n_features]") + raise ValueError( + "alpha should be a scalar or a numpy array " + "with shape [n_features]" + ) if np.min(self.alpha) < _ALPHA_MIN: if self.force_alpha: - warnings.warn('alpha too small will result in numeric errors. ' - 'Proceeding with alpha = %.1e, as ' - 'force_alpha was set to True.' % self.alpha) + warnings.warn( + "alpha too small will result in numeric errors. " + "Proceeding with alpha = %.1e, as " + "force_alpha was set to True." % self.alpha + ) else: - warnings.warn('alpha too small will result in numeric errors, ' - 'setting alpha = %.1e' % _ALPHA_MIN) + warnings.warn( + "alpha too small will result in numeric errors, " + "setting alpha = %.1e" % _ALPHA_MIN + ) return np.maximum(self.alpha, _ALPHA_MIN) return self.alpha @@ -591,7 +601,7 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): if Y.shape[1] == 1: if len(self.classes_) == 2: Y = np.concatenate((1 - Y, Y), axis=1) - else: # degenerate case: just one class + else: # degenerate case: just one class Y = np.ones_like(Y) if X.shape[0] != Y.shape[0]: @@ -649,7 +659,7 @@ def fit(self, X, y, sample_weight=None): if Y.shape[1] == 1: if len(self.classes_) == 2: Y = np.concatenate((1 - Y, Y), axis=1) - else: # degenerate case: just one class + else: # degenerate case: just one class Y = np.ones_like(Y) # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64. 
@@ -675,29 +685,36 @@ def fit(self, X, y, sample_weight=None): def _init_counters(self, n_classes, n_features): self.class_count_ = np.zeros(n_classes, dtype=np.float64) - self.feature_count_ = np.zeros((n_classes, n_features), - dtype=np.float64) + self.feature_count_ = np.zeros((n_classes, n_features), dtype=np.float64) # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute coef_ was deprecated in " - "version 0.24 and will be removed in 1.1 (renaming of 0.26).") + "version 0.24 and will be removed in 1.1 (renaming of 0.26)." + ) @property def coef_(self): - return (self.feature_log_prob_[1:] - if len(self.classes_) == 2 else self.feature_log_prob_) + return ( + self.feature_log_prob_[1:] + if len(self.classes_) == 2 + else self.feature_log_prob_ + ) # mypy error: Decorated property not supported @deprecated( # type: ignore "Attribute intercept_ was deprecated in " - "version 0.24 and will be removed in 1.1 (renaming of 0.26).") + "version 0.24 and will be removed in 1.1 (renaming of 0.26)." + ) @property def intercept_(self): - return (self.class_log_prior_[1:] - if len(self.classes_) == 2 else self.class_log_prior_) + return ( + self.class_log_prior_[1:] + if len(self.classes_) == 2 + else self.class_log_prior_ + ) def _more_tags(self): - return {'poor_score': True} + return {"poor_score": True} # TODO: Remove in 1.2 # mypy error: Decorated property not supported @@ -817,15 +834,16 @@ class MultinomialNB(_BaseDiscreteNB): https://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html """ - def __init__(self, *, alpha=1.0, force_alpha=False, - fit_prior=True, class_prior=None): + def __init__( + self, *, alpha=1.0, force_alpha=False, fit_prior=True, class_prior=None + ): self.alpha = alpha self.force_alpha = force_alpha self.fit_prior = fit_prior self.class_prior = class_prior def _more_tags(self): - return {'requires_positive_X': True} + return {"requires_positive_X": True} def _count(self, X, Y): """Count and smooth feature occurrences.""" @@ -838,13 +856,13 @@ def _update_feature_log_prob(self, alpha): smoothed_fc = self.feature_count_ + alpha smoothed_cc = smoothed_fc.sum(axis=1) - self.feature_log_prob_ = (np.log(smoothed_fc) - - np.log(smoothed_cc.reshape(-1, 1))) + self.feature_log_prob_ = np.log(smoothed_fc) - np.log( + smoothed_cc.reshape(-1, 1) + ) def _joint_log_likelihood(self, X): """Calculate the posterior log probability of the samples X""" - return (safe_sparse_dot(X, self.feature_log_prob_.T) + - self.class_log_prior_) + return safe_sparse_dot(X, self.feature_log_prob_.T) + self.class_log_prior_ class ComplementNB(_BaseDiscreteNB): @@ -956,8 +974,15 @@ class ComplementNB(_BaseDiscreteNB): https://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf """ - def __init__(self, *, alpha=1.0, force_alpha=False, fit_prior=True, - class_prior=None, norm=False): + def __init__( + self, + *, + alpha=1.0, + force_alpha=False, + fit_prior=True, + class_prior=None, + norm=False, + ): self.alpha = alpha self.force_alpha = force_alpha self.fit_prior = fit_prior @@ -965,7 +990,7 @@ def __init__(self, *, alpha=1.0, force_alpha=False, fit_prior=True, self.norm = norm def _more_tags(self): - return {'requires_positive_X': True} + return {"requires_positive_X": True} def _count(self, X, Y): """Count feature occurrences.""" @@ -1095,8 +1120,15 @@ class BernoulliNB(_BaseDiscreteNB): naive Bayes -- Which naive Bayes? 3rd Conf. on Email and Anti-Spam (CEAS). 
""" - def __init__(self, *, alpha=1.0, force_alpha=False, binarize=.0, - fit_prior=True, class_prior=None): + def __init__( + self, + *, + alpha=1.0, + force_alpha=False, + binarize=0.0, + fit_prior=True, + class_prior=None, + ): self.alpha = alpha self.force_alpha = force_alpha self.binarize = binarize @@ -1126,8 +1158,9 @@ def _update_feature_log_prob(self, alpha): smoothed_fc = self.feature_count_ + alpha smoothed_cc = self.class_count_ + alpha * 2 - self.feature_log_prob_ = (np.log(smoothed_fc) - - np.log(smoothed_cc.reshape(-1, 1))) + self.feature_log_prob_ = np.log(smoothed_fc) - np.log( + smoothed_cc.reshape(-1, 1) + ) def _joint_log_likelihood(self, X): """Calculate the posterior log probability of the samples X""" @@ -1135,8 +1168,10 @@ def _joint_log_likelihood(self, X): n_features_X = X.shape[1] if n_features_X != n_features: - raise ValueError("Expected input with %d features, got %d instead" - % (n_features, n_features_X)) + raise ValueError( + "Expected input with %d features, got %d instead" + % (n_features, n_features_X) + ) neg_prob = np.log(1 - np.exp(self.feature_log_prob_)) # Compute neg_prob · (1 - X).T as ∑neg_prob - X · neg_prob @@ -1242,9 +1277,15 @@ class CategoricalNB(_BaseDiscreteNB): [3] """ - def __init__(self, *, alpha=1.0, force_alpha=False, fit_prior=True, - class_prior=None, - min_categories=None): + def __init__( + self, + *, + alpha=1.0, + force_alpha=False, + fit_prior=True, + class_prior=None, + min_categories=None, + ): self.alpha = alpha self.force_alpha = force_alpha self.fit_prior = fit_prior @@ -1318,29 +1359,29 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): ------- self : object """ - return super().partial_fit(X, y, classes, - sample_weight=sample_weight) + return super().partial_fit(X, y, classes, sample_weight=sample_weight) def _more_tags(self): - return {'requires_positive_X': True} + return {"requires_positive_X": True} def _check_X(self, X): """Validate X, used only in predict* methods.""" - X = self._validate_data(X, dtype='int', accept_sparse=False, - force_all_finite=True, reset=False) + X = self._validate_data( + X, dtype="int", accept_sparse=False, force_all_finite=True, reset=False + ) check_non_negative(X, "CategoricalNB (input X)") return X def _check_X_y(self, X, y, reset=True): - X, y = self._validate_data(X, y, dtype='int', accept_sparse=False, - force_all_finite=True, reset=reset) + X, y = self._validate_data( + X, y, dtype="int", accept_sparse=False, force_all_finite=True, reset=reset + ) check_non_negative(X, "CategoricalNB (input X)") return X, y def _init_counters(self, n_classes, n_features): self.class_count_ = np.zeros(n_classes, dtype=np.float64) - self.category_count_ = [np.zeros((n_classes, 0)) - for _ in range(n_features)] + self.category_count_ = [np.zeros((n_classes, 0)) for _ in range(n_features)] @staticmethod def _validate_n_categories(X, min_categories): @@ -1353,9 +1394,7 @@ def _validate_n_categories(X, min_categories): f"'min_categories' should have integral type. Got " f"{min_categories_.dtype} instead." 
) - n_categories_ = np.maximum(n_categories_X, - min_categories_, - dtype=np.int64) + n_categories_ = np.maximum(n_categories_X, min_categories_, dtype=np.int64) if n_categories_.shape != n_categories_X.shape: raise ValueError( f"'min_categories' should have shape ({X.shape[1]}," @@ -1371,7 +1410,7 @@ def _update_cat_count_dims(cat_count, highest_feature): diff = highest_feature + 1 - cat_count.shape[1] if diff > 0: # we append a column full of zeros for each new category - return np.pad(cat_count, [(0, 0), (0, diff)], 'constant') + return np.pad(cat_count, [(0, 0), (0, diff)], "constant") return cat_count def _update_cat_count(X_feature, Y, cat_count, n_classes): @@ -1386,15 +1425,15 @@ def _update_cat_count(X_feature, Y, cat_count, n_classes): cat_count[j, indices] += counts[indices] self.class_count_ += Y.sum(axis=0) - self.n_categories_ = self._validate_n_categories( - X, self.min_categories) + self.n_categories_ = self._validate_n_categories(X, self.min_categories) for i in range(self.n_features_in_): X_feature = X[:, i] self.category_count_[i] = _update_cat_count_dims( - self.category_count_[i], self.n_categories_[i] - 1) - _update_cat_count(X_feature, Y, - self.category_count_[i], - self.class_count_.shape[0]) + self.category_count_[i], self.n_categories_[i] - 1 + ) + _update_cat_count( + X_feature, Y, self.category_count_[i], self.class_count_.shape[0] + ) def _update_feature_log_prob(self, alpha): feature_log_prob = [] @@ -1402,8 +1441,8 @@ def _update_feature_log_prob(self, alpha): smoothed_cat_count = self.category_count_[i] + alpha smoothed_class_count = smoothed_cat_count.sum(axis=1) feature_log_prob.append( - np.log(smoothed_cat_count) - - np.log(smoothed_class_count.reshape(-1, 1))) + np.log(smoothed_cat_count) - np.log(smoothed_class_count.reshape(-1, 1)) + ) self.feature_log_prob_ = feature_log_prob def _joint_log_likelihood(self, X): diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 6fc96c3b9fae5..649867b446798 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -18,8 +18,7 @@ from sklearn.naive_bayes import MultinomialNB, ComplementNB from sklearn.naive_bayes import CategoricalNB -DISCRETE_NAIVE_BAYES_CLASSES = [ - BernoulliNB, CategoricalNB, ComplementNB, MultinomialNB] +DISCRETE_NAIVE_BAYES_CLASSES = [BernoulliNB, CategoricalNB, ComplementNB, MultinomialNB] ALL_NAIVE_BAYES_CLASSES = DISCRETE_NAIVE_BAYES_CLASSES + [GaussianNB] @@ -55,8 +54,7 @@ def test_gnb(): # an Error # FIXME Remove this test once the more general partial_fit tests are merged with pytest.raises( - ValueError, - match="The target label.* in y do not exist in the initial classes" + ValueError, match="The target label.* in y do not exist in the initial classes" ): GaussianNB().partial_fit(X, y, classes=[0, 1]) @@ -73,15 +71,14 @@ def test_gnb_var(): def test_gnb_prior(): # Test whether class priors are properly set. clf = GaussianNB().fit(X, y) - assert_array_almost_equal(np.array([3, 3]) / 6.0, - clf.class_prior_, 8) + assert_array_almost_equal(np.array([3, 3]) / 6.0, clf.class_prior_, 8) clf = GaussianNB().fit(X1, y1) # Check that the class priors sum to 1 assert_array_almost_equal(clf.class_prior_.sum(), 1) def test_gnb_sample_weight(): - """Test whether sample weights are properly used in GNB. 
""" + """Test whether sample weights are properly used in GNB.""" # Sample weights all being 1 should not change results sw = np.ones(6) clf = GaussianNB().fit(X, y) @@ -114,9 +111,9 @@ def test_gnb_sample_weight(): def test_gnb_neg_priors(): """Test whether an error is raised in case of negative priors""" - clf = GaussianNB(priors=np.array([-1., 2.])) + clf = GaussianNB(priors=np.array([-1.0, 2.0])) - msg = 'Priors must be non-negative' + msg = "Priors must be non-negative" with pytest.raises(ValueError, match=msg): clf.fit(X, y) @@ -124,18 +121,31 @@ def test_gnb_neg_priors(): def test_gnb_priors(): """Test whether the class prior override is properly used""" clf = GaussianNB(priors=np.array([0.3, 0.7])).fit(X, y) - assert_array_almost_equal(clf.predict_proba([[-0.1, -0.1]]), - np.array([[0.825303662161683, - 0.174696337838317]]), 8) + assert_array_almost_equal( + clf.predict_proba([[-0.1, -0.1]]), + np.array([[0.825303662161683, 0.174696337838317]]), + 8, + ) assert_array_almost_equal(clf.class_prior_, np.array([0.3, 0.7])) def test_gnb_priors_sum_isclose(): # test whether the class prior sum is properly tested""" - X = np.array([[-1, -1], [-2, -1], [-3, -2], [-4, -5], [-5, -4], - [1, 1], [2, 1], [3, 2], [4, 4], [5, 5]]) - priors = np.array([0.08, 0.14, 0.03, 0.16, 0.11, 0.16, 0.07, 0.14, - 0.11, 0.0]) + X = np.array( + [ + [-1, -1], + [-2, -1], + [-3, -2], + [-4, -5], + [-5, -4], + [1, 1], + [2, 1], + [3, 2], + [4, 4], + [5, 5], + ] + ) + priors = np.array([0.08, 0.14, 0.03, 0.16, 0.11, 0.16, 0.07, 0.14, 0.11, 0.0]) Y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) clf = GaussianNB(priors=priors) # smoke test for issue #9633 @@ -143,20 +153,20 @@ def test_gnb_priors_sum_isclose(): def test_gnb_wrong_nb_priors(): - """ Test whether an error is raised if the number of prior is different + """Test whether an error is raised if the number of prior is different from the number of class""" - clf = GaussianNB(priors=np.array([.25, .25, .25, .25])) + clf = GaussianNB(priors=np.array([0.25, 0.25, 0.25, 0.25])) - msg = 'Number of priors must match number of classes' + msg = "Number of priors must match number of classes" with pytest.raises(ValueError, match=msg): clf.fit(X, y) def test_gnb_prior_greater_one(): """Test if an error is raised if the sum of prior greater than one""" - clf = GaussianNB(priors=np.array([2., 1.])) + clf = GaussianNB(priors=np.array([2.0, 1.0])) - msg = 'The sum of the priors should be 1' + msg = "The sum of the priors should be 1" with pytest.raises(ValueError, match=msg): clf.fit(X, y) @@ -169,14 +179,13 @@ def test_gnb_prior_large_bias(): def test_gnb_check_update_with_no_data(): - """ Test when the partial fit is called without any data""" + """Test when the partial fit is called without any data""" # Create an empty array prev_points = 100 - mean = 0. - var = 1. 
+ mean = 0.0 + var = 1.0 x_empty = np.empty((0, X.shape[1])) - tmean, tvar = GaussianNB._update_mean_variance(prev_points, mean, - var, x_empty) + tmean, tvar = GaussianNB._update_mean_variance(prev_points, mean, var, x_empty) assert tmean == mean assert tvar == var @@ -199,14 +208,13 @@ def test_gnb_naive_bayes_scale_invariance(): # Scaling the data should not change the prediction results iris = load_iris() X, y = iris.data, iris.target - labels = [GaussianNB().fit(f * X, y).predict(f * X) - for f in [1E-10, 1, 1E10]] + labels = [GaussianNB().fit(f * X, y).predict(f * X) for f in [1e-10, 1, 1e10]] assert_array_equal(labels[0], labels[1]) assert_array_equal(labels[1], labels[2]) # TODO: Remove in version 1.1 -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) +@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) def test_discretenb_deprecated_coef_intercept(DiscreteNaiveBayes): est = DiscreteNaiveBayes().fit(X2, y2) @@ -215,15 +223,16 @@ def test_discretenb_deprecated_coef_intercept(DiscreteNaiveBayes): hasattr(est, att) -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) +@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) def test_discretenb_prior(DiscreteNaiveBayes): # Test whether class priors are properly set. clf = DiscreteNaiveBayes().fit(X2, y2) - assert_array_almost_equal(np.log(np.array([2, 2, 2]) / 6.0), - clf.class_log_prior_, 8) + assert_array_almost_equal( + np.log(np.array([2, 2, 2]) / 6.0), clf.class_log_prior_, 8 + ) -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) +@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) def test_discretenb_partial_fit(DiscreteNaiveBayes): clf1 = DiscreteNaiveBayes() clf1.fit([[0, 1], [1, 0], [1, 1]], [0, 1, 1]) @@ -233,8 +242,7 @@ def test_discretenb_partial_fit(DiscreteNaiveBayes): assert_array_equal(clf1.class_count_, clf2.class_count_) if DiscreteNaiveBayes is CategoricalNB: for i in range(len(clf1.category_count_)): - assert_array_equal(clf1.category_count_[i], - clf2.category_count_[i]) + assert_array_equal(clf1.category_count_[i], clf2.category_count_[i]) else: assert_array_equal(clf1.feature_count_, clf2.feature_count_) @@ -249,10 +257,13 @@ def test_discretenb_partial_fit(DiscreteNaiveBayes): # index chronologically with each call of partial fit and therefore # the category_count matrices cannot be compared for equality for i in range(len(clf1.category_count_)): - assert_array_equal(clf1.category_count_[i].shape, - clf3.category_count_[i].shape) - assert_array_equal(np.sum(clf1.category_count_[i], axis=1), - np.sum(clf3.category_count_[i], axis=1)) + assert_array_equal( + clf1.category_count_[i].shape, clf3.category_count_[i].shape + ) + assert_array_equal( + np.sum(clf1.category_count_[i], axis=1), + np.sum(clf3.category_count_[i], axis=1), + ) # assert category 0 occurs 1x in the first class and 0x in the 2nd # class @@ -271,12 +282,11 @@ def test_discretenb_partial_fit(DiscreteNaiveBayes): assert_array_equal(clf1.feature_count_, clf3.feature_count_) -@pytest.mark.parametrize('NaiveBayes', ALL_NAIVE_BAYES_CLASSES) +@pytest.mark.parametrize("NaiveBayes", ALL_NAIVE_BAYES_CLASSES) def test_NB_partial_fit_no_first_classes(NaiveBayes): # classes is required for first call to partial fit with pytest.raises( - ValueError, - match="classes must be passed on the first call to partial_fit." + ValueError, match="classes must be passed on the first call to partial_fit." 
): NaiveBayes().partial_fit(X2, y2) @@ -284,8 +294,7 @@ def test_NB_partial_fit_no_first_classes(NaiveBayes): clf = NaiveBayes() clf.partial_fit(X2, y2, classes=np.unique(y2)) with pytest.raises( - ValueError, - match="is not the same as on last call to partial_fit" + ValueError, match="is not the same as on last call to partial_fit" ): clf.partial_fit(X2, y2, classes=np.arange(42)) @@ -302,18 +311,21 @@ def test_discretenb_predict_proba(): # test binary case (1-d output) y = [0, 0, 2] # 2 is regression test for binary case, 02e673 - for DiscreteNaiveBayes, X in zip([BernoulliNB, MultinomialNB], - [X_bernoulli, X_multinomial]): + for DiscreteNaiveBayes, X in zip( + [BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial] + ): clf = DiscreteNaiveBayes().fit(X, y) assert clf.predict(X[-1:]) == 2 assert clf.predict_proba([X[0]]).shape == (1, 2) - assert_array_almost_equal(clf.predict_proba(X[:2]).sum(axis=1), - np.array([1., 1.]), 6) + assert_array_almost_equal( + clf.predict_proba(X[:2]).sum(axis=1), np.array([1.0, 1.0]), 6 + ) # test multiclass case (2-d output, must sum to one) y = [0, 1, 2] - for DiscreteNaiveBayes, X in zip([BernoulliNB, MultinomialNB], - [X_bernoulli, X_multinomial]): + for DiscreteNaiveBayes, X in zip( + [BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial] + ): clf = DiscreteNaiveBayes().fit(X, y) assert clf.predict_proba(X[0:1]).shape == (1, 3) assert clf.predict_proba(X[:2]).shape == (2, 3) @@ -323,7 +335,7 @@ def test_discretenb_predict_proba(): assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1) -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) +@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) def test_discretenb_uniform_prior(DiscreteNaiveBayes): # Test whether discrete NB classes fit a uniform prior # when fit_prior=False and class_prior=None @@ -332,49 +344,50 @@ def test_discretenb_uniform_prior(DiscreteNaiveBayes): clf.set_params(fit_prior=False) clf.fit([[0], [0], [1]], [0, 0, 1]) prior = np.exp(clf.class_log_prior_) - assert_array_almost_equal(prior, np.array([.5, .5])) + assert_array_almost_equal(prior, np.array([0.5, 0.5])) -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) +@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) def test_discretenb_provide_prior(DiscreteNaiveBayes): # Test whether discrete NB classes use provided prior clf = DiscreteNaiveBayes(class_prior=[0.5, 0.5]) clf.fit([[0], [0], [1]], [0, 0, 1]) prior = np.exp(clf.class_log_prior_) - assert_array_almost_equal(prior, np.array([.5, .5])) + assert_array_almost_equal(prior, np.array([0.5, 0.5])) # Inconsistent number of classes with prior - msg = 'Number of priors must match number of classes' + msg = "Number of priors must match number of classes" with pytest.raises(ValueError, match=msg): clf.fit([[0], [1], [2]], [0, 1, 2]) - msg = 'is not the same as on last call to partial_fit' + msg = "is not the same as on last call to partial_fit" with pytest.raises(ValueError, match=msg): clf.partial_fit([[0], [1]], [0, 1], classes=[0, 1, 1]) -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) +@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) def test_discretenb_provide_prior_with_partial_fit(DiscreteNaiveBayes): # Test whether discrete NB classes use provided prior # when using partial_fit iris = load_iris() iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split( - iris.data, iris.target, test_size=0.4, 
random_state=415) + iris.data, iris.target, test_size=0.4, random_state=415 + ) for prior in [None, [0.3, 0.3, 0.4]]: clf_full = DiscreteNaiveBayes(class_prior=prior) clf_full.fit(iris.data, iris.target) clf_partial = DiscreteNaiveBayes(class_prior=prior) - clf_partial.partial_fit(iris_data1, iris_target1, - classes=[0, 1, 2]) + clf_partial.partial_fit(iris_data1, iris_target1, classes=[0, 1, 2]) clf_partial.partial_fit(iris_data2, iris_target2) - assert_array_almost_equal(clf_full.class_log_prior_, - clf_partial.class_log_prior_) + assert_array_almost_equal( + clf_full.class_log_prior_, clf_partial.class_log_prior_ + ) -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) +@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) def test_discretenb_sample_weight_multiclass(DiscreteNaiveBayes): # check shape consistency for number of samples at fit time X = [ @@ -391,8 +404,7 @@ def test_discretenb_sample_weight_multiclass(DiscreteNaiveBayes): # Check sample weight using the partial_fit method clf = DiscreteNaiveBayes() - clf.partial_fit(X[:2], y[:2], classes=[0, 1, 2], - sample_weight=sample_weight[:2]) + clf.partial_fit(X[:2], y[:2], classes=[0, 1, 2], sample_weight=sample_weight[:2]) clf.partial_fit(X[2:3], y[2:3], sample_weight=sample_weight[2:3]) clf.partial_fit(X[3:], y[3:], sample_weight=sample_weight[3:]) assert_array_equal(clf.predict(X), [0, 1, 1, 2]) @@ -400,8 +412,9 @@ def test_discretenb_sample_weight_multiclass(DiscreteNaiveBayes): # TODO: Remove in version 1.1 @ignore_warnings(category=FutureWarning) -@pytest.mark.parametrize('DiscreteNaiveBayes', [BernoulliNB, ComplementNB, - MultinomialNB]) +@pytest.mark.parametrize( + "DiscreteNaiveBayes", [BernoulliNB, ComplementNB, MultinomialNB] +) def test_discretenb_coef_intercept_shape(DiscreteNaiveBayes): # coef_ and intercept_ should have shapes as in other linear models. # Non-regression test for issue #2127. @@ -414,13 +427,13 @@ def test_discretenb_coef_intercept_shape(DiscreteNaiveBayes): assert clf.intercept_.shape == (1,) -@pytest.mark.parametrize('DiscreteNaiveBayes', DISCRETE_NAIVE_BAYES_CLASSES) -@pytest.mark.parametrize('use_partial_fit', [False, True]) -@pytest.mark.parametrize('train_on_single_class_y', [False, True]) +@pytest.mark.parametrize("DiscreteNaiveBayes", DISCRETE_NAIVE_BAYES_CLASSES) +@pytest.mark.parametrize("use_partial_fit", [False, True]) +@pytest.mark.parametrize("train_on_single_class_y", [False, True]) def test_discretenb_degenerate_one_class_case( - DiscreteNaiveBayes, - use_partial_fit, - train_on_single_class_y, + DiscreteNaiveBayes, + use_partial_fit, + train_on_single_class_y, ): # Most array attributes of a discrete naive Bayes classifier should have a # first-axis length equal to the number of classes. Exceptions include: @@ -448,11 +461,11 @@ def test_discretenb_degenerate_one_class_case( # Check that attributes have expected first-axis lengths attribute_names = [ - 'classes_', - 'class_count_', - 'class_log_prior_', - 'feature_count_', - 'feature_log_prob_', + "classes_", + "class_count_", + "class_log_prior_", + "feature_count_", + "feature_log_prob_", ] for attribute_name in attribute_names: attribute = getattr(clf, attribute_name, None) @@ -467,21 +480,21 @@ def test_discretenb_degenerate_one_class_case( assert element.shape[0] == num_classes -@pytest.mark.parametrize('kind', ('dense', 'sparse')) +@pytest.mark.parametrize("kind", ("dense", "sparse")) def test_mnnb(kind): # Test Multinomial Naive Bayes classification. 
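# --- Aside (annotation, not part of the patch): the dense and sparse branches
# below are expected to produce identical models; a quick equivalence check
# one could run by hand:
#
#     d = MultinomialNB().fit(X2, y2)
#     s = MultinomialNB().fit(scipy.sparse.csr_matrix(X2), y2)
#     assert_array_almost_equal(d.feature_log_prob_, s.feature_log_prob_)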
# This checks that MultinomialNB implements fit and predict and returns # correct values for a simple toy dataset. - if kind == 'dense': + if kind == "dense": X = X2 - elif kind == 'sparse': + elif kind == "sparse": X = scipy.sparse.csr_matrix(X2) # Check the ability to predict the learning set. clf = MultinomialNB() - msg = 'Negative values in data passed to' + msg = "Negative values in data passed to" with pytest.raises(ValueError, match=msg): clf.fit(-X, y2) y_pred = clf.fit(X, y2).predict(X) @@ -553,13 +566,10 @@ def test_mnb_prior_unobserved_targets(): @ignore_warnings(category=FutureWarning) def test_mnb_sample_weight(): clf = MultinomialNB() - clf.fit([[1, 2], [1, 2], [1, 0]], - [0, 0, 1], - sample_weight=[1, 1, 4]) + clf.fit([[1, 2], [1, 2], [1, 0]], [0, 0, 1], sample_weight=[1, 1, 4]) assert_array_equal(clf.predict([[1, 0]]), [1]) positive_prior = np.exp(clf.intercept_[0]) - assert_array_almost_equal([1 - positive_prior, positive_prior], - [1 / 3., 2 / 3.]) + assert_array_almost_equal([1 - positive_prior, positive_prior], [1 / 3.0, 2 / 3.0]) def test_bnb(): @@ -575,10 +585,9 @@ def test_bnb(): # Tokyo Japan Chinese (class: Japan) # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo - X = np.array([[1, 1, 0, 0, 0, 0], - [0, 1, 0, 0, 1, 0], - [0, 1, 0, 1, 0, 0], - [0, 1, 1, 0, 0, 1]]) + X = np.array( + [[1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0], [0, 1, 0, 1, 0, 0], [0, 1, 1, 0, 0, 1]] + ) # Classes are China (0), Japan (1) Y = np.array([0, 0, 0, 1]) @@ -592,9 +601,12 @@ def test_bnb(): assert_array_almost_equal(np.exp(clf.class_log_prior_), class_prior) # Check the feature probabilities are correct - feature_prob = np.array([[0.4, 0.8, 0.2, 0.4, 0.4, 0.2], - [1 / 3.0, 2 / 3.0, 2 / 3.0, 1 / 3.0, 1 / 3.0, - 2 / 3.0]]) + feature_prob = np.array( + [ + [0.4, 0.8, 0.2, 0.4, 0.4, 0.2], + [1 / 3.0, 2 / 3.0, 2 / 3.0, 1 / 3.0, 1 / 3.0, 2 / 3.0], + ] + ) assert_array_almost_equal(np.exp(clf.feature_log_prob_), feature_prob) # Testing data point is: @@ -602,8 +614,7 @@ def test_bnb(): X_test = np.array([[0, 1, 1, 0, 0, 1]]) # Check the predictive probabilities are correct - unnorm_predict_proba = np.array([[0.005183999999999999, - 0.02194787379972565]]) + unnorm_predict_proba = np.array([[0.005183999999999999, 0.02194787379972565]]) predict_proba = unnorm_predict_proba / np.sum(unnorm_predict_proba) assert_array_almost_equal(clf.predict_proba(X_test), predict_proba) @@ -643,33 +654,35 @@ def test_cnb(): # Tokyo Japan Chinese (class: Japan) # Features are Beijing, Chinese, Japan, Macao, Shanghai, and Tokyo. - X = np.array([[1, 1, 0, 0, 0, 0], - [0, 1, 0, 0, 1, 0], - [0, 1, 0, 1, 0, 0], - [0, 1, 1, 0, 0, 1]]) + X = np.array( + [[1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 1, 0], [0, 1, 0, 1, 0, 0], [0, 1, 1, 0, 0, 1]] + ) # Classes are China (0), Japan (1). Y = np.array([0, 0, 0, 1]) # Check that weights are correct. See steps 4-6 in Table 4 of # Rennie et al. (2003). 
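# --- Aside (worked arithmetic, not part of the patch): ComplementNB builds
# each class's parameters from the *other* classes' counts. The complement of
# China is the single Japan document (3 tokens), hence denominators of
# 3 + 6 = 9 with alpha = 1 and 6 features; the complement of Japan is the
# three China documents (6 tokens), hence 6 + 6 = 12. In sketch form:
#
#     comp = feature_count.sum(axis=0) - feature_count[c]   # complement counts
#     theta[c] = (comp + alpha) / (comp.sum() + alpha * n_features)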
- theta = np.array([ - [ - (0 + 1) / (3 + 6), - (1 + 1) / (3 + 6), - (1 + 1) / (3 + 6), - (0 + 1) / (3 + 6), - (0 + 1) / (3 + 6), - (1 + 1) / (3 + 6) - ], + theta = np.array( [ - (1 + 1) / (6 + 6), - (3 + 1) / (6 + 6), - (0 + 1) / (6 + 6), - (1 + 1) / (6 + 6), - (1 + 1) / (6 + 6), - (0 + 1) / (6 + 6) - ]]) + [ + (0 + 1) / (3 + 6), + (1 + 1) / (3 + 6), + (1 + 1) / (3 + 6), + (0 + 1) / (3 + 6), + (0 + 1) / (3 + 6), + (1 + 1) / (3 + 6), + ], + [ + (1 + 1) / (6 + 6), + (3 + 1) / (6 + 6), + (0 + 1) / (6 + 6), + (1 + 1) / (6 + 6), + (1 + 1) / (6 + 6), + (0 + 1) / (6 + 6), + ], + ] + ) weights = np.zeros(theta.shape) normed_weights = np.zeros(theta.shape) @@ -680,7 +693,7 @@ def test_cnb(): # Verify inputs are nonnegative. clf = ComplementNB(alpha=1.0) - msg = re.escape('Negative values in data passed to ComplementNB (input X)') + msg = re.escape("Negative values in data passed to ComplementNB (input X)") with pytest.raises(ValueError, match=msg): clf.fit(-X, Y) @@ -716,9 +729,7 @@ def test_categoricalnb(): # Check error is raised for X with negative entries X = np.array([[0, -1]]) y = np.array([1]) - error_msg = re.escape( - "Negative values in data passed to CategoricalNB (input X)" - ) + error_msg = re.escape("Negative values in data passed to CategoricalNB (input X)") with pytest.raises(ValueError, match=error_msg): clf.predict(X) with pytest.raises(ValueError, match=error_msg): @@ -728,10 +739,11 @@ def test_categoricalnb(): X3_test = np.array([[2, 5]]) # alpha=1 increases the count of all categories by one so the final # probability for each category is not 50/50 but 1/3 to 2/3 - bayes_numerator = np.array([[1/3*1/3, 2/3*2/3]]) + bayes_numerator = np.array([[1 / 3 * 1 / 3, 2 / 3 * 2 / 3]]) bayes_denominator = bayes_numerator.sum() - assert_array_almost_equal(clf.predict_proba(X3_test), - bayes_numerator / bayes_denominator) + assert_array_almost_equal( + clf.predict_proba(X3_test), bayes_numerator / bayes_denominator + ) # Assert category_count has counted all features assert len(clf.category_count_) == X3.shape[1] @@ -744,7 +756,7 @@ def test_categoricalnb(): assert_array_equal(clf.predict(np.array([[0, 0]])), np.array([1])) assert_array_equal(clf.n_categories_, np.array([2, 2])) - for factor in [1., 0.3, 5, 0.0001]: + for factor in [1.0, 0.3, 5, 0.0001]: X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]]) y = np.array([1, 1, 2, 2]) sample_weight = np.array([1, 1, 10, 0.1]) * factor @@ -758,29 +770,41 @@ def test_categoricalnb(): "min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_", [ # check min_categories with int > observed categories - (3, np.array([[2, 0, 0], [1, 1, 0]]), np.array([[1, 1, 0], [1, 1, 0]]), - np.array([[0, 2]]), np.array([3, 3]), - ), + ( + 3, + np.array([[2, 0, 0], [1, 1, 0]]), + np.array([[1, 1, 0], [1, 1, 0]]), + np.array([[0, 2]]), + np.array([3, 3]), + ), # check with list input - ([3, 4], np.array([[2, 0, 0], [1, 1, 0]]), - np.array([[1, 1, 0, 0], [1, 1, 0, 0]]), np.array([[0, 3]]), - np.array([3, 4]), - ), + ( + [3, 4], + np.array([[2, 0, 0], [1, 1, 0]]), + np.array([[1, 1, 0, 0], [1, 1, 0, 0]]), + np.array([[0, 3]]), + np.array([3, 4]), + ), # check min_categories with min less than actual - ([1, np.array([[2, 0], [1, 1]]), np.array([[1, 1], [1, 1]]), - np.array([[0, 1]]), np.array([2, 2])] - ), - ] + ( + [ + 1, + np.array([[2, 0], [1, 1]]), + np.array([[1, 1], [1, 1]]), + np.array([[0, 1]]), + np.array([2, 2]), + ] + ), + ], ) -def test_categoricalnb_with_min_categories(min_categories, exp_X1_count, - exp_X2_count, new_X, - exp_n_categories_): +def 
test_categoricalnb_with_min_categories( + min_categories, exp_X1_count, exp_X2_count, new_X, exp_n_categories_ +): X_n_categories = np.array([[0, 0], [0, 1], [0, 0], [1, 1]]) y_n_categories = np.array([1, 1, 2, 2]) expected_prediction = np.array([1]) - clf = CategoricalNB(alpha=1, fit_prior=False, - min_categories=min_categories) + clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories) clf.fit(X_n_categories, y_n_categories) X1_count, X2_count = clf.category_count_ assert_array_equal(X1_count, exp_X1_count) @@ -793,18 +817,17 @@ def test_categoricalnb_with_min_categories(min_categories, exp_X1_count, @pytest.mark.parametrize( "min_categories, error_msg", [ - ('bad_arg', "'min_categories' should have integral"), + ("bad_arg", "'min_categories' should have integral"), ([[3, 2], [2, 4]], "'min_categories' should have shape"), - (1., "'min_categories' should have integral"), - ] + (1.0, "'min_categories' should have integral"), + ], ) def test_categoricalnb_min_categories_errors(min_categories, error_msg): X = np.array([[0, 0], [0, 1], [0, 0], [1, 1]]) y = np.array([1, 1, 2, 2]) - clf = CategoricalNB(alpha=1, fit_prior=False, - min_categories=min_categories) + clf = CategoricalNB(alpha=1, fit_prior=False, min_categories=min_categories) with pytest.raises(ValueError, match=error_msg): clf.fit(X, y) @@ -813,11 +836,8 @@ def test_alpha(): # Setting alpha=0 should not output nan results when p(x_i|y_j)=0 is a case X = np.array([[1, 0], [1, 1]]) y = np.array([0, 1]) - nb = BernoulliNB(alpha=0.) - msg = ( - "alpha too small will result in numeric errors," - " setting alpha = 1.0e-10" - ) + nb = BernoulliNB(alpha=0.0) + msg = "alpha too small will result in numeric errors," " setting alpha = 1.0e-10" with pytest.warns(UserWarning, match=msg): nb.partial_fit(X, y, classes=[0, 1]) with pytest.warns(UserWarning, match=msg): @@ -825,39 +845,39 @@ def test_alpha(): prob = np.array([[1, 0], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) - nb = MultinomialNB(alpha=0.) + nb = MultinomialNB(alpha=0.0) with pytest.warns(UserWarning, match=msg): nb.partial_fit(X, y, classes=[0, 1]) with pytest.warns(UserWarning, match=msg): nb.fit(X, y) - prob = np.array([[2. / 3, 1. / 3], [0, 1]]) + prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) - nb = CategoricalNB(alpha=0.) + nb = CategoricalNB(alpha=0.0) with pytest.warns(UserWarning, match=msg): nb.fit(X, y) - prob = np.array([[1., 0.], [0., 1.]]) + prob = np.array([[1.0, 0.0], [0.0, 1.0]]) assert_array_almost_equal(nb.predict_proba(X), prob) # Test sparse X X = scipy.sparse.csr_matrix(X) - nb = BernoulliNB(alpha=0.) + nb = BernoulliNB(alpha=0.0) with pytest.warns(UserWarning, match=msg): nb.fit(X, y) prob = np.array([[1, 0], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) - nb = MultinomialNB(alpha=0.) + nb = MultinomialNB(alpha=0.0) with pytest.warns(UserWarning, match=msg): nb.fit(X, y) - prob = np.array([[2. / 3, 1. / 3], [0, 1]]) + prob = np.array([[2.0 / 3, 1.0 / 3], [0, 1]]) assert_array_almost_equal(nb.predict_proba(X), prob) # Test for alpha < 0 X = np.array([[1, 0], [1, 1]]) y = np.array([0, 1]) expected_msg = re.escape( - 'Smoothing parameter alpha = -1.0e-01. alpha should be > 0.' + "Smoothing parameter alpha = -1.0e-01. alpha should be > 0." 
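# --- Aside (hypothetical illustration, not part of the patch): the sign check
# in _check_alpha fires before any _ALPHA_MIN clamping is considered, so
#
#     MultinomialNB(alpha=-0.1).fit(X, y)   # ValueError, message above
#     MultinomialNB(alpha=0.0).fit(X, y)    # UserWarning, alpha clamped to 1e-10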
) b_nb = BernoulliNB(alpha=-0.1) m_nb = MultinomialNB(alpha=-0.1) @@ -880,11 +900,15 @@ def test_alpha(): def test_check_alpha(): # Test force_alpha if alpha < _ALPHA_MIN _ALPHA_MIN = 1e-10 # const - msg1 = ('alpha too small will result in numeric errors. ' - 'Proceeding with alpha = .+, as ' - 'force_alpha was set to True.') - msg2 = ('alpha too small will result in numeric errors, ' - 'setting alpha = %.1e' % _ALPHA_MIN) + msg1 = ( + "alpha too small will result in numeric errors. " + "Proceeding with alpha = .+, as " + "force_alpha was set to True." + ) + msg2 = ( + "alpha too small will result in numeric errors, " + "setting alpha = %.1e" % _ALPHA_MIN + ) b = BernoulliNB(alpha=0, force_alpha=True) with pytest.warns(UserWarning, match=msg1): assert b._check_alpha() == 0 @@ -915,11 +939,9 @@ def test_alpha_vector(): assert_array_almost_equal(nb.predict_proba(X), prob) # Test alpha non-negative - alpha = np.array([1., -0.1]) + alpha = np.array([1.0, -0.1]) m_nb = MultinomialNB(alpha=alpha) - expected_msg = ( - 'Smoothing parameter alpha = -1.0e-01. alpha should be > 0.' - ) + expected_msg = "Smoothing parameter alpha = -1.0e-01. alpha should be > 0." with pytest.raises(ValueError, match=expected_msg): m_nb.fit(X, y) @@ -928,15 +950,13 @@ def test_alpha_vector(): alpha = np.array([ALPHA_MIN / 2, 0.5]) m_nb = MultinomialNB(alpha=alpha) m_nb.partial_fit(X, y, classes=[0, 1]) - assert_array_almost_equal(m_nb._check_alpha(), - [ALPHA_MIN, 0.5], - decimal=12) + assert_array_almost_equal(m_nb._check_alpha(), [ALPHA_MIN, 0.5], decimal=12) # Test correct dimensions - alpha = np.array([1., 2., 3.]) + alpha = np.array([1.0, 2.0, 3.0]) m_nb = MultinomialNB(alpha=alpha) expected_msg = re.escape( - 'alpha should be a scalar or a numpy array with shape [n_features]' + "alpha should be a scalar or a numpy array with shape [n_features]" ) with pytest.raises(ValueError, match=expected_msg): m_nb.fit(X, y) From 16af708fdd5faa167abb52453cc64e30611a452b Mon Sep 17 00:00:00 2001 From: Shao Yang Hong Date: Wed, 23 Jun 2021 21:27:17 +0800 Subject: [PATCH 20/20] Apply suggestions from code review Co-authored-by: Julien Jerphanion --- sklearn/tests/test_naive_bayes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 649867b446798..969a34e889994 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -898,6 +898,8 @@ def test_alpha(): def test_check_alpha(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/10772 # Test force_alpha if alpha < _ALPHA_MIN _ALPHA_MIN = 1e-10 # const msg1 = (
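A minimal usage sketch of the behaviour the (truncated) test_check_alpha hunk
asserts, assuming the force_alpha flag as named in msg1 above and taking it to
default to False (i.e. clamping) at this point in the series:

    import numpy as np
    from sklearn.naive_bayes import MultinomialNB

    X = np.array([[1, 0], [1, 1]])
    y = np.array([0, 1])

    # force_alpha=True: warn, but proceed with the exact alpha requested.
    MultinomialNB(alpha=0, force_alpha=True).fit(X, y)

    # Default behaviour: warn and clamp alpha up to _ALPHA_MIN (1e-10).
    MultinomialNB(alpha=0).fit(X, y)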