From 7019abe4e39d1096ab8b78100763323a8ae8e5be Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 14 Jan 2020 08:13:32 +0800 Subject: [PATCH 01/61] Added general naive bayes --- sklearn/naive_bayes.py | 109 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index dd1d9586db6e1..5c60ccc8d6f28 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -117,6 +117,115 @@ def predict_proba(self, X): return np.exp(self.predict_log_proba(X)) + +class GeneralNB(_BaseNB): + """General Naive Bayes (GeneralNB) + + Parameters + ---------- + distributions : list of tuples + Prior probabilities of the classes. If specified the priors are not + adjusted according to the data. + + Attributes + ---------- + class_prior_ : array, shape (n_classes,) + probability of each class. + class_count_ : array, shape (n_classes,) + number of training samples observed in each class. + fits_ : list of objects + list of objects that inherit from BaseNB + + Examples + -------- + >>> import numpy as np + >>> X = np.array([[1.5, 2.3, 0, 1], + [2.7, 3.8, 1, 0], + [1.7, 0.1, 1, 0]]) + >>> y = np.array([1, 0, 0]) + >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB + >>> clf = GeneralNB([ + (GaussianNB(), [0, 1]), + (BernoulliNB(), [2, 3]) + ]) + >>> clf.fit(X, y) + GeneralNB(distributions=[ + (GaussianNB(priors=None, var_smoothing=1e-09), [0, 1]), + (BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, + fit_prior=True), [2, 3])] + ) + >>> print(clf.predict([[1.5, 2.3, 0, 1]])) + [1] + >>> print(clf.score([[2.7, 3.8, 1, 0]])) + [1] + """ + + def __init__(self, distributions): + self.distributions_ = distributions + self.class_prior = [] + self.fits = [] + + def __repr__(self): + return f"{str(self.__class__.__name__)}(distributions={self.distributions_})" + + def fit(self, X, y): + """Fit Gaussian Naive Bayes according to X, y + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training vectors, where n_samples is the number of samples + and n_features is the number of features. + y : array-like, shape (n_samples,) + Target values. + sample_weight : array-like, shape (n_samples,), optional (default=None) + Weights applied to individual samples (1. for unweighted). + + Returns + ------- + self : object + """ + self.distributions_ = self._check_distributions(self.distributions_) + + # FIXME aggregate all classes and all priors? 
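+        # Sketch of the prior bookkeeping used by _joint_log_likelihood
+        # below: each sub-estimator's joint log likelihood already contains
+        # the class log prior once,
+        #     jll_k(x, c) = log P(c) + log P(x_k | c)
+        # so summing over the k blocks would count log P(c) once per block.
+        # Subtracting the prior from every block and adding it back once
+        # after the sum leaves the intended
+        #     jll(x, c) = log P(c) + sum_k log P(x_k | c)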
+ self.classes_ = np.unique(y) + + inits = [(nb, features) for (nb, features) in self.distributions_] + + self.fits = [(nb.fit(X[:, features], y), features) + for (nb, features) in inits] + + return self + + def _joint_log_likelihood(self, X): + """Calculate the posterior log probability of the samples X""" + log_priors = [nb.class_log_prior_ + if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) + for nb, _ in self.fits] + + + print(nb._joint_log_likelihood(X[:, self.fits[0][1]]) + jll = [nb._joint_log_likelihood(X[:, features]) + for (nb, features) in self.fits] + + jll = np.hstack([jll]) + jll = jll - log_priors[0] + jll = jll.sum(axis=0) + log_priors[0] + + return jll + + def _check_X(self, X): + # TODO + return X + + def _check_distributions(self, distr): + # TODO + # Check duplicate naive bayes algorithms + # Check duplicate rows + return distr + + + class GaussianNB(_BaseNB): """ Gaussian Naive Bayes (GaussianNB) From 4a15a069ffd53b430fd5ce62ed64eaee76a7e867 Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 20 Jan 2020 00:41:52 +0800 Subject: [PATCH 02/61] Update GeneralNB --- sklearn/naive_bayes.py | 57 ++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 5c60ccc8d6f28..c8ec372b0ec12 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -119,7 +119,7 @@ def predict_proba(self, X): class GeneralNB(_BaseNB): - """General Naive Bayes (GeneralNB) + """General Naive Bayes Parameters ---------- @@ -139,30 +139,28 @@ class GeneralNB(_BaseNB): Examples -------- >>> import numpy as np - >>> X = np.array([[1.5, 2.3, 0, 1], - [2.7, 3.8, 1, 0], - [1.7, 0.1, 1, 0]]) + >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], + [2.7, 3.8, 2.3, 1, 0], + [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB >>> clf = GeneralNB([ - (GaussianNB(), [0, 1]), - (BernoulliNB(), [2, 3]) + (GaussianNB(), [0, 1, 2]), + (BernoulliNB(), [3, 4]) ]) >>> clf.fit(X, y) GeneralNB(distributions=[ - (GaussianNB(priors=None, var_smoothing=1e-09), [0, 1]), - (BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, - fit_prior=True), [2, 3])] - ) - >>> print(clf.predict([[1.5, 2.3, 0, 1]])) + (GaussianNB(priors=None, var_smoothing=1e-09), [0, 1, 2]), + (BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, + fit_prior=True), [3, 4])]) + >>> print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) [1] - >>> print(clf.score([[2.7, 3.8, 1, 0]])) + >>> print(clf.score([[2.7, 3.8, 1, 0]],[1])) [1] """ def __init__(self, distributions): self.distributions_ = distributions - self.class_prior = [] self.fits = [] def __repr__(self): @@ -180,6 +178,8 @@ def fit(self, X, y): Target values. sample_weight : array-like, shape (n_samples,), optional (default=None) Weights applied to individual samples (1. for unweighted). + fits : list of (NB, feature) tuples + List of fitted NBs Returns ------- @@ -190,32 +190,41 @@ def fit(self, X, y): # FIXME aggregate all classes and all priors? 
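+        # Note: every sub-estimator below is fit on the same y, so each one
+        # recovers the same class prior; _joint_log_likelihood relies on
+        # this when it takes the prior from the first fitted estimator.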
self.classes_ = np.unique(y) - inits = [(nb, features) for (nb, features) in self.distributions_] + inits = [(nb,features) for (nb,features) in self.distributions_] - self.fits = [(nb.fit(X[:, features], y), features) - for (nb, features) in inits] + self.fits = [(nb.fit(X[:,features],y), features) + for (nb,features) in inits] return self def _joint_log_likelihood(self, X): """Calculate the posterior log probability of the samples X""" + + X = np.array(X) + + # For now assume all class log priors are the same for all the NB's + # So we'll take the first one. log_priors = [nb.class_log_prior_ if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) - for nb, _ in self.fits] + for (nb, _) in self.fits] + log_prior = log_priors[0] - - print(nb._joint_log_likelihood(X[:, self.fits[0][1]]) - jll = [nb._joint_log_likelihood(X[:, features]) + jlls = [nb._joint_log_likelihood(X[:, features]) for (nb, features) in self.fits] - jll = np.hstack([jll]) - jll = jll - log_priors[0] - jll = jll.sum(axis=0) + log_priors[0] + # jlls has the shape (distribution, sample, class) + jlls = np.hstack([jlls]) + + # Remove the class log prior from all the distributions + # but add it back after the summation + jlls = jlls - log_prior + jll = jlls.sum(axis=0) + log_prior return jll def _check_X(self, X): - # TODO + # Check for this and + # Check for every distribution return X def _check_distributions(self, distr): From 65f9ba99b370e31b118f54d202d7d5b8e6ebee55 Mon Sep 17 00:00:00 2001 From: "Raimi Karim (B4D2D7F7)" Date: Tue, 28 Jan 2020 18:52:39 +0800 Subject: [PATCH 03/61] Update docstring --- sklearn/naive_bayes.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index c8ec372b0ec12..692573cb2c391 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -16,10 +16,8 @@ # # License: BSD 3 clause import warnings - from abc import ABCMeta, abstractmethod - import numpy as np from .base import BaseEstimator, ClassifierMixin @@ -140,23 +138,18 @@ class GeneralNB(_BaseNB): -------- >>> import numpy as np >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], - [2.7, 3.8, 2.3, 1, 0], - [1.7, 0.1, 4.5, 1, 0]]) + ... [2.7, 3.8, 2.3, 1, 0], + ... [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB - >>> clf = GeneralNB([ - (GaussianNB(), [0, 1, 2]), - (BernoulliNB(), [3, 4]) - ]) + >>> clf = GeneralNB([(GaussianNB(), [0, 1, 2]), + ... 
(BernoulliNB(), [3, 4])]) >>> clf.fit(X, y) - GeneralNB(distributions=[ - (GaussianNB(priors=None, var_smoothing=1e-09), [0, 1, 2]), - (BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, - fit_prior=True), [3, 4])]) + GeneralNB(distributions=[(GaussianNB(), [0, 1, 2]), (BernoulliNB(), [3, 4])]) >>> print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) [1] - >>> print(clf.score([[2.7, 3.8, 1, 0]],[1])) - [1] + >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) + 1.0 """ def __init__(self, distributions): From a7e9b1e96600f5a64cd469a26b4f559d34b7b67e Mon Sep 17 00:00:00 2001 From: "Raimi Karim (B4D2D7F7)" Date: Tue, 28 Jan 2020 21:32:18 +0800 Subject: [PATCH 04/61] Added tests --- sklearn/naive_bayes.py | 84 +++++++++++++++++++++++-------- sklearn/tests/test_naive_bayes.py | 48 +++++++++++++++++- 2 files changed, 109 insertions(+), 23 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 692573cb2c391..db6e04c79bb0d 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -17,7 +17,7 @@ # License: BSD 3 clause import warnings from abc import ABCMeta, abstractmethod - +import copy import numpy as np from .base import BaseEstimator, ClassifierMixin @@ -32,7 +32,7 @@ from .utils.validation import _check_sample_weight __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', - 'CategoricalNB'] + 'CategoricalNB', 'GeneralNB'] class _BaseNB(ClassifierMixin, BaseEstimator, metaclass=ABCMeta): @@ -115,22 +115,18 @@ def predict_proba(self, X): return np.exp(self.predict_log_proba(X)) - class GeneralNB(_BaseNB): """General Naive Bayes Parameters ---------- distributions : list of tuples - Prior probabilities of the classes. If specified the priors are not - adjusted according to the data. + A list of (NB, features) tuples, where NB is 'BernoulliNB', 'GaussianNB', + 'MultinomialNB', 'ComplementNB' or 'CategoricalNB', and features is + a list of indices. Attributes ---------- - class_prior_ : array, shape (n_classes,) - probability of each class. - class_count_ : array, shape (n_classes,) - number of training samples observed in each class. fits_ : list of objects list of objects that inherit from BaseNB @@ -178,15 +174,18 @@ def fit(self, X, y): ------- self : object """ - self.distributions_ = self._check_distributions(self.distributions_) + self._check_distributions(self.distributions_, X) + X, y = check_X_y(X, y) + y = column_or_1d(y, warn=True) + # FIXME aggregate all classes and all priors? 
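        # One possible resolution of the FIXME (a sketch): after fitting,
        # check that the sub-estimators agree on the classes, e.g.
        #     assert all(np.array_equal(nb.classes_, self.classes_)
        #                for nb, _ in self.fits)
        # which holds by construction because every sub-estimator sees the
        # same y.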
self.classes_ = np.unique(y) - inits = [(nb,features) for (nb,features) in self.distributions_] + inits = [(nb, features) for (nb, features) in self.distributions_] - self.fits = [(nb.fit(X[:,features],y), features) - for (nb,features) in inits] + self.fits = [(nb.fit(X[:, features], y), features) + for (nb, features) in inits] return self @@ -203,7 +202,7 @@ def _joint_log_likelihood(self, X): log_prior = log_priors[0] jlls = [nb._joint_log_likelihood(X[:, features]) - for (nb, features) in self.fits] + for (nb, features) in self.fits] # jlls has the shape (distribution, sample, class) jlls = np.hstack([jlls]) @@ -216,16 +215,57 @@ def _joint_log_likelihood(self, X): return jll def _check_X(self, X): - # Check for this and - # Check for every distribution - return X + return check_array(X) + + def _check_distributions(self, distributions, X): + """Check validity of distributions + + Distributions should be explicitly specified + """ + valid_modules = copy.copy(__all__) + valid_modules.remove("GeneralNB") + dict_distribution = {} + + X = np.array(X) + num_cols_expected = X.shape[-1] + + # Check type + if not isinstance(distributions, list): + raise TypeError( + "Expected list but got {}".format(type(distributions))) + + # Check if all are sklearn classes + for distribution in distributions: + + if not isinstance(distribution, tuple): + raise TypeError( + "Expected tuple but got {}".format(type(distribution))) + + if len(distribution) != 2: + raise ValueError("Expected tuple to have length of 2 " + + "but got {}".format(len(distribution))) + + nb, features = distribution + + if callable(nb): + raise ValueError("Wrong format specified.") + if nb.__class__.__name__ not in valid_modules: + raise ValueError( + "Distributions should be one of {}".format(valid_modules)) + for feature in features: + if feature in dict_distribution: + raise ValueError( + "Duplicate specification of feature found.") + else: + dict_distribution[feature] = nb.__class__.__name__.lower() - def _check_distributions(self, distr): - # TODO - # Check duplicate naive bayes algorithms - # Check duplicate rows - return distr + num_cols = len(dict_distribution) + if num_cols != num_cols_expected: + raise ValueError("Expected {} features ".format(num_cols_expected) + + " to have specified distributions " + + "but only {} were specified.".format(num_cols)) + # Check inefficient specification? 
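        # Illustration of the contract checked above, for an X with
        # 5 columns:
        #   valid:   [(GaussianNB(), [0, 1, 2]), (BernoulliNB(), [3, 4])]
        #   invalid: [(GaussianNB(), [0])]           # columns 1-4 unassigned
        #   invalid: [(GaussianNB(), [0, 1]),
        #             (GaussianNB(), [1])]           # column 1 assigned twice
        #   invalid: [(GaussianNB, [0, 1])]          # class given, not an instance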
class GaussianNB(_BaseNB): diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index b4470a7ed49e5..47b96653aa8af 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -20,7 +20,7 @@ from sklearn.naive_bayes import GaussianNB, BernoulliNB from sklearn.naive_bayes import MultinomialNB, ComplementNB -from sklearn.naive_bayes import CategoricalNB +from sklearn.naive_bayes import CategoricalNB, GeneralNB from sklearn.naive_bayes import BaseNB, BaseDiscreteNB @@ -39,6 +39,52 @@ y2 = np.array([1, 1, 2, 2, 3, 3]) +def test_generalnb_correctness(): + X = np.array([[1.5, 2.3, 5.7, 0, 1], + [2.7, 3.8, 2.3, 1, 0], + [1.7, 0.1, 4.5, 1, 0]]) + y = np.array([1, 0, 0]) + clf = GeneralNB([(GaussianNB(), [0, 1, 2]), + (BernoulliNB(), [3, 4])]) + clf.fit(X, y) + print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) + print(clf.score([[2.7, 3.8, 1, 0, 1]], [0])) + + +def test_generalnb_input_distributions_not_enough(): + clf = GeneralNB([(GaussianNB(), [0])]) + assert_raises(ValueError, clf.fit, X, y) + + +def test_generalnb_input_distributions_duplicate(): + clf = GeneralNB([(GaussianNB(), [0, 1]), (GaussianNB(), [1])]) + assert_raises(ValueError, clf.fit, X, y) + + +def test_generalnb_input_distributions_unknown_distr(): + clf = GeneralNB([(GeneralNB((GaussianNB(), [0, 1])), [0, 1])]) + assert_raises(ValueError, clf.fit, X, y) + + +def test_generalnb_wrong_type(): + clf = GeneralNB([[GaussianNB(), [0, 1]]]) + assert_raises(TypeError, clf.fit, X, y) + + +def test_generalnb_tuple_too_long(): + clf = GeneralNB([(GaussianNB(), [0, 1], [3])]) + assert_raises(ValueError, clf.fit, X, y) + + +def test_generalnb_wrong_format(): + clf = GeneralNB([(GaussianNB, [0, 1])]) + assert_raises(ValueError, clf.fit, X, y) + + +def test_pickle(): + pass + + def test_gnb(): # Gaussian Naive Bayes classification. # This checks that GaussianNB implements fit and predict and returns From fd9c34b2256ef1299dd40d084218a5e5ed822561 Mon Sep 17 00:00:00 2001 From: "Raimi Karim (B4D2D7F7)" Date: Wed, 29 Jan 2020 01:53:44 +0800 Subject: [PATCH 05/61] Update tests --- sklearn/naive_bayes.py | 19 +++++++++---------- sklearn/tests/test_naive_bayes.py | 31 ++++++++++++++++--------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index db6e04c79bb0d..dde3e06882053 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -138,24 +138,23 @@ class GeneralNB(_BaseNB): ... [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB - >>> clf = GeneralNB([(GaussianNB(), [0, 1, 2]), + >>> clf = GeneralNB() + >>> clf.fit(X, y, [(GaussianNB(), [0, 1, 2]), ... 
(BernoulliNB(), [3, 4])]) - >>> clf.fit(X, y) - GeneralNB(distributions=[(GaussianNB(), [0, 1, 2]), (BernoulliNB(), [3, 4])]) + GeneralNB() >>> print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) [1] >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) 1.0 """ - def __init__(self, distributions): - self.distributions_ = distributions + def __init__(self): self.fits = [] - def __repr__(self): - return f"{str(self.__class__.__name__)}(distributions={self.distributions_})" + # def __repr__(self): + # return f"{str(self.__class__.__name__)}(distributions={self.distributions_})" - def fit(self, X, y): + def fit(self, X, y, distributions): """Fit Gaussian Naive Bayes according to X, y Parameters @@ -174,7 +173,7 @@ def fit(self, X, y): ------- self : object """ - self._check_distributions(self.distributions_, X) + self._check_distributions(distributions, X) X, y = check_X_y(X, y) y = column_or_1d(y, warn=True) @@ -182,7 +181,7 @@ def fit(self, X, y): # FIXME aggregate all classes and all priors? self.classes_ = np.unique(y) - inits = [(nb, features) for (nb, features) in self.distributions_] + inits = [(nb, features) for (nb, features) in distributions] self.fits = [(nb.fit(X[:, features], y), features) for (nb, features) in inits] diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 47b96653aa8af..af4f2549ea6a5 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -44,41 +44,42 @@ def test_generalnb_correctness(): [2.7, 3.8, 2.3, 1, 0], [1.7, 0.1, 4.5, 1, 0]]) y = np.array([1, 0, 0]) - clf = GeneralNB([(GaussianNB(), [0, 1, 2]), - (BernoulliNB(), [3, 4])]) - clf.fit(X, y) + clf = GeneralNB() + clf.fit(X, y, [(GaussianNB(), [0, 1, 2]), + (BernoulliNB(), [3, 4])]) print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) print(clf.score([[2.7, 3.8, 1, 0, 1]], [0])) def test_generalnb_input_distributions_not_enough(): - clf = GeneralNB([(GaussianNB(), [0])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [(GaussianNB(), [0])]) def test_generalnb_input_distributions_duplicate(): - clf = GeneralNB([(GaussianNB(), [0, 1]), (GaussianNB(), [1])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [ + (GaussianNB(), [0, 1]), (GaussianNB(), [1])]) def test_generalnb_input_distributions_unknown_distr(): - clf = GeneralNB([(GeneralNB((GaussianNB(), [0, 1])), [0, 1])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [(GeneralNB(),[0,1])]) def test_generalnb_wrong_type(): - clf = GeneralNB([[GaussianNB(), [0, 1]]]) - assert_raises(TypeError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(TypeError, clf.fit, X, y, [[GaussianNB(), [0, 1]]]) def test_generalnb_tuple_too_long(): - clf = GeneralNB([(GaussianNB(), [0, 1], [3])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [(GaussianNB(), [0, 1], [3])]) def test_generalnb_wrong_format(): - clf = GeneralNB([(GaussianNB, [0, 1])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [(GaussianNB, [0, 1])]) def test_pickle(): From 368a1ae31cb7c2aae5be1cec8f0748a9fda23ec4 Mon Sep 17 00:00:00 2001 From: "Raimi Karim (B4D2D7F7)" Date: Wed, 29 Jan 2020 21:38:58 +0800 Subject: [PATCH 06/61] Added docs stub --- doc/modules/naive_bayes.rst | 15 +++++++++++---- sklearn/naive_bayes.py | 14 +++++--------- 2 files changed, 16 insertions(+), 13 
deletions(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 457ec6c630b99..01720ba4857ab 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -229,10 +229,10 @@ It is advisable to evaluate both models, if time permits. Categorical Naive Bayes ----------------------- -:class:`CategoricalNB` implements the categorical naive Bayes -algorithm for categorically distributed data. It assumes that each feature, -which is described by the index :math:`i`, has its own categorical -distribution. +:class:`CategoricalNB` implements the categorical naive Bayes +algorithm for categorically distributed data. It assumes that each feature, +which is described by the index :math:`i`, has its own categorical +distribution. For each feature :math:`i` in the training set :math:`X`, :class:`CategoricalNB` estimates a categorical distribution for each feature i @@ -259,6 +259,13 @@ categories for each feature :math:`i` are represented with numbers :math:`0, ..., n_i - 1` where :math:`n_i` is the number of available categories of feature :math:`i`. +.. _general_naive_bayes: + +General Naive Bayes +------------------- + +[WIP] + Out-of-core naive Bayes model fitting ------------------------------------- diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index dde3e06882053..25fdd333ffda0 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -118,6 +118,8 @@ def predict_proba(self, X): class GeneralNB(_BaseNB): """General Naive Bayes + Read more in the :ref:`User Guide `. + Parameters ---------- distributions : list of tuples @@ -140,7 +142,7 @@ class GeneralNB(_BaseNB): >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB >>> clf = GeneralNB() >>> clf.fit(X, y, [(GaussianNB(), [0, 1, 2]), - ... (BernoulliNB(), [3, 4])]) + ... (BernoulliNB(), [3, 4])]) GeneralNB() >>> print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) [1] @@ -151,9 +153,6 @@ class GeneralNB(_BaseNB): def __init__(self): self.fits = [] - # def __repr__(self): - # return f"{str(self.__class__.__name__)}(distributions={self.distributions_})" - def fit(self, X, y, distributions): """Fit Gaussian Naive Bayes according to X, y @@ -177,7 +176,6 @@ def fit(self, X, y, distributions): X, y = check_X_y(X, y) y = column_or_1d(y, warn=True) - # FIXME aggregate all classes and all priors? self.classes_ = np.unique(y) @@ -260,11 +258,9 @@ def _check_distributions(self, distributions, X): num_cols = len(dict_distribution) if num_cols != num_cols_expected: - raise ValueError("Expected {} features ".format(num_cols_expected) + + raise ValueError("Expected {} features".format(num_cols_expected) + " to have specified distributions " + - "but only {} were specified.".format(num_cols)) - - # Check inefficient specification? + "but {} were specified.".format(num_cols)) class GaussianNB(_BaseNB): From b633847e492f30d2fb96a8d614ddb792ce269693 Mon Sep 17 00:00:00 2001 From: "Raimi Karim (B4D2D7F7)" Date: Wed, 29 Jan 2020 21:44:27 +0800 Subject: [PATCH 07/61] Update doc --- sklearn/naive_bayes.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index bd4485e9c1d4e..39b3b8602fefa 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -120,17 +120,10 @@ class GeneralNB(_BaseNB): Read more in the :ref:`User Guide `. 
- Parameters - ---------- - distributions : list of tuples - A list of (NB, features) tuples, where NB is 'BernoulliNB', 'GaussianNB', - 'MultinomialNB', 'ComplementNB' or 'CategoricalNB', and features is - a list of indices. - Attributes ---------- fits_ : list of objects - list of objects that inherit from BaseNB + list of fitted classifiers Examples -------- @@ -151,7 +144,7 @@ class GeneralNB(_BaseNB): """ def __init__(self): - self.fits = [] + self.fits_ = [] def fit(self, X, y, distributions): """Fit Gaussian Naive Bayes according to X, y @@ -163,10 +156,10 @@ def fit(self, X, y, distributions): and n_features is the number of features. y : array-like, shape (n_samples,) Target values. - sample_weight : array-like, shape (n_samples,), optional (default=None) - Weights applied to individual samples (1. for unweighted). - fits : list of (NB, feature) tuples - List of fitted NBs + distributions : list of tuples + A list of (NB, features) tuples, where NB is 'BernoulliNB', 'GaussianNB', + 'MultinomialNB', 'ComplementNB' or 'CategoricalNB', and features is + a list of indices. Returns ------- @@ -181,7 +174,7 @@ def fit(self, X, y, distributions): inits = [(nb, features) for (nb, features) in distributions] - self.fits = [(nb.fit(X[:, features], y), features) + self.fits_ = [(nb.fit(X[:, features], y), features) for (nb, features) in inits] return self @@ -195,11 +188,11 @@ def _joint_log_likelihood(self, X): # So we'll take the first one. log_priors = [nb.class_log_prior_ if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) - for (nb, _) in self.fits] + for (nb, _) in self.fits_] log_prior = log_priors[0] jlls = [nb._joint_log_likelihood(X[:, features]) - for (nb, features) in self.fits] + for (nb, features) in self.fits_] # jlls has the shape (distribution, sample, class) jlls = np.hstack([jlls]) From 34339fec2c3a320b1071d78cd55275b76afc1b95 Mon Sep 17 00:00:00 2001 From: "Raimi Karim (B4D2D7F7)" Date: Wed, 29 Jan 2020 22:02:16 +0800 Subject: [PATCH 08/61] Update docs --- .github/workflows/assign.yml | 16 ++ .github/workflows/unassign.yml | 14 ++ doc/developers/advanced_installation.rst | 4 +- doc/developers/contributing.rst | 7 +- doc/developers/maintainer.rst | 164 +++++++++++++----- doc/modules/naive_bayes.rst | 15 +- doc/templates/index.html | 1 + doc/whats_new/v0.23.rst | 14 ++ .../wikipedia_principal_eigenvector.py | 6 - sklearn/cluster/_dbscan.py | 3 +- sklearn/ensemble/_bagging.py | 34 ++-- .../gradient_boosting.py | 8 +- .../_hist_gradient_boosting/grower.py | 7 +- .../tests/test_gradient_boosting.py | 3 +- .../tests/test_grower.py | 12 +- sklearn/ensemble/_iforest.py | 9 +- sklearn/naive_bayes.py | 59 +++---- .../neural_network/_multilayer_perceptron.py | 22 +-- sklearn/pipeline.py | 32 ++-- sklearn/svm/_base.py | 26 ++- sklearn/svm/_classes.py | 29 ++++ sklearn/svm/tests/test_svm.py | 15 ++ sklearn/tests/test_naive_bayes.py | 31 ++-- sklearn/tree/_export.py | 4 + sklearn/tree/tests/test_export.py | 13 +- 25 files changed, 380 insertions(+), 168 deletions(-) create mode 100644 .github/workflows/assign.yml create mode 100644 .github/workflows/unassign.yml diff --git a/.github/workflows/assign.yml b/.github/workflows/assign.yml new file mode 100644 index 0000000000000..72643d1bf8ae8 --- /dev/null +++ b/.github/workflows/assign.yml @@ -0,0 +1,16 @@ + +name: Assign +on: + issue_comment: + types: created + +jobs: + one: + runs-on: ubuntu-latest + steps: + - if: github.event.comment.body == 'take' + name: + run: | + echo "Assigning issue ${{ github.event.issue.number }} 
to ${{ github.event.comment.user.login }}" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"assignees": ["${{ github.event.comment.user.login }}"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/assignees + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X "DELETE" https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels/help%20wanted diff --git a/.github/workflows/unassign.yml b/.github/workflows/unassign.yml new file mode 100644 index 0000000000000..96f1360ba3144 --- /dev/null +++ b/.github/workflows/unassign.yml @@ -0,0 +1,14 @@ +name: Unassign +#Runs when a contributor has unassigned themselves from the issue and adds 'help wanted' and 'stalled' tags +on: + issues: + types: unassigned + +jobs: + one: + runs-on: ubuntu-latest + steps: + - name: + run: | + echo "Marking issue ${{ github.event.issue.number }} as stalled" + curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -d '{"labels": ["help wanted","Stalled"]}' https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/labels diff --git a/doc/developers/advanced_installation.rst b/doc/developers/advanced_installation.rst index 8fd0f9ecf0273..6b4b0b1141755 100644 --- a/doc/developers/advanced_installation.rst +++ b/doc/developers/advanced_installation.rst @@ -252,7 +252,9 @@ scikit-learn from source:: You can check that the custom compilers are properly installed from conda forge using the following command:: - conda list compilers llvm-openmp + conda list + +which should include ``compilers`` and ``llvm-openmp``. The compilers meta-package will automatically set custom environment variables:: diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst index 16adf4a607d90..d098a80ae8eec 100644 --- a/doc/developers/contributing.rst +++ b/doc/developers/contributing.rst @@ -181,7 +181,12 @@ Contributing code If in doubt about duplicated work, or if you want to work on a non-trivial feature, it's recommended to first open an issue in the `issue tracker `_ - to get some feedbacks from core developers. + to get some feedbacks from core developers. + + One easy way to find an issue to work on is by applying the "help wanted" + label in your search. This lists all the issues that have been unclaimed + so far. In order to claim an issue for yourself, please comment exactly + ``take`` on it for the CI to automatically assign the issue to you. How to contribute ----------------- diff --git a/doc/developers/maintainer.rst b/doc/developers/maintainer.rst index 66d5250af1644..f400989a7d877 100644 --- a/doc/developers/maintainer.rst +++ b/doc/developers/maintainer.rst @@ -1,14 +1,27 @@ Maintainer / core-developer information ======================================== + +Releasing +--------- + +This section is about preparing a major release, incrementing the minor +version, or a bug fix release incrementing the patch version. Our convention is +that we release one or more release candidates (0.RRrcN) before releasing the +final distributions. We follow the `PEP101 +`_ to indicate release candidates, +post, and minor releases. + Before a release ----------------- +................ 1. Update authors table:: $ cd build_tools; make authors; cd .. - and commit. + and commit. This is only needed if the authors have changed since the last + release. This step is sometimes done independent of the release. 
This + updates the maintainer list and is not the contributor list for the release. 2. Confirm any blockers tagged for the milestone are resolved, and that other issues tagged for the milestone can be postponed. @@ -17,61 +30,96 @@ Before a release change log is reasonably well curated. Some tools for these tasks include: - ``maint_tools/sort_whats_new.py`` can put what's new entries into - sections. + sections. It's not perfect, and requires manual checking of the changes. + If the whats new list is well curated, it may not be necessary. - The ``maint_tools/whats_missing.sh`` script may be used to identify pull requests that were merged but likely missing from What's New. -Preparing a bug-fix-release -........................... +4. Make sure the deprecations, FIXME and TODOs tagged for the release have + been taken care of. + +**Permissions** + +The release manager requires a set of permissions on top of the usual +permissions given to maintainers, which includes: + +- *maintainer* role on ``scikit-learn`` projects on ``pypi.org`` and + ``test.pypi.org``, separately. +- become a member of the *scikit-learn* team on conda-forge by editing the + ``recipe/meta.yaml`` file on + ``https://github.com/conda-forge/scikit-learn-feedstock`` +- *maintainer* on ``https://github.com/MacPython/scikit-learn-wheels`` + -Since any commits to a released branch (e.g. 0.999.X) will automatically update -the web site documentation, it is best to develop a bug-fix release with a pull -request in which 0.999.X is the base. It also allows you to keep track of any -tasks towards release with a TO DO list. +.. _preparing_a_release_pr: -Most development of the bug fix release, and its documentation, should -happen in master to avoid asynchrony. To select commits from master for use in -the bug fix (version 0.999.3), you can use:: +Preparing a release PR +...................... +Releasing the first RC of e.g. version `0.99` involves creating the release +branch `0.99.X` directly on the main repo, where `X` really is the letter X, +**not a placeholder**. This is considered the *feature freeze*. The +development for the major and minor releases of 0.99 should +**also** happen under `0.99.X`. Each release (rc, major, or minor) is a tag +under that branch. + +In terms of including changes, the first RC ideally counts as a *feature +freeze*. Each coming release candidate and the final release afterwards will +include minor documentation changes and bug fixes. Any major enhancement or +feature should be excluded. + +The minor releases should include bug fixes and some relevant documentation +changes only. Any PR resulting in a behavior change which is not a bug fix +should be excluded. + +First, create a branch, **on your own fork** (to release e.g. `0.999.3`):: + + $ # assuming master and upstream/master are the same $ git checkout -b release-0.999.3 master - $ git rebase -i 0.999.X -Then pick the commits for release and resolve any issues, and create a pull -request with 0.999.X as base. Add a commit updating ``sklearn.__version__``. -Additional commits can be cherry-picked into the ``release-0.999.3`` branch -while preparing the release. +Then, create a PR **to the** `scikit-learn/0.999.X` **branch** (not to +master!) with all the desired changes:: + + $ git rebase -i upstream/0.999.2 + +It's nice to have a copy of the ``git rebase -i`` log in the PR to help others +understand what's included. Making a release ----------------- +................ + +0. Create the release branch on the main repo, if it does not exist. 
This is + done only once, as the major and minor releases happen on the same branch:: -1. Update docs: + $ git checkout -b 0.99.X + + Again, `X` is literal here, and `99` is replaced by the release number. + The branches are called ``0.19.X``, ``0.20.X``, etc. + +1. Update docs. Note that this is for the final release, not necessarily for + the RC releases. These changes should be made in master and cherry-picked + into the release branch, only before the final release. - Edit the doc/whats_new.rst file to add release title and commit statistics. You can retrieve commit statistics with:: $ git shortlog -s 0.99.33.. | cut -f2- | sort --ignore-case | tr '\n' ';' | sed 's/;/, /g;s/, $//' - - Update the release date in whats_new.rst + - Update the release date in ``whats_new.rst`` - - Edit the doc/index.rst to change the 'News' entry of the front page. - - - Note that these changes should be made in master and cherry-picked into - the release branch. + - Edit the doc/templates/index.html to change the 'News' entry of the front + page. 2. On the branch for releasing, update the version number in - sklearn/__init__.py, the ``__version__`` variable by removing ``dev*`` only - when ready to release. - On master, increment the version in the same place (when branching for - release). - -3. Create the tag and push it:: + `sklearn/__init__.py`, the ``__version__`` variable by removing ``dev*`` + only when ready to release. On master, increment the version in the same + place (when branching for release). This means while we're in the release + candidate period, the latest stable is two versions behind the master + branch, instead of one. - $ git tag -a 0.999 - - $ git push git@github.com:scikit-learn/scikit-learn.git --tags - -4. Create the source tarball: +3. At this point all relevant PRs should have been merged into the `0.99.X` + branch. Create the source tarball: - Wipe clean your repo:: @@ -81,10 +129,32 @@ Making a release $ python setup.py sdist + - You can also test a binary dist build using:: + + $ python setup.py bdist_wheel + + - You can test if PyPi is going to accept the package using:: + + $ twine check dist/* + + You can run ``twine check`` after step 5 (fetching artifacts) as well. + The result should be in the `dist/` folder. We will upload it later with the wheels. Check that you can install it in a new virtualenv and that the tests pass. +4. Proceed with caution. Ideally, tags should be created when you're almost + certain that the release is ready, since adding a tag to the main repo can + trigger certain automated processes. You can test upload the ``sdist`` to + ``test.pypi.org``, and test the next step by setting ``BUILD_COMMIT`` to the + branch name (``0.99.X`` for instance) in a PR to the wheel building repo. + Once all works, you can proceed with tagging. Create the tag and push it (if + it's an RC, it can be ``0.xxrc1`` for instance):: + + $ git tag -a 0.99 # in the 0.99.X branch + + $ git push git@github.com:scikit-learn/scikit-learn.git 0.99 + 5. Update the dependency versions and set ``BUILD_COMMIT`` variable to the release tag at: @@ -94,16 +164,20 @@ Making a release packages and upload them to PyPI by running the following commands in the scikit-learn source folder (checked out at the release tag):: - $ rm -r dist + $ rm -r dist # only if there's anything other than the sdist tar.gz there $ pip install -U wheelhouse_uploader twine $ python setup.py fetch_artifacts 6. 
Check the content of the `dist/` folder: it should contain all the wheels - along with the source tarball ("scikit-learn-XXX.tar.gz"). + along with the source tarball ("scikit-learn-RRR.tar.gz"). Make sure that you do not have developer versions or older versions of the scikit-learn package in that folder. + Before uploading to pypi, you can test upload to test.pypi.org:: + + $ twine upload --verbose --repository-url https://test.pypi.org/legacy/ dist/* + Upload everything at once to https://pypi.org:: $ twine upload dist/* @@ -119,21 +193,25 @@ Making a release $ git checkout master $ rm stable $ ln -s 0.999 stable - $ sed -i "s/latestStable = '.*/latestStable = '0.999';" versionwarning.js - $ git commit -m "Update stable to point to 0.999" stable + $ sed -i "s/latestStable = '.*/latestStable = '0.999';/" versionwarning.js + $ git add stable/ versionwarning.js + $ git commit -m "Update stable to point to 0.999" $ git push origin master The following GitHub checklist might be helpful in a release PR:: * [ ] update news and what's new date in master and release branch * [ ] create tag - * [ ] update dependencies and release tag at https://github.com/MacPython/scikit-learn-wheels + * [ ] update dependencies and release tag at + https://github.com/MacPython/scikit-learn-wheels * [ ] twine the wheels to PyPI when that's green * [ ] https://github.com/scikit-learn/scikit-learn/releases draft - * [ ] confirm bot detected at https://github.com/conda-forge/scikit-learn-feedstock and wait for merge + * [ ] confirm bot detected at + https://github.com/conda-forge/scikit-learn-feedstock and wait for merge * [ ] https://github.com/scikit-learn/scikit-learn/releases publish - * [ ] announce on mailing list - * [ ] (regenerate Dash docs: https://github.com/Kapeli/Dash-User-Contributions/tree/master/docsets/Scikit) + * [ ] fix the binder release version in ``.binder/requirement.txt`` (see + #15847) + * [ ] announce on mailing list and on twitter The scikit-learn.org web site ----------------------------- diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 457ec6c630b99..01720ba4857ab 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -229,10 +229,10 @@ It is advisable to evaluate both models, if time permits. Categorical Naive Bayes ----------------------- -:class:`CategoricalNB` implements the categorical naive Bayes -algorithm for categorically distributed data. It assumes that each feature, -which is described by the index :math:`i`, has its own categorical -distribution. +:class:`CategoricalNB` implements the categorical naive Bayes +algorithm for categorically distributed data. It assumes that each feature, +which is described by the index :math:`i`, has its own categorical +distribution. For each feature :math:`i` in the training set :math:`X`, :class:`CategoricalNB` estimates a categorical distribution for each feature i @@ -259,6 +259,13 @@ categories for each feature :math:`i` are represented with numbers :math:`0, ..., n_i - 1` where :math:`n_i` is the number of available categories of feature :math:`i`. +.. _general_naive_bayes: + +General Naive Bayes +------------------- + +[WIP] + Out-of-core naive Bayes model fitting ------------------------------------- diff --git a/doc/templates/index.html b/doc/templates/index.html index 4f69829f413e1..f897ae5f7031c 100644 --- a/doc/templates/index.html +++ b/doc/templates/index.html @@ -183,6 +183,7 @@

 [Landing page "Community" section, rendered from the HTML hunk: links for
 "Questions? See FAQ and stackoverflow", the scikit-learn@python.org mailing
 list, and Gitter (gitter.im/scikit-learn); the patch adds one line stating
 that communication on all channels should respect PSF's code of conduct.]
  • diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst index 21bf8a1bd3329..67f76446cbf80 100644 --- a/doc/whats_new/v0.23.rst +++ b/doc/whats_new/v0.23.rst @@ -90,6 +90,13 @@ Changelog :user:`Reshama Shaikh `, and :user:`Chiara Marmo `. +- |Fix| Changed the convention for `max_depth` parameter of + :class:`ensemble.HistGradientBoostingClassifier` and + :class:`ensemble.HistGradientBoostingRegressor`. The depth now corresponds to + the number of edges to go from the root to the deepest leaf. + Stumps (trees with one split) are now allowed. + :pr: `16182` by :user:`Santhosh B ` + :mod:`sklearn.feature_extraction` ................................. @@ -171,6 +178,13 @@ Changelog - |Efficiency| :class:`preprocessing.OneHotEncoder` is now faster at transforming. :pr:`15762` by `Thomas Fan`_. +:mod:`sklearn.svm` +.................. + +- |API| :class:`svm.SVR` and :class:`svm.OneClassSVM` attributes, `probA_` and + `probB_`, are now deprecated as they were not useful. :pr:`15558` by + `Thomas Fan`_. + :mod:`sklearn.tree` ................... diff --git a/examples/applications/wikipedia_principal_eigenvector.py b/examples/applications/wikipedia_principal_eigenvector.py index da4234936a911..097bab6c7d4d5 100644 --- a/examples/applications/wikipedia_principal_eigenvector.py +++ b/examples/applications/wikipedia_principal_eigenvector.py @@ -42,8 +42,6 @@ from scipy import sparse -from joblib import Memory - from sklearn.decomposition import randomized_svd from urllib.request import urlopen @@ -74,8 +72,6 @@ # ############################################################################# # Loading the redirect files -memory = Memory(cachedir=".") - def index(redirects, index_map, k): """Find the index of an article name after redirect resolution""" @@ -124,8 +120,6 @@ def get_redirects(redirects_filename): return redirects -# disabling joblib as the pickling of large dicts seems much too slow -#@memory.cache def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None): """Extract the adjacency graph as a scipy sparse matrix diff --git a/sklearn/cluster/_dbscan.py b/sklearn/cluster/_dbscan.py index a464e3951673a..dd1de3043d444 100644 --- a/sklearn/cluster/_dbscan.py +++ b/sklearn/cluster/_dbscan.py @@ -52,7 +52,8 @@ def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None, the options allowed by :func:`sklearn.metrics.pairwise_distances` for its metric parameter. If metric is "precomputed", X is assumed to be a distance matrix and - must be square during fit. X may be a :term:`Glossary `, + must be square during fit. + X may be a :term:`sparse graph `, in which case only "nonzero" elements may be considered neighbors. metric_params : dict, optional diff --git a/sklearn/ensemble/_bagging.py b/sklearn/ensemble/_bagging.py index ea4e5eedb6079..b1ae443e78bf1 100644 --- a/sklearn/ensemble/_bagging.py +++ b/sklearn/ensemble/_bagging.py @@ -464,13 +464,16 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): The number of base estimators in the ensemble. max_samples : int or float, default=1.0 - The number of samples to draw from X to train each base estimator. + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. max_features : int or float, default=1.0 - The number of features to draw from X to train each base estimator. 
+ The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. @@ -501,10 +504,12 @@ class BaggingClassifier(ClassifierMixin, BaseBagging): processors. See :term:`Glossary ` for more details. random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. verbose : int, default=0 Controls the verbosity when fitting and predicting. @@ -866,13 +871,16 @@ class BaggingRegressor(RegressorMixin, BaseBagging): The number of base estimators in the ensemble. max_samples : int or float, default=1.0 - The number of samples to draw from X to train each base estimator. + The number of samples to draw from X to train each base estimator (with + replacement by default, see `bootstrap` for more details). - If int, then draw `max_samples` samples. - If float, then draw `max_samples * X.shape[0]` samples. max_features : int or float, default=1.0 - The number of features to draw from X to train each base estimator. + The number of features to draw from X to train each base estimator ( + without replacement by default, see `bootstrap_features` for more + details). - If int, then draw `max_features` features. - If float, then draw `max_features * X.shape[1]` features. @@ -900,10 +908,12 @@ class BaggingRegressor(RegressorMixin, BaseBagging): processors. See :term:`Glossary ` for more details. random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the random resampling of the original dataset + (sample wise and feature wise). + If the base estimator accepts a `random_state` attribute, a different + seed is generated for each instance in the ensemble. + Pass an int for reproducible output across multiple function calls. + See :term:`Glossary `. verbose : int, default=0 Controls the verbosity when fitting and predicting. diff --git a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py index cb708ecc576e7..f3efd3c897a4c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py @@ -689,8 +689,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting): than 1. If None, there is no maximum limit. max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. Must be strictly greater - than 1. Depth isn't constrained by default. + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. 
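+        For instance, ``max_depth=1`` now yields stumps (trees with a
+        single split).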
min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value @@ -872,8 +872,8 @@ class HistGradientBoostingClassifier(BaseHistGradientBoosting, than 1. If None, there is no maximum limit. max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. Must be strictly greater - than 1. Depth isn't constrained by default. + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index c7d303b8f6201..cd0a4ed1cb34c 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -135,7 +135,8 @@ class TreeGrower: maximum limit. max_depth : int or None, optional (default=None) The maximum depth of each tree. The depth of a tree is the number of - nodes to go from the root to the deepest leaf. + edges to go from the root to the deepest leaf. + Depth isn't constrained by default. min_samples_leaf : int, optional (default=20) The minimum number of samples per leaf. min_gain_to_split : float, optional (default=0.) @@ -230,9 +231,9 @@ def _validate_parameters(self, X_binned, max_leaf_nodes, max_depth, if max_leaf_nodes is not None and max_leaf_nodes <= 1: raise ValueError('max_leaf_nodes={} should not be' ' smaller than 2'.format(max_leaf_nodes)) - if max_depth is not None and max_depth <= 1: + if max_depth is not None and max_depth < 1: raise ValueError('max_depth={} should not be' - ' smaller than 2'.format(max_depth)) + ' smaller than 1'.format(max_depth)) if min_samples_leaf < 1: raise ValueError('min_samples_leaf={} should ' 'not be smaller than 1'.format(min_samples_leaf)) diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py index 87950eab38a97..b607cdd23b6c9 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py @@ -31,8 +31,7 @@ ({'max_iter': 0}, 'max_iter=0 must not be smaller than 1'), ({'max_leaf_nodes': 0}, 'max_leaf_nodes=0 should not be smaller than 2'), ({'max_leaf_nodes': 1}, 'max_leaf_nodes=1 should not be smaller than 2'), - ({'max_depth': 0}, 'max_depth=0 should not be smaller than 2'), - ({'max_depth': 1}, 'max_depth=1 should not be smaller than 2'), + ({'max_depth': 0}, 'max_depth=0 should not be smaller than 1'), ({'min_samples_leaf': 0}, 'min_samples_leaf=0 should not be smaller'), ({'l2_regularization': -1}, 'l2_regularization=-1 must be positive'), ({'max_bins': 1}, 'max_bins=1 should be no smaller than 2 and no larger'), diff --git a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py index 0cc301b7b1b36..d770b50e7aa30 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/tests/test_grower.py @@ -257,7 +257,14 @@ def test_min_samples_leaf_root(n_samples, min_samples_leaf): assert len(grower.finalized_leaves) == 1 
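# Under the new convention, max_depth counts the number of edges from the
# root to the deepest leaf, so max_depth=1 produces a stump: one split
# whose two children are both leaves (this is what assert_is_stump checks
# below).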
-@pytest.mark.parametrize('max_depth', [2, 3]) +def assert_is_stump(grower): + # To assert that stumps are created when max_depth=1 + for leaf in (grower.root.left_child, grower.root.right_child): + assert leaf.left_child is None + assert leaf.right_child is None + + +@pytest.mark.parametrize('max_depth', [1, 2, 3]) def test_max_depth(max_depth): # Make sure max_depth parameter works as expected rng = np.random.RandomState(seed=0) @@ -279,6 +286,9 @@ def test_max_depth(max_depth): depth = max(leaf.depth for leaf in grower.finalized_leaves) assert depth == max_depth + if max_depth == 1: + assert_is_stump(grower) + def test_input_validation(): diff --git a/sklearn/ensemble/_iforest.py b/sklearn/ensemble/_iforest.py index df393b628bb02..d91052b27759f 100644 --- a/sklearn/ensemble/_iforest.py +++ b/sklearn/ensemble/_iforest.py @@ -107,10 +107,11 @@ class IsolationForest(OutlierMixin, BaseBagging): 0.24. random_state : int, RandomState instance, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + Controls the pseudo-randomness of the selection of the feature + and split values for each branching step and each tree in the forest. + + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. verbose : int, default=0 Controls the verbosity of the tree building process. diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index db6e04c79bb0d..39b3b8602fefa 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -118,17 +118,12 @@ def predict_proba(self, X): class GeneralNB(_BaseNB): """General Naive Bayes - Parameters - ---------- - distributions : list of tuples - A list of (NB, features) tuples, where NB is 'BernoulliNB', 'GaussianNB', - 'MultinomialNB', 'ComplementNB' or 'CategoricalNB', and features is - a list of indices. + Read more in the :ref:`User Guide `. Attributes ---------- fits_ : list of objects - list of objects that inherit from BaseNB + list of fitted classifiers Examples -------- @@ -138,24 +133,20 @@ class GeneralNB(_BaseNB): ... [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB - >>> clf = GeneralNB([(GaussianNB(), [0, 1, 2]), - ... (BernoulliNB(), [3, 4])]) - >>> clf.fit(X, y) - GeneralNB(distributions=[(GaussianNB(), [0, 1, 2]), (BernoulliNB(), [3, 4])]) + >>> clf = GeneralNB() + >>> clf.fit(X, y, [(GaussianNB(), [0, 1, 2]), + ... (BernoulliNB(), [3, 4])]) + GeneralNB() >>> print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) [1] >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) 1.0 """ - def __init__(self, distributions): - self.distributions_ = distributions - self.fits = [] + def __init__(self): + self.fits_ = [] - def __repr__(self): - return f"{str(self.__class__.__name__)}(distributions={self.distributions_})" - - def fit(self, X, y): + def fit(self, X, y, distributions): """Fit Gaussian Naive Bayes according to X, y Parameters @@ -165,26 +156,25 @@ def fit(self, X, y): and n_features is the number of features. y : array-like, shape (n_samples,) Target values. - sample_weight : array-like, shape (n_samples,), optional (default=None) - Weights applied to individual samples (1. for unweighted). 
- fits : list of (NB, feature) tuples - List of fitted NBs + distributions : list of tuples + A list of (NB, features) tuples, where NB is 'BernoulliNB', 'GaussianNB', + 'MultinomialNB', 'ComplementNB' or 'CategoricalNB', and features is + a list of indices. Returns ------- self : object """ - self._check_distributions(self.distributions_, X) + self._check_distributions(distributions, X) X, y = check_X_y(X, y) y = column_or_1d(y, warn=True) - # FIXME aggregate all classes and all priors? self.classes_ = np.unique(y) - inits = [(nb, features) for (nb, features) in self.distributions_] + inits = [(nb, features) for (nb, features) in distributions] - self.fits = [(nb.fit(X[:, features], y), features) + self.fits_ = [(nb.fit(X[:, features], y), features) for (nb, features) in inits] return self @@ -198,11 +188,11 @@ def _joint_log_likelihood(self, X): # So we'll take the first one. log_priors = [nb.class_log_prior_ if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) - for (nb, _) in self.fits] + for (nb, _) in self.fits_] log_prior = log_priors[0] jlls = [nb._joint_log_likelihood(X[:, features]) - for (nb, features) in self.fits] + for (nb, features) in self.fits_] # jlls has the shape (distribution, sample, class) jlls = np.hstack([jlls]) @@ -261,11 +251,9 @@ def _check_distributions(self, distributions, X): num_cols = len(dict_distribution) if num_cols != num_cols_expected: - raise ValueError("Expected {} features ".format(num_cols_expected) + + raise ValueError("Expected {} features".format(num_cols_expected) + " to have specified distributions " + - "but only {} were specified.".format(num_cols)) - - # Check inefficient specification? + "but {} were specified.".format(num_cols)) class GaussianNB(_BaseNB): @@ -720,8 +708,9 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): # We convert it to np.float64 to support sample_weight consistently Y = Y.astype(np.float64, copy=False) if sample_weight is not None: + sample_weight = _check_sample_weight(sample_weight, X) sample_weight = np.atleast_2d(sample_weight) - Y *= check_array(sample_weight).T + Y *= sample_weight.T class_prior = self.class_prior @@ -772,9 +761,9 @@ def fit(self, X, y, sample_weight=None): # this means we also don't have to cast X to floating point if sample_weight is not None: Y = Y.astype(np.float64, copy=False) - sample_weight = np.asarray(sample_weight) + sample_weight = _check_sample_weight(sample_weight, X) sample_weight = np.atleast_2d(sample_weight) - Y *= check_array(sample_weight).T + Y *= sample_weight.T class_prior = self.class_prior diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py index 9cc66bedb46ce..90f9210db5d6b 100644 --- a/sklearn/neural_network/_multilayer_perceptron.py +++ b/sklearn/neural_network/_multilayer_perceptron.py @@ -793,11 +793,12 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron): Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. - random_state : int, RandomState instance or None, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. 
+ Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving @@ -1185,11 +1186,12 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron): Whether to shuffle samples in each iteration. Only used when solver='sgd' or 'adam'. - random_state : int, RandomState instance or None, default=None - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. + random_state : int, RandomState instance, default=None + Determines random number generation for weights and bias + initialization, train-test split if early stopping is used, and batch + sampling when solver='sgd' or 'adam'. + Pass an int for reproducible results across multiple function calls. + See :term:`Glossary `. tol : float, default=1e-4 Tolerance for the optimization. When the loss or score is not improving diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index af2feed1a861e..eb42c43a98905 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -56,7 +56,7 @@ class Pipeline(_BaseComposition): chained, in the order in which they are chained, with the last object an estimator. - memory : None, str or object with the joblib.Memory interface, optional + memory : str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of @@ -140,7 +140,7 @@ def get_params(self, deep=True): Parameters ---------- - deep : boolean, optional + deep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. @@ -502,7 +502,7 @@ def score_samples(self, X): Returns ------- - y_score : ndarray, shape (n_samples,) + y_score : ndarray of shape (n_samples,) """ Xt = X for _, _, transformer in self._iter(with_final=False): @@ -664,7 +664,7 @@ def make_pipeline(*steps, **kwargs): ---------- *steps : list of estimators. - memory : None, str or object with the joblib.Memory interface, optional + memory : str or object with the joblib.Memory interface, default=None Used to cache the fitted transformers of the pipeline. By default, no caching is performed. If a string is given, it is the path to the caching directory. Enabling caching triggers a clone of @@ -674,7 +674,7 @@ def make_pipeline(*steps, **kwargs): inspect estimators within the pipeline. Caching the transformers is advantageous when fitting is time consuming. - verbose : boolean, default=False + verbose : bool, default=False If True, the time elapsed while fitting each step will be printed as it is completed. @@ -773,17 +773,17 @@ class FeatureUnion(TransformerMixin, _BaseComposition): .. versionchanged:: 0.22 Deprecated `None` as a transformer in favor of 'drop'. - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - transformer_weights : dict, optional + transformer_weights : dict, default=None Multiplicative weights for features per transformer. Keys are transformer names, values the weights. 
- verbose : boolean, optional(default=False) + verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed. @@ -818,7 +818,7 @@ def get_params(self, deep=True): Parameters ---------- - deep : boolean, optional + deep : bool, default=True If True, will return the parameters for this estimator and contained subobjects that are estimators. @@ -900,7 +900,7 @@ def fit(self, X, y=None, **fit_params): X : iterable or array-like, depending on transformers Input data, used to fit transformers. - y : array-like, shape (n_samples, ...), optional + y : array-like of shape (n_samples, n_outputs), default=None Targets for supervised learning. Returns @@ -924,12 +924,13 @@ def fit_transform(self, X, y=None, **fit_params): X : iterable or array-like, depending on transformers Input data to be transformed. - y : array-like, shape (n_samples, ...), optional + y : array-like of shape (n_samples, n_outputs), default=None Targets for supervised learning. Returns ------- - X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) + X_t : array-like or sparse matrix of \ + shape (n_samples, sum_n_components) hstack of results of transformers. sum_n_components is the sum of n_components (output dimension) over transformers. """ @@ -975,7 +976,8 @@ def transform(self, X): Returns ------- - X_t : array-like or sparse matrix, shape (n_samples, sum_n_components) + X_t : array-like or sparse matrix of \ + shape (n_samples, sum_n_components) hstack of results of transformers. sum_n_components is the sum of n_components (output dimension) over transformers. """ @@ -1010,13 +1012,13 @@ def make_union(*transformers, **kwargs): ---------- *transformers : list of estimators - n_jobs : int or None, optional (default=None) + n_jobs : int, default=None Number of jobs to run in parallel. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - verbose : boolean, optional(default=False) + verbose : bool, default=False If True, the time elapsed while fitting each transformer will be printed as it is completed. 
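The pipeline.py hunks above only touch documentation; for reference, a minimal usage sketch of the two helpers being re-documented. The estimator choices and the cache directory are illustrative only, not part of the patch:

from tempfile import mkdtemp
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import StandardScaler

# memory=... caches fitted transformers in the given directory;
# verbose=True prints the elapsed time as each step completes.
pipe = make_pipeline(StandardScaler(), GaussianNB(),
                     memory=mkdtemp(), verbose=True)

# make_union builds a FeatureUnion: both transformers are fitted on the
# same X and their outputs are hstacked; n_jobs=None means one job.
union = make_union(PCA(n_components=2), SelectKBest(k=1), n_jobs=None)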
diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index 43dc8b428e4b3..ea9b52ba4ef89 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -246,8 +246,8 @@ def _dense_fit(self, X, y, sample_weight, solver_type, kernel, # we don't pass **self.get_params() to allow subclasses to # add other parameters to __init__ self.support_, self.support_vectors_, self._n_support, \ - self.dual_coef_, self.intercept_, self.probA_, \ - self.probB_, self.fit_status_ = libsvm.fit( + self.dual_coef_, self.intercept_, self._probA, \ + self._probB, self.fit_status_ = libsvm.fit( X, y, svm_type=solver_type, sample_weight=sample_weight, class_weight=self.class_weight_, kernel=kernel, C=self.C, @@ -270,7 +270,7 @@ def _sparse_fit(self, X, y, sample_weight, solver_type, kernel, self.support_, self.support_vectors_, dual_coef_data, \ self.intercept_, self._n_support, \ - self.probA_, self.probB_, self.fit_status_ = \ + self._probA, self._probB, self.fit_status_ = \ libsvm_sparse.libsvm_sparse_train( X.shape[1], X.data, X.indices, X.indptr, y, solver_type, kernel_type, self.degree, self._gamma, self.coef0, self.tol, @@ -334,7 +334,7 @@ def _dense_predict(self, X): return libsvm.predict( X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, - self.probA_, self.probB_, svm_type=svm_type, kernel=kernel, + self._probA, self._probB, svm_type=svm_type, kernel=kernel, degree=self.degree, coef0=self.coef0, gamma=self._gamma, cache_size=self.cache_size) @@ -359,7 +359,7 @@ def _sparse_predict(self, X): C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, - self.probA_, self.probB_) + self._probA, self._probB) def _compute_kernel(self, X): """Return the data transformed by a callable kernel""" @@ -413,7 +413,7 @@ def _dense_decision_function(self, X): return libsvm.decision_function( X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, - self.probA_, self.probB_, + self._probA, self._probB, svm_type=LIBSVM_IMPL.index(self._impl), kernel=kernel, degree=self.degree, cache_size=self.cache_size, coef0=self.coef0, gamma=self._gamma) @@ -438,7 +438,7 @@ def _sparse_decision_function(self, X): self.C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, - self.probA_, self.probB_) + self._probA, self._probB) def _validate_for_predict(self, X): check_is_fitted(self) @@ -691,7 +691,7 @@ def _dense_predict_proba(self, X): pprob = libsvm.predict_proba( X, self.support_, self.support_vectors_, self._n_support, self._dual_coef_, self._intercept_, - self.probA_, self.probB_, + self._probA, self._probB, svm_type=svm_type, kernel=kernel, degree=self.degree, cache_size=self.cache_size, coef0=self.coef0, gamma=self._gamma) @@ -717,7 +717,7 @@ def _sparse_predict_proba(self, X): self.C, self.class_weight_, self.nu, self.epsilon, self.shrinking, self.probability, self._n_support, - self.probA_, self.probB_) + self._probA, self._probB) def _get_coef(self): if self.dual_coef_.shape[0] == 1: @@ -734,6 +734,14 @@ def _get_coef(self): return coef + @property + def probA_(self): + return self._probA + + @property + def probB_(self): + return self._probB + def _get_liblinear_solver_type(multi_class, penalty, loss, dual): """Find the liblinear magic number for the solver. 
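Every hunk in the _base.py diff above is the same mechanical change: the values returned by libsvm are stored on private _probA/_probB attributes, and the public names become read-only views. A stripped-down sketch of the pattern, with an illustrative class and value rather than sklearn code:

class Estimator:
    def fit(self):
        # internal code writes only to the private attribute
        self._probA = [0.0]
        return self

    @property
    def probA_(self):
        # public, read-only alias over the private attribute; keeping the
        # private name as the single source of truth lets subclasses
        # override this property, e.g. to attach a deprecation warning
        # (see the next diff)
        return self._probA

est = Estimator().fit()
assert est.probA_ == [0.0]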
diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 698acb6ae68b3..d21e8523cac2c 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -8,6 +8,7 @@ from ..utils import check_X_y from ..utils.validation import _num_samples from ..utils.multiclass import check_classification_targets +from ..utils.deprecation import deprecated class LinearSVC(BaseEstimator, LinearClassifierMixin, @@ -968,6 +969,20 @@ def __init__(self, kernel='rbf', degree=3, gamma='scale', shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, max_iter=max_iter, random_state=None) + @deprecated( + "The probA_ attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.") + @property + def probA_(self): + return self._probA + + @deprecated( + "The probB_ attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.") + @property + def probB_(self): + return self._probB + class NuSVR(RegressorMixin, BaseLibSVM): """Nu Support Vector Regression. @@ -1287,3 +1302,17 @@ def predict(self, X): """ y = super().predict(X) return np.asarray(y, dtype=np.intp) + + @deprecated( + "The probA_ attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.") + @property + def probA_(self): + return self._probA + + @deprecated( + "The probB_ attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.") + @property + def probB_(self): + return self._probB diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index d789be7f26383..4360c818e0bd7 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -1233,3 +1233,18 @@ def test_n_support_oneclass_svr(): assert reg.n_support_ == reg.support_vectors_.shape[0] assert reg.n_support_.size == 1 assert reg.n_support_ == 4 + + +# TODO: Remove in 0.25 when probA_ and probB_ are deprecated +@pytest.mark.parametrize("SVMClass, data", [ + (svm.OneClassSVM, (X, )), + (svm.SVR, (X, Y)) +]) +@pytest.mark.parametrize("deprecated_prob", ["probA_", "probB_"]) +def test_svm_probA_proB_deprecated(SVMClass, data, deprecated_prob): + clf = SVMClass().fit(*data) + + msg = ("The {} attribute is deprecated in version 0.23 and will be " + "removed in version 0.25.").format(deprecated_prob) + with pytest.warns(FutureWarning, match=msg): + getattr(clf, deprecated_prob) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 47b96653aa8af..af4f2549ea6a5 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -44,41 +44,42 @@ def test_generalnb_correctness(): [2.7, 3.8, 2.3, 1, 0], [1.7, 0.1, 4.5, 1, 0]]) y = np.array([1, 0, 0]) - clf = GeneralNB([(GaussianNB(), [0, 1, 2]), - (BernoulliNB(), [3, 4])]) - clf.fit(X, y) + clf = GeneralNB() + clf.fit(X, y, [(GaussianNB(), [0, 1, 2]), + (BernoulliNB(), [3, 4])]) print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) print(clf.score([[2.7, 3.8, 1, 0, 1]], [0])) def test_generalnb_input_distributions_not_enough(): - clf = GeneralNB([(GaussianNB(), [0])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [(GaussianNB(), [0])]) def test_generalnb_input_distributions_duplicate(): - clf = GeneralNB([(GaussianNB(), [0, 1]), (GaussianNB(), [1])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [ + (GaussianNB(), [0, 1]), (GaussianNB(), [1])]) def test_generalnb_input_distributions_unknown_distr(): - clf = 
GeneralNB([(GeneralNB((GaussianNB(), [0, 1])), [0, 1])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [(GeneralNB(),[0,1])]) def test_generalnb_wrong_type(): - clf = GeneralNB([[GaussianNB(), [0, 1]]]) - assert_raises(TypeError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(TypeError, clf.fit, X, y, [[GaussianNB(), [0, 1]]]) def test_generalnb_tuple_too_long(): - clf = GeneralNB([(GaussianNB(), [0, 1], [3])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [(GaussianNB(), [0, 1], [3])]) def test_generalnb_wrong_format(): - clf = GeneralNB([(GaussianNB, [0, 1])]) - assert_raises(ValueError, clf.fit, X, y) + clf = GeneralNB() + assert_raises(ValueError, clf.fit, X, y, [(GaussianNB, [0, 1])]) def test_pickle(): diff --git a/sklearn/tree/_export.py b/sklearn/tree/_export.py index 212ae4e309749..3197995818f81 100644 --- a/sklearn/tree/_export.py +++ b/sklearn/tree/_export.py @@ -26,6 +26,7 @@ import warnings + def _color_brew(n): """Generate n colors with equally spaced hues. @@ -174,6 +175,8 @@ def plot_tree(decision_tree, max_depth=None, feature_names=None, """ + check_is_fitted(decision_tree) + if rotate != 'deprecated': warnings.warn(("'rotate' has no effect and is deprecated in 0.23. " "It will be removed in 0.25."), @@ -571,6 +574,7 @@ def _make_tree(self, node_id, et, criterion, depth=0): def export(self, decision_tree, ax=None): import matplotlib.pyplot as plt from matplotlib.text import Annotation + if ax is None: ax = plt.gca() ax.clear() diff --git a/sklearn/tree/tests/test_export.py b/sklearn/tree/tests/test_export.py index f1c080dea4d2a..ad49f81fcf9ac 100644 --- a/sklearn/tree/tests/test_export.py +++ b/sklearn/tree/tests/test_export.py @@ -303,11 +303,11 @@ def test_precision(): # check impurity for finding in finditer(pattern, dot_data): assert (len(search(r"\.\d+", finding.group()).group()) == - precision + 1) + precision + 1) # check threshold for finding in finditer(r"<= \d+\.\d+", dot_data): assert (len(search(r"\.\d+", finding.group()).group()) == - precision + 1) + precision + 1) def test_export_text_errors(): @@ -459,3 +459,12 @@ def test_plot_tree_rotate_deprecation(pyplot): "It will be removed in 0.25.") with pytest.warns(FutureWarning, match=match): plot_tree(tree, rotate=True) + + +def test_not_fitted_tree(pyplot): + + # Testing if not fitted tree throws the correct error + clf = DecisionTreeRegressor() + out = StringIO() + with pytest.raises(NotFittedError): + plot_tree(clf, out) From 09d119e5d3f4a315d19dd69645ac3803c0798d72 Mon Sep 17 00:00:00 2001 From: "Raimi Karim (B4D2D7F7)" Date: Wed, 29 Jan 2020 22:17:46 +0800 Subject: [PATCH 09/61] Update function names --- sklearn/tests/test_naive_bayes.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index af4f2549ea6a5..665a25fd19aff 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -51,33 +51,33 @@ def test_generalnb_correctness(): print(clf.score([[2.7, 3.8, 1, 0, 1]], [0])) -def test_generalnb_input_distributions_not_enough(): +def test_generalnb_distributions_insufficient(): clf = GeneralNB() assert_raises(ValueError, clf.fit, X, y, [(GaussianNB(), [0])]) -def test_generalnb_input_distributions_duplicate(): +def test_generalnb_distributions_duplicate(): clf = GeneralNB() assert_raises(ValueError, clf.fit, X, y, [ (GaussianNB(), [0, 1]), (GaussianNB(), [1])]) 
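# A condensed sketch of the probA_/probB_ deprecation test added in
# test_svm.py a few hunks above (the class and attribute value here are
# illustrative, not sklearn code): sklearn's @deprecated wraps the
# property getter so that attribute access emits a FutureWarning
# carrying the given message.
import pytest
from sklearn.utils.deprecation import deprecated

class Old:
    _probA = [0.0]

    @deprecated("The probA_ attribute is deprecated in version 0.23 and "
                "will be removed in version 0.25.")
    @property
    def probA_(self):
        return self._probA

def test_probA_deprecated():
    with pytest.warns(FutureWarning, match="probA_ attribute is deprecated"):
        Old().probA_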
-def test_generalnb_input_distributions_unknown_distr(): +def test_generalnb_distributions_unknown(): clf = GeneralNB() assert_raises(ValueError, clf.fit, X, y, [(GeneralNB(),[0,1])]) -def test_generalnb_wrong_type(): +def test_generalnb_distributions_wrong_type(): clf = GeneralNB() assert_raises(TypeError, clf.fit, X, y, [[GaussianNB(), [0, 1]]]) -def test_generalnb_tuple_too_long(): +def test_generalnb_distributions_tuple_too_long(): clf = GeneralNB() assert_raises(ValueError, clf.fit, X, y, [(GaussianNB(), [0, 1], [3])]) -def test_generalnb_wrong_format(): +def test_generalnb_distributions_wrong_format(): clf = GeneralNB() assert_raises(ValueError, clf.fit, X, y, [(GaussianNB, [0, 1])]) From e01984caa58f04058299e28e374a6b9262685d30 Mon Sep 17 00:00:00 2001 From: "Raimi Karim (B4D2D7F7)" Date: Wed, 29 Jan 2020 22:37:54 +0800 Subject: [PATCH 10/61] Fixed formatting --- sklearn/naive_bayes.py | 17 +++++++++-------- sklearn/tests/test_naive_bayes.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 39b3b8602fefa..7ba85b033cdc2 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -157,9 +157,9 @@ def fit(self, X, y, distributions): y : array-like, shape (n_samples,) Target values. distributions : list of tuples - A list of (NB, features) tuples, where NB is 'BernoulliNB', 'GaussianNB', - 'MultinomialNB', 'ComplementNB' or 'CategoricalNB', and features is - a list of indices. + A list of (NB, features) tuples, where NB is 'BernoulliNB', + 'GaussianNB', 'MultinomialNB', 'ComplementNB' or 'CategoricalNB', + and features is a list of indices. Returns ------- @@ -175,7 +175,7 @@ def fit(self, X, y, distributions): inits = [(nb, features) for (nb, features) in distributions] self.fits_ = [(nb.fit(X[:, features], y), features) - for (nb, features) in inits] + for (nb, features) in inits] return self @@ -186,9 +186,10 @@ def _joint_log_likelihood(self, X): # For now assume all class log priors are the same for all the NB's # So we'll take the first one. 
- log_priors = [nb.class_log_prior_ - if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) - for (nb, _) in self.fits_] + log_priors = [ + nb.class_log_prior_ + if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) + for (nb, _) in self.fits_] log_prior = log_priors[0] jlls = [nb._joint_log_likelihood(X[:, features]) @@ -233,7 +234,7 @@ def _check_distributions(self, distributions, X): if len(distribution) != 2: raise ValueError("Expected tuple to have length of 2 " + - "but got {}".format(len(distribution))) + "but got {}".format(len(distribution))) nb, features = distribution diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 665a25fd19aff..0f897641204fc 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -64,7 +64,7 @@ def test_generalnb_distributions_duplicate(): def test_generalnb_distributions_unknown(): clf = GeneralNB() - assert_raises(ValueError, clf.fit, X, y, [(GeneralNB(),[0,1])]) + assert_raises(ValueError, clf.fit, X, y, [(GeneralNB(), [0, 1])]) def test_generalnb_distributions_wrong_type(): From 80595b798aa2245134a0d7f28bddf1824a45e499 Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 4 Feb 2020 22:47:49 +0800 Subject: [PATCH 11/61] [WIP] ColumnTransformer-like API --- sklearn/naive_bayes.py | 91 +++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 23 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 7ba85b033cdc2..68d565a2632d2 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -24,12 +24,13 @@ from .preprocessing import binarize from .preprocessing import LabelBinarizer from .preprocessing import label_binarize -from .utils import check_X_y, check_array, deprecated from .utils.extmath import safe_sparse_dot +from .utils import check_X_y, check_array, deprecated, Bunch from .utils.fixes import logsumexp +from .utils.validation import _check_sample_weight +from .utils.metaestimators import _BaseComposition from .utils.multiclass import _check_partial_fit_first_call from .utils.validation import check_is_fitted, check_non_negative, column_or_1d -from .utils.validation import _check_sample_weight __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', 'CategoricalNB', 'GeneralNB'] @@ -115,7 +116,7 @@ def predict_proba(self, X): return np.exp(self.predict_log_proba(X)) -class GeneralNB(_BaseNB): +class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): """General Naive Bayes Read more in the :ref:`User Guide `. @@ -128,25 +129,47 @@ class GeneralNB(_BaseNB): Examples -------- >>> import numpy as np + >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], ... [2.7, 3.8, 2.3, 1, 0], ... [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) - >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB - >>> clf = GeneralNB() - >>> clf.fit(X, y, [(GaussianNB(), [0, 1, 2]), - ... 
(BernoulliNB(), [3, 4])]) - GeneralNB() + >>> clf = GeneralNB([ + >>> ("gaussian", GaussianNB(), [0, 1, 2]), + >>> ("bernoulli", BernoulliNB(), [3, 4]) + >>> ]) + >>> clf.fit(X, y) >>> print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) [1] >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) 1.0 """ - def __init__(self): + def __init__(self, distributions): + self.distributions = distributions self.fits_ = [] - def fit(self, X, y, distributions): + @property + def _distributions(self): + return [(name, distr) for name, distr, _ in self.distributions] + + # TODO doesn't seem right + @_distributions.setter + def _distributions(self, value): + print(self.distributions) + print(list(zip(value, self.distributions))) + self.distributions = [ + (name, distr, col) for ((name, distr), (_, _, col)) + in zip(value, self.distributions)] + + def get_params(self, deep=True): + return self._get_params('_distributions', deep=deep) + + def set_params(self, **kwargs): + self._set_params('_distributions', **kwargs) + return self + + def fit(self, X, y): """Fit Gaussian Naive Bayes according to X, y Parameters @@ -165,14 +188,14 @@ def fit(self, X, y, distributions): ------- self : object """ - self._check_distributions(distributions, X) + self._validate_distributions() X, y = check_X_y(X, y) y = column_or_1d(y, warn=True) # FIXME aggregate all classes and all priors? self.classes_ = np.unique(y) - inits = [(nb, features) for (nb, features) in distributions] + inits = [(nb, features) for (nb, features) in self.distributions] self.fits_ = [(nb.fit(X[:, features], y), features) for (nb, features) in inits] @@ -189,11 +212,11 @@ def _joint_log_likelihood(self, X): log_priors = [ nb.class_log_prior_ if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) - for (nb, _) in self.fits_] + for (nb, _) in self._fits] log_prior = log_priors[0] jlls = [nb._joint_log_likelihood(X[:, features]) - for (nb, features) in self.fits_] + for (nb, features) in self._fits] # jlls has the shape (distribution, sample, class) jlls = np.hstack([jlls]) @@ -205,10 +228,7 @@ def _joint_log_likelihood(self, X): return jll - def _check_X(self, X): - return check_array(X) - - def _check_distributions(self, distributions, X): + def _validate_distributions(self): """Check validity of distributions Distributions should be explicitly specified @@ -221,19 +241,24 @@ def _check_distributions(self, distributions, X): num_cols_expected = X.shape[-1] # Check type - if not isinstance(distributions, list): + if not isinstance(self.distributions, list): raise TypeError( - "Expected list but got {}".format(type(distributions))) + "Expected list but got {}".format(type(self.distributions))) + + names, distributions, _ = zip(*self.distributions) + self._validate_names(names) + + def _validate_nb_callables(self): # Check if all are sklearn classes - for distribution in distributions: + for distribution in self.distributions: if not isinstance(distribution, tuple): raise TypeError( "Expected tuple but got {}".format(type(distribution))) if len(distribution) != 2: - raise ValueError("Expected tuple to have length of 2 " + + raise ValueError("Expected tuple to have length of 2 " "but got {}".format(len(distribution))) nb, features = distribution @@ -253,9 +278,29 @@ def _check_distributions(self, distributions, X): num_cols = len(dict_distribution) if num_cols != num_cols_expected: raise ValueError("Expected {} features".format(num_cols_expected) + - " to have specified distributions " + + " to have specified distributions " "but {} were 
specified.".format(num_cols)) + def _validate_remainder(self): + pass + + def _check_X(self, X): + return check_array(X) + + @property + def named_distributions_(self): + """Access the fitted transformer by name. + + Read-only attribute to access any transformer by given name. + Keys are transformer names and values are the fitted transformer + objects. + + """ + # Use Bunch object to improve autocomplete + return Bunch(**{name: trans for name, trans, _ + in self.distributions}) + + class GaussianNB(_BaseNB): """ From de36f45daba7863be845603fc3782f7512e5d7c3 Mon Sep 17 00:00:00 2001 From: raibosome Date: Wed, 5 Feb 2020 22:25:42 +0800 Subject: [PATCH 12/61] Update generalnb --- sklearn/naive_bayes.py | 103 +++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 41 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 68d565a2632d2..06803f4dfeb7d 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -13,6 +13,7 @@ # Lars Buitinck # Jan Hendrik Metzen # (parts based on earlier work by Mathieu Blondel) +# Raimi Karim # # License: BSD 3 clause import warnings @@ -123,7 +124,8 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): Attributes ---------- - fits_ : list of objects + # TODO + distributions : list of objects list of fitted classifiers Examples @@ -144,18 +146,19 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) 1.0 """ + # TODO check_is_fitted def __init__(self, distributions): self.distributions = distributions - self.fits_ = [] + self._fits = [] @property def _distributions(self): return [(name, distr) for name, distr, _ in self.distributions] - # TODO doesn't seem right @_distributions.setter def _distributions(self, value): + # TODO wrong print(self.distributions) print(list(zip(value, self.distributions))) self.distributions = [ @@ -188,17 +191,17 @@ def fit(self, X, y): ------- self : object """ - self._validate_distributions() + self._validate_distributions(X) X, y = check_X_y(X, y) y = column_or_1d(y, warn=True) # FIXME aggregate all classes and all priors? self.classes_ = np.unique(y) - inits = [(nb, features) for (nb, features) in self.distributions] + inits = [(_, nb, features) for (_, nb, features) in self.distributions] self.fits_ = [(nb.fit(X[:, features], y), features) - for (nb, features) in inits] + for (_, nb, features) in inits] return self @@ -207,18 +210,19 @@ def _joint_log_likelihood(self, X): X = np.array(X) - # For now assume all class log priors are the same for all the NB's - # So we'll take the first one. log_priors = [ nb.class_log_prior_ if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) for (nb, _) in self._fits] + + # Assume all class log priors are the same for all the NB's + # so we'll take the first one. 
log_prior = log_priors[0] jlls = [nb._joint_log_likelihood(X[:, features]) - for (nb, features) in self._fits] + for (_, nb, features) in self._fits] - # jlls has the shape (distribution, sample, class) + # jlls have the shape (distribution, sample, class) jlls = np.hstack([jlls]) # Remove the class log prior from all the distributions @@ -228,70 +232,87 @@ def _joint_log_likelihood(self, X): return jll - def _validate_distributions(self): - """Check validity of distributions + def _validate_distributions(self, X): - Distributions should be explicitly specified - """ valid_modules = copy.copy(__all__) valid_modules.remove("GeneralNB") - dict_distribution = {} + self._dict_distribution = {} - X = np.array(X) - num_cols_expected = X.shape[-1] + _list_fit_prior = [] + _list_class_prior = [] + + names, _, _ = zip(*self.distributions) + self._validate_names(names) # Check type if not isinstance(self.distributions, list): raise TypeError( "Expected list but got {}".format(type(self.distributions))) - names, distributions, _ = zip(*self.distributions) - self._validate_names(names) - - def _validate_nb_callables(self): - - # Check if all are sklearn classes for distribution in self.distributions: + # Check type if not isinstance(distribution, tuple): raise TypeError( - "Expected tuple but got {}".format(type(distribution))) - - if len(distribution) != 2: - raise ValueError("Expected tuple to have length of 2 " + "Expected list of tuples " + "but got list of {}s".format(type(distribution))) + if len(distribution) != 3: + raise ValueError("Expected tuple to have length of 3 " "but got {}".format(len(distribution))) - nb, features = distribution + _, model, features = distribution - if callable(nb): + # Check naive bayes model + if callable(model): raise ValueError("Wrong format specified.") - if nb.__class__.__name__ not in valid_modules: + if not (hasattr(model, "fit") or hasattr(model, "_joint_log_likelihood")): + raise TypeError("Naive bayes model should implement " + "the fit and _joint_log_likelihood methods. 
" + "{} doesn't.".format(type(model))) + if model.__class__.__name__ not in valid_modules: raise ValueError( "Distributions should be one of {}".format(valid_modules)) + + # For checking fit_prior later + _class_prior = getattr(model, "prior", None) or getattr(model, "class_prior", None) + _list_class_prior.append(_class_prior) + + _fit_prior = getattr(model, "fit_prior", True) + _list_fit_prior.append(_fit_prior) + + # Check the feature for feature in features: - if feature in dict_distribution: - raise ValueError( - "Duplicate specification of feature found.") + if feature in self._dict_distribution: + raise ValueError("Duplicate specification of feature found.") else: - dict_distribution[feature] = nb.__class__.__name__.lower() + self._dict_distribution[feature] = model.__class__.__name__.lower() - num_cols = len(dict_distribution) + if len(set(_list_class_prior)) != 1: + raise ValueError("The parameters 'class_prior' or 'prior' " + "must have the same values through out all models " + "if specified.") + + if len(set(_list_fit_prior)) != 1: + raise ValueError("The parameter 'fit_prior' " + "must have the same values through out all models " + "if specified.") + + X = np.array(X) + num_cols_expected = X.shape[-1] + num_cols = len(self._dict_distribution) if num_cols != num_cols_expected: - raise ValueError("Expected {} features".format(num_cols_expected) + - " to have specified distributions " + raise ValueError("Expected {} columns".format(num_cols_expected) + "but {} were specified.".format(num_cols)) - def _validate_remainder(self): - pass def _check_X(self, X): return check_array(X) @property def named_distributions_(self): - """Access the fitted transformer by name. + """Access the fitted models by name. - Read-only attribute to access any transformer by given name. + Read-only attribute to access any distribution by given name. Keys are transformer names and values are the fitted transformer objects. From 63ae994b71b284c55fb788377692bbe9b0bc1c85 Mon Sep 17 00:00:00 2001 From: raibosome Date: Thu, 6 Feb 2020 22:13:22 +0800 Subject: [PATCH 13/61] Fixed bug --- sklearn/naive_bayes.py | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 06803f4dfeb7d..fb7d31bc6c7c0 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -125,8 +125,25 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): Attributes ---------- # TODO - distributions : list of objects - list of fitted classifiers + distributions : list of tuples + List of (name, distribution, column(s)) tuples specifying the + distribution objects to be applied to subsets of the data. + + name : string + Like in Pipeline and ColumnTransformer, this allows the + distribution and its parameters to be set using ``set_params``. + distribution : estimator or {'passthrough', 'drop'} + Estimator must support :term:`fit` and :term:`transform`. + Special-cased strings 'drop' and 'passthrough' are accepted as + well, to indicate to drop the columns or to pass them through + untransformed, respectively. + column(s) : string or int, array-like of string or int, slice, \ +boolean mask array or callable + Indexes the data on its second axis. Integers are interpreted as + positional columns, while strings can reference DataFrame columns + by name. A scalar string or int should be used where + ``transformer`` expects X to be a 1d array-like (vector), + otherwise a 2d array will be passed to the transformer. 
Examples -------- @@ -151,6 +168,7 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): def __init__(self, distributions): self.distributions = distributions self._fits = [] + self._is_fitted = False @property def _distributions(self): @@ -195,14 +213,16 @@ def fit(self, X, y): X, y = check_X_y(X, y) y = column_or_1d(y, warn=True) - # FIXME aggregate all classes and all priors? + # Should be the same after the validation self.classes_ = np.unique(y) inits = [(_, nb, features) for (_, nb, features) in self.distributions] - self.fits_ = [(nb.fit(X[:, features], y), features) + self._fits = [(nb.fit(X[:, features], y), features) for (_, nb, features) in inits] + self._is_fitted = True + return self def _joint_log_likelihood(self, X): @@ -220,7 +240,7 @@ def _joint_log_likelihood(self, X): log_prior = log_priors[0] jlls = [nb._joint_log_likelihood(X[:, features]) - for (_, nb, features) in self._fits] + for (nb, features) in self._fits] # jlls have the shape (distribution, sample, class) jlls = np.hstack([jlls]) @@ -236,7 +256,7 @@ def _validate_distributions(self, X): valid_modules = copy.copy(__all__) valid_modules.remove("GeneralNB") - self._dict_distribution = {} + _dict_distribution = {} _list_fit_prior = [] _list_class_prior = [] @@ -282,10 +302,10 @@ def _validate_distributions(self, X): # Check the feature for feature in features: - if feature in self._dict_distribution: + if feature in _dict_distribution: raise ValueError("Duplicate specification of feature found.") else: - self._dict_distribution[feature] = model.__class__.__name__.lower() + _dict_distribution[feature] = model.__class__.__name__.lower() if len(set(_list_class_prior)) != 1: raise ValueError("The parameters 'class_prior' or 'prior' " @@ -299,12 +319,11 @@ def _validate_distributions(self, X): X = np.array(X) num_cols_expected = X.shape[-1] - num_cols = len(self._dict_distribution) + num_cols = len(_dict_distribution) if num_cols != num_cols_expected: raise ValueError("Expected {} columns".format(num_cols_expected) + "but {} were specified.".format(num_cols)) - def _check_X(self, X): return check_array(X) From d710735562e6187f900161047e1b670db467580c Mon Sep 17 00:00:00 2001 From: raibosome Date: Thu, 6 Feb 2020 22:41:25 +0800 Subject: [PATCH 14/61] Minor fixes --- sklearn/naive_bayes.py | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index fb7d31bc6c7c0..c916a81d3e0d6 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -130,7 +130,7 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): distribution objects to be applied to subsets of the data. name : string - Like in Pipeline and ColumnTransformer, this allows the + Like in Pipeline and ColumnTransformer, this allows the distribution and its parameters to be set using ``set_params``. distribution : estimator or {'passthrough', 'drop'} Estimator must support :term:`fit` and :term:`transform`. 
@@ -164,6 +164,7 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): 1.0 """ # TODO check_is_fitted + # TODO consider jll for each estimator def __init__(self, distributions): self.distributions = distributions @@ -171,23 +172,20 @@ def __init__(self, distributions): self._is_fitted = False @property - def _distributions(self): + def distributions_(self): return [(name, distr) for name, distr, _ in self.distributions] - @_distributions.setter - def _distributions(self, value): - # TODO wrong - print(self.distributions) - print(list(zip(value, self.distributions))) + @distributions_.setter + def distributions_(self, value): self.distributions = [ (name, distr, col) for ((name, distr), (_, _, col)) in zip(value, self.distributions)] def get_params(self, deep=True): - return self._get_params('_distributions', deep=deep) + return self._get_params('distributions_', deep=deep) def set_params(self, **kwargs): - self._set_params('_distributions', **kwargs) + self._set_params('distributions_', **kwargs) return self def fit(self, X, y): @@ -216,12 +214,10 @@ def fit(self, X, y): # Should be the same after the validation self.classes_ = np.unique(y) - inits = [(_, nb, features) for (_, nb, features) in self.distributions] + inits = [(nb, features) for (_, nb, features) in self.distributions] self._fits = [(nb.fit(X[:, features], y), features) - for (_, nb, features) in inits] - - self._is_fitted = True + for (nb, features) in inits] return self @@ -280,12 +276,13 @@ def _validate_distributions(self, X): raise ValueError("Expected tuple to have length of 3 " "but got {}".format(len(distribution))) - _, model, features = distribution + name, model, features = distribution # Check naive bayes model if callable(model): raise ValueError("Wrong format specified.") - if not (hasattr(model, "fit") or hasattr(model, "_joint_log_likelihood")): + if not (hasattr(model, "fit") + or hasattr(model, "_joint_log_likelihood")): raise TypeError("Naive bayes model should implement " "the fit and _joint_log_likelihood methods. 
" "{} doesn't.".format(type(model))) @@ -294,7 +291,8 @@ def _validate_distributions(self, X): "Distributions should be one of {}".format(valid_modules)) # For checking fit_prior later - _class_prior = getattr(model, "prior", None) or getattr(model, "class_prior", None) + _class_prior = getattr(model, "prior", None) or + getattr(model, "class_prior", None) _list_class_prior.append(_class_prior) _fit_prior = getattr(model, "fit_prior", True) @@ -309,12 +307,12 @@ def _validate_distributions(self, X): if len(set(_list_class_prior)) != 1: raise ValueError("The parameters 'class_prior' or 'prior' " - "must have the same values through out all models " + "must be the same values throughout all models " "if specified.") if len(set(_list_fit_prior)) != 1: raise ValueError("The parameter 'fit_prior' " - "must have the same values through out all models " + "must be the same values through out all models " "if specified.") X = np.array(X) From adc68a01326f9762fcaef57b5093fc6816d4c952 Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 9 Feb 2020 13:01:17 +0800 Subject: [PATCH 15/61] Added support for pandas df --- sklearn/naive_bayes.py | 208 +++++++++++++++++++++++++++++++---------- 1 file changed, 158 insertions(+), 50 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index c916a81d3e0d6..1260f05e31560 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -22,11 +22,12 @@ import numpy as np from .base import BaseEstimator, ClassifierMixin +from .exceptions import NotFittedError from .preprocessing import binarize from .preprocessing import LabelBinarizer from .preprocessing import label_binarize from .utils.extmath import safe_sparse_dot -from .utils import check_X_y, check_array, deprecated, Bunch +from .utils import check_X_y, check_array, deprecated, Bunch, _safe_indexing from .utils.fixes import logsumexp from .utils.validation import _check_sample_weight from .utils.metaestimators import _BaseComposition @@ -122,7 +123,7 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): Read more in the :ref:`User Guide `. - Attributes + Parameters ---------- # TODO distributions : list of tuples @@ -145,49 +146,65 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): ``transformer`` expects X to be a 1d array-like (vector), otherwise a 2d array will be passed to the transformer. + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) + class labels known to the classifier + + n_features_ : int + Number of features of each sample. + Examples -------- >>> import numpy as np + >>> import pandas as pd >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB + >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], ... [2.7, 3.8, 2.3, 1, 0], ... 
[1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) + >>> X_test = np.array([[1.5, 2.3, 5.7, 0, 1]]) + >>> clf = GeneralNB([ >>> ("gaussian", GaussianNB(), [0, 1, 2]), >>> ("bernoulli", BernoulliNB(), [3, 4]) >>> ]) >>> clf.fit(X, y) - >>> print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) + >>> print(clf.predict(X_test)) + [1] + >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) + 1.0 + + >>> df = pd.DataFrame(X) + >>> df.columns = list("abcde") + >>> df["y"] = [1, 0, 0] + >>> clf = GeneralNB([ + >>> ("gaussian", GaussianNB(), ["a", "b", "c"]), + >>> ("bernoulli", BernoulliNB(), ["d", "e"]) + >>> ]) + >>> clf.fit(df.iloc[:,:-1], df["y"]) + + >>> df_test = pd.DataFrame(X_test) + >>> df_test.columns = list("abcde") + >>> print(clf.predict(df_test)) [1] >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) 1.0 """ - # TODO check_is_fitted # TODO consider jll for each estimator + # TODO unify variable names with similar meaning + # ("distribution", "model", "nb"), ("column", "feature") def __init__(self, distributions): self.distributions = distributions - self._fits = [] + self.classes_ = None + self.n_features_ = None + self._fits = None + self._columns = None + self._df_columns = None self._is_fitted = False - @property - def distributions_(self): - return [(name, distr) for name, distr, _ in self.distributions] - - @distributions_.setter - def distributions_(self, value): - self.distributions = [ - (name, distr, col) for ((name, distr), (_, _, col)) - in zip(value, self.distributions)] - - def get_params(self, deep=True): - return self._get_params('distributions_', deep=deep) - - def set_params(self, **kwargs): - self._set_params('distributions_', **kwargs) - return self - def fit(self, X, y): """Fit Gaussian Naive Bayes according to X, y @@ -198,56 +215,91 @@ def fit(self, X, y): and n_features is the number of features. y : array-like, shape (n_samples,) Target values. - distributions : list of tuples - A list of (NB, features) tuples, where NB is 'BernoulliNB', - 'GaussianNB', 'MultinomialNB', 'ComplementNB' or 'CategoricalNB', - and features is a list of indices. Returns ------- self : object """ self._validate_distributions(X) - X, y = check_X_y(X, y) - y = column_or_1d(y, warn=True) + X, y = self._check_X_y(X, y) - # Should be the same after the validation self.classes_ = np.unique(y) - inits = [(nb, features) for (_, nb, features) in self.distributions] + self._fits = [ + (nb.fit(_safe_indexing(X, features, axis=1), y), features) + for (_, nb, _), features + in zip(self.distributions, self._columns)] - self._fits = [(nb.fit(X[:, features], y), features) - for (nb, features) in inits] + self._is_fitted = True return self def _joint_log_likelihood(self, X): - """Calculate the posterior log probability of the samples X""" + """Calculate the posterior log probability of the samples X - X = np.array(X) + Parameters + ---------- + X : ndarray + + Returns + ------- + jll : ndarray, shape (1, n_classes) + Raises + ------ + NotFittedError + If estimators have not been fitted + """ + if not self._is_fitted: + raise NotFittedError("Call the fit() method first " + "before calling predict().") + + # Obtain the log priors from each fitted estimator log_priors = [ nb.class_log_prior_ if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) - for (nb, _) in self._fits] + for nb, _ in self._fits] - # Assume all class log priors are the same for all the NB's - # so we'll take the first one. 
- log_prior = log_priors[0] + # Take any class log prior from the estimators + if np.allclose(*log_priors): + log_prior = log_priors[0] + else: + raise ValueError("Class priors for every estimator " + "must be the same.") - jlls = [nb._joint_log_likelihood(X[:, features]) + # Obtain the jll of each fitted estimator + jlls = [nb._joint_log_likelihood( + np.array(_safe_indexing(X, features, axis=1))) for (nb, features) in self._fits] - # jlls have the shape (distribution, sample, class) + # Stack these jlls to give us + # the shape (distribution, sample, class) jlls = np.hstack([jlls]) - # Remove the class log prior from all the distributions + # Subtract the class log prior from all the jlls # but add it back after the summation jlls = jlls - log_prior jll = jlls.sum(axis=0) + log_prior return jll + @property + def distributions_(self): + return [(name, distr) for name, distr, _ in self.distributions] + + @distributions_.setter + def distributions_(self, value): + self.distributions = [ + (name, distr, col) for ((name, distr), (_, _, col)) + in zip(value, self.distributions)] + + def get_params(self, deep=True): + return self._get_params('distributions_', deep=deep) + + def set_params(self, **kwargs): + self._set_params('distributions_', **kwargs) + return self + def _validate_distributions(self, X): valid_modules = copy.copy(__all__) @@ -260,6 +312,8 @@ def _validate_distributions(self, X): names, _, _ = zip(*self.distributions) self._validate_names(names) + self._validate_column_callables(X) + # Check type if not isinstance(self.distributions, list): raise TypeError( @@ -291,8 +345,7 @@ def _validate_distributions(self, X): "Distributions should be one of {}".format(valid_modules)) # For checking fit_prior later - _class_prior = getattr(model, "prior", None) or - getattr(model, "class_prior", None) + _class_prior = getattr(model, "prior", None) or getattr(model, "class_prior", None) _list_class_prior.append(_class_prior) _fit_prior = getattr(model, "fit_prior", True) @@ -315,15 +368,70 @@ def _validate_distributions(self, X): "must be the same values through out all models " "if specified.") - X = np.array(X) - num_cols_expected = X.shape[-1] - num_cols = len(_dict_distribution) - if num_cols != num_cols_expected: - raise ValueError("Expected {} columns".format(num_cols_expected) + - "but {} were specified.".format(num_cols)) + n_features = X.shape[-1] + n_cols = len(_dict_distribution) + if n_cols != n_features: + raise ValueError("Expected {} columns".format(n_features) + + " in X but {} were specified.".format(n_cols)) + self.n_features_ = n_features + + def _validate_column_callables(self, X): + """ + Converts callable column specifications. + """ + columns = [] + for _, _, column in self.distributions: + if callable(column): + column = column(X) + columns.append(column) + self._columns = columns + + def _check_X_y(self, X, y): + """ + Validate data inputs. + + Parameters + ---------- + X ([type]): [description] + y ([type]): [description] + + Returns + ------- + X: array-like + Validated data input X + y: array-like + Validated data input y + """ + if hasattr(X, "columns"): + self._df_columns = X.columns + + return X, y def _check_X(self, X): - return check_array(X) + """ + Checks for data. This validation will be executed + before calculating the joint log likelihood. 
+ + Parameters + ---------- + X : ndarray + + Returns + ------- + X: ndarray + validated data + """ + # Check pandas.DataFrame + if self._df_columns is not None: + + if not hasattr(X, "columns"): + raise TypeError("X should be a dataframe") + + if not all(self._df_columns == X.columns): + raise ValueError("Column names must match with " + "column names of fitted data.") + + return X @property def named_distributions_(self): From 7d9814b2b66f553baa071cc0c93d790998526c4b Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 9 Feb 2020 13:10:57 +0800 Subject: [PATCH 16/61] Update docs --- sklearn/naive_bayes.py | 53 ++++++++---------------------------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 1260f05e31560..97883e11980f9 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -125,24 +125,21 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): Parameters ---------- - # TODO distributions : list of tuples List of (name, distribution, column(s)) tuples specifying the - distribution objects to be applied to subsets of the data. + assumptions of distribution on the features to apply + naive Bayes on subsets of the data. name : string Like in Pipeline and ColumnTransformer, this allows the distribution and its parameters to be set using ``set_params``. - distribution : estimator or {'passthrough', 'drop'} - Estimator must support :term:`fit` and :term:`transform`. - Special-cased strings 'drop' and 'passthrough' are accepted as - well, to indicate to drop the columns or to pass them through - untransformed, respectively. - column(s) : string or int, array-like of string or int, slice, \ -boolean mask array or callable + distribution : Estimator + Estimator must support :term:`fit`, :term:`predict` + and :term:`_joint_log_likelihood`. + column(s) : array-like of string or int, slice Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns - by name. A scalar string or int should be used where + by name. A scalar string or int should be used where ``transformer`` expects X to be a 1d array-like (vector), otherwise a 2d array will be passed to the transformer. @@ -221,7 +218,7 @@ def fit(self, X, y): self : object """ self._validate_distributions(X) - X, y = self._check_X_y(X, y) + self._check_X_y(X, y) self.classes_ = np.unique(y) @@ -387,40 +384,10 @@ def _validate_column_callables(self, X): self._columns = columns def _check_X_y(self, X, y): - """ - Validate data inputs. - - Parameters - ---------- - X ([type]): [description] - y ([type]): [description] - - Returns - ------- - X: array-like - Validated data input X - y: array-like - Validated data input y - """ if hasattr(X, "columns"): self._df_columns = X.columns - return X, y - def _check_X(self, X): - """ - Checks for data. This validation will be executed - before calculating the joint log likelihood. - - Parameters - ---------- - X : ndarray - - Returns - ------- - X: ndarray - validated data - """ # Check pandas.DataFrame if self._df_columns is not None: @@ -430,9 +397,9 @@ def _check_X(self, X): if not all(self._df_columns == X.columns): raise ValueError("Column names must match with " "column names of fitted data.") - - return X + return X + @property def named_distributions_(self): """Access the fitted models by name. 
From 3bacdfa036c977a393b7af62920bc06cb499117f Mon Sep 17 00:00:00 2001
From: raibosome 
Date: Sun, 9 Feb 2020 17:12:56 +0800
Subject: [PATCH 17/61] Updated docstring

---
 sklearn/naive_bayes.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
index 97883e11980f9..636f9681c9647 100644
--- a/sklearn/naive_bayes.py
+++ b/sklearn/naive_bayes.py
@@ -119,7 +119,16 @@ def predict_proba(self, X):
 
 
 class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin):
-    """General Naive Bayes
+    """General Naive Bayes for multiple probability distributions
+
+    The General Naive Bayes classifier is a metaestimator that allows
+    a different distributional assumption (Bernoulli, Gaussian,
+    Multinomial or Categorical) for each subset of the features.
+
+    This is possible because naive Bayes treats the features as
+    conditionally independent given the class, so the joint likelihood
+    factorizes into a product over features; each constituent estimator
+    is fitted on its own subset of the features.
 
     Read more in the :ref:`User Guide `.
 

From b660df3105bcd9a5291afa1399cb18dd5d296b54 Mon Sep 17 00:00:00 2001
From: raibosome 
Date: Sun, 9 Feb 2020 17:47:04 +0800
Subject: [PATCH 18/61] Renamed variables

---
 sklearn/naive_bayes.py | 119 +++++++++++++++++++++--------------------
 1 file changed, 60 insertions(+), 59 deletions(-)

diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
index 636f9681c9647..f4561f3712da4 100644
--- a/sklearn/naive_bayes.py
+++ b/sklearn/naive_bayes.py
@@ -134,15 +134,15 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin):
 
     Parameters
     ----------
-    distributions : list of tuples
-        List of (name, distribution, column(s)) tuples specifying the
+    models : list of tuples
+        List of (name, naive bayes estimator, column(s)) tuples specifying the
         distributional assumption under which naive Bayes is applied
         to each subset of the features.
 
         name : string
             Like in Pipeline and ColumnTransformer, this allows the
             distribution and its parameters to be set using ``set_params``.
-        distribution : Estimator
+        naive bayes model : Estimator
            Estimator must support :term:`fit`, :term:`predict`
            and :term:`_joint_log_likelihood`.
column(s) : array-like of string or int, slice @@ -200,13 +200,13 @@ class labels known to the classifier """ # TODO consider jll for each estimator # TODO unify variable names with similar meaning - # ("distribution", "model", "nb"), ("column", "feature") + # ("column", "feature") - def __init__(self, distributions): - self.distributions = distributions + def __init__(self, models): + self.models = models + self.models_ = None self.classes_ = None self.n_features_ = None - self._fits = None self._columns = None self._df_columns = None self._is_fitted = False @@ -226,15 +226,15 @@ def fit(self, X, y): ------- self : object """ - self._validate_distributions(X) + self._validate_models(X) self._check_X_y(X, y) self.classes_ = np.unique(y) - self._fits = [ - (nb.fit(_safe_indexing(X, features, axis=1), y), features) - for (_, nb, _), features - in zip(self.distributions, self._columns)] + self.models_ = [ + (name, nb_model.fit(_safe_indexing(X, features, axis=1), y), features) + for (name, nb_model, _), features + in zip(self.models, self._columns)] self._is_fitted = True @@ -261,25 +261,26 @@ def _joint_log_likelihood(self, X): "before calling predict().") # Obtain the log priors from each fitted estimator - log_priors = [ + all_log_priors = [ nb.class_log_prior_ - if hasattr(nb, 'class_log_prior_') else np.log(nb.class_prior_) - for nb, _ in self._fits] + if hasattr(nb_model, 'class_log_prior_') else np.log(nb.class_prior_) + for _, nb_model, _ in self.models_] # Take any class log prior from the estimators - if np.allclose(*log_priors): - log_prior = log_priors[0] + all_log_priors = np.hstack(all_log_priors) + if np.max(np.ptp(all_log_priors, axis=1)) < 1e-8: + log_prior = all_log_priors[0] else: raise ValueError("Class priors for every estimator " "must be the same.") # Obtain the jll of each fitted estimator - jlls = [nb._joint_log_likelihood( + jlls = [nb_model._joint_log_likelihood( np.array(_safe_indexing(X, features, axis=1))) - for (nb, features) in self._fits] + for (_, nb_model, features) in self.models_] # Stack these jlls to give us - # the shape (distribution, sample, class) + # the shape (estimator, sample, class) jlls = np.hstack([jlls]) # Subtract the class log prior from all the jlls @@ -290,92 +291,92 @@ def _joint_log_likelihood(self, X): return jll @property - def distributions_(self): - return [(name, distr) for name, distr, _ in self.distributions] + def models(self): + return self.models - @distributions_.setter - def distributions_(self, value): - self.distributions = [ - (name, distr, col) for ((name, distr), (_, _, col)) - in zip(value, self.distributions)] + @models.setter + def models(self, value): + self.models = [(name, nb_model, col) + for (name, nb_model, col) in value] + self._is_fitted = False def get_params(self, deep=True): - return self._get_params('distributions_', deep=deep) + return self._get_params('models_', deep=deep) def set_params(self, **kwargs): - self._set_params('distributions_', **kwargs) + self._set_params('models_', **kwargs) return self - def _validate_distributions(self, X): + def _validate_models(self, X): valid_modules = copy.copy(__all__) valid_modules.remove("GeneralNB") - _dict_distribution = {} + _dict_model = {} _list_fit_prior = [] _list_class_prior = [] - names, _, _ = zip(*self.distributions) + names, _, _ = zip(*self.models) self._validate_names(names) self._validate_column_callables(X) # Check type - if not isinstance(self.distributions, list): + if not isinstance(self.models, list): raise TypeError( - "Expected list but got 
{}".format(type(self.distributions))) + "Expected list but got {}".format(type(self.models))) - for distribution in self.distributions: + for model in self.models: # Check type - if not isinstance(distribution, tuple): + if not isinstance(model, tuple): raise TypeError( "Expected list of tuples " - "but got list of {}s".format(type(distribution))) - if len(distribution) != 3: + "but got list of {}s".format(type(model))) + if len(model) != 3: raise ValueError("Expected tuple to have length of 3 " - "but got {}".format(len(distribution))) + "but got {}".format(len(model))) - name, model, features = distribution + name, estimator, features = model - # Check naive bayes model - if callable(model): + # Check naive bayes estimator + if callable(estimator): raise ValueError("Wrong format specified.") - if not (hasattr(model, "fit") - or hasattr(model, "_joint_log_likelihood")): - raise TypeError("Naive bayes model should implement " + if not (hasattr(estimator, "fit") + or hasattr(estimator, "_joint_log_likelihood")): + raise TypeError("Naive bayes estimator should implement " "the fit and _joint_log_likelihood methods. " - "{} doesn't.".format(type(model))) - if model.__class__.__name__ not in valid_modules: + "{} doesn't.".format(type(estimator))) + if estimator.__class__.__name__ not in valid_modules: raise ValueError( "Distributions should be one of {}".format(valid_modules)) # For checking fit_prior later - _class_prior = getattr(model, "prior", None) or getattr(model, "class_prior", None) + _class_prior = getattr(estimator, "prior", None) or getattr(estimator, "class_prior", None) _list_class_prior.append(_class_prior) - _fit_prior = getattr(model, "fit_prior", True) + _fit_prior = getattr(estimator, "fit_prior", True) _list_fit_prior.append(_fit_prior) # Check the feature for feature in features: - if feature in _dict_distribution: + if feature in _dict_model: raise ValueError("Duplicate specification of feature found.") else: - _dict_distribution[feature] = model.__class__.__name__.lower() + _dict_model[feature] = estimator.__class__.__name__.lower() if len(set(_list_class_prior)) != 1: raise ValueError("The parameters 'class_prior' or 'prior' " - "must be the same values throughout all models " + "must be the same values throughout all estimators " "if specified.") if len(set(_list_fit_prior)) != 1: raise ValueError("The parameter 'fit_prior' " - "must be the same values through out all models " + "must be the same values through out all estimators " "if specified.") n_features = X.shape[-1] - n_cols = len(_dict_distribution) + n_cols = len(_dict_model) if n_cols != n_features: raise ValueError("Expected {} columns".format(n_features) + " in X but {} were specified.".format(n_cols)) @@ -386,7 +387,7 @@ def _validate_column_callables(self, X): Converts callable column specifications. """ columns = [] - for _, _, column in self.distributions: + for _, _, column in self.models: if callable(column): column = column(X) columns.append(column) @@ -410,8 +411,8 @@ def _check_X(self, X): return X @property - def named_distributions_(self): - """Access the fitted models by name. + def named_models_(self): + """Access the fitted estimators by name. Read-only attribute to access any distribution by given name. 
Keys are transformer names and values are the fitted transformer @@ -419,8 +420,8 @@ def named_distributions_(self): """ # Use Bunch object to improve autocomplete - return Bunch(**{name: trans for name, trans, _ - in self.distributions}) + return Bunch(**{name: estimator for name, estimator, _ + in self.models}) From 531fea890a9972042fa5db98e6b75f4d902235ad Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 9 Feb 2020 17:48:33 +0800 Subject: [PATCH 19/61] Removed getter and setter --- sklearn/naive_bayes.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index f4561f3712da4..e18136ecd5e55 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -290,16 +290,6 @@ def _joint_log_likelihood(self, X): return jll - @property - def models(self): - return self.models - - @models.setter - def models(self, value): - self.models = [(name, nb_model, col) - for (name, nb_model, col) in value] - self._is_fitted = False - def get_params(self, deep=True): return self._get_params('models_', deep=deep) From fa726850b43232944e7a883e3a44a143c5b1676d Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 9 Feb 2020 18:25:10 +0800 Subject: [PATCH 20/61] Refactored variable names --- sklearn/naive_bayes.py | 58 +++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index e18136ecd5e55..6302038ba36e0 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -207,7 +207,7 @@ def __init__(self, models): self.models_ = None self.classes_ = None self.n_features_ = None - self._columns = None + self._cols = None self._df_columns = None self._is_fitted = False @@ -232,9 +232,9 @@ def fit(self, X, y): self.classes_ = np.unique(y) self.models_ = [ - (name, nb_model.fit(_safe_indexing(X, features, axis=1), y), features) - for (name, nb_model, _), features - in zip(self.models, self._columns)] + (name, nb_model.fit(_safe_indexing(X, cols, axis=1), y), cols) + for (name, nb_model, _), cols + in zip(self.models, self._cols)] self._is_fitted = True @@ -262,13 +262,13 @@ def _joint_log_likelihood(self, X): # Obtain the log priors from each fitted estimator all_log_priors = [ - nb.class_log_prior_ - if hasattr(nb_model, 'class_log_prior_') else np.log(nb.class_prior_) + nb_model.class_log_prior_ + if hasattr(nb_model, 'class_log_prior_') else np.log(nb_model.class_prior_) for _, nb_model, _ in self.models_] - # Take any class log prior from the estimators - all_log_priors = np.hstack(all_log_priors) - if np.max(np.ptp(all_log_priors, axis=1)) < 1e-8: + # Ensure class log priors are the same for all estimators + all_log_priors = np.hstack([all_log_priors]) + if np.max(np.ptp(all_log_priors, axis=0)) < 1e-6: log_prior = all_log_priors[0] else: raise ValueError("Class priors for every estimator " @@ -276,8 +276,8 @@ def _joint_log_likelihood(self, X): # Obtain the jll of each fitted estimator jlls = [nb_model._joint_log_likelihood( - np.array(_safe_indexing(X, features, axis=1))) - for (_, nb_model, features) in self.models_] + np.array(_safe_indexing(X, cols, axis=1))) + for (_, nb_model, cols) in self.models_] # Stack these jlls to give us # the shape (estimator, sample, class) @@ -290,11 +290,21 @@ def _joint_log_likelihood(self, X): return jll + @property + def _models(self): + return [(name, model) for name, model, _ in self.models] + + @_models.setter + def _models(self, value): + self.models = [ + (name, nb_model, cols) for ((name, nb_model), (_, 
_, cols)) + in zip(value, self.models)] + def get_params(self, deep=True): - return self._get_params('models_', deep=deep) + return self._get_params('_models', deep=deep) def set_params(self, **kwargs): - self._set_params('models_', **kwargs) + self._set_params('_models', **kwargs) return self def _validate_models(self, X): @@ -327,7 +337,7 @@ def _validate_models(self, X): raise ValueError("Expected tuple to have length of 3 " "but got {}".format(len(model))) - name, estimator, features = model + name, estimator, cols = model # Check naive bayes estimator if callable(estimator): @@ -349,11 +359,11 @@ def _validate_models(self, X): _list_fit_prior.append(_fit_prior) # Check the feature - for feature in features: - if feature in _dict_model: - raise ValueError("Duplicate specification of feature found.") + for col in cols: + if col in _dict_model: + raise ValueError("Duplicate specification of col found.") else: - _dict_model[feature] = estimator.__class__.__name__.lower() + _dict_model[col] = estimator.__class__.__name__.lower() if len(set(_list_class_prior)) != 1: raise ValueError("The parameters 'class_prior' or 'prior' " @@ -376,12 +386,12 @@ def _validate_column_callables(self, X): """ Converts callable column specifications. """ - columns = [] - for _, _, column in self.models: - if callable(column): - column = column(X) - columns.append(column) - self._columns = columns + cols = [] + for _, _, col in self.models: + if callable(col): + col = col(X) + cols.append(col) + self._cols = cols def _check_X_y(self, X, y): if hasattr(X, "columns"): From 360cfda4ddea2a61d0f2cbc600041238d7ca86c6 Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 11 Feb 2020 20:25:42 +0800 Subject: [PATCH 21/61] Renamed variable names --- sklearn/naive_bayes.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 6302038ba36e0..8bd2cbf3cb226 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -208,7 +208,7 @@ def __init__(self, models): self.classes_ = None self.n_features_ = None self._cols = None - self._df_columns = None + self._df_cols = None self._is_fitted = False def fit(self, X, y): @@ -386,25 +386,24 @@ def _validate_column_callables(self, X): """ Converts callable column specifications. 
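        A callable entry is called with ``X`` and must return the column
        indices or names it selects; for example, a selector built with
        ``sklearn.compose.make_column_selector(pattern=...)`` can be
        passed here and is resolved against ``X`` at fit time.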
""" - cols = [] - for _, _, col in self.models: - if callable(col): - col = col(X) - cols.append(col) - self._cols = cols + self._cols = [] + for _, _, cols in self.models: + if callable(cols): + cols = cols(X) + self._cols.append(cols) def _check_X_y(self, X, y): if hasattr(X, "columns"): - self._df_columns = X.columns + self._df_cols = X.columns def _check_X(self, X): # Check pandas.DataFrame - if self._df_columns is not None: + if self._df_cols is not None: if not hasattr(X, "columns"): raise TypeError("X should be a dataframe") - if not all(self._df_columns == X.columns): + if not all(self._df_cols == X.columns): raise ValueError("Column names must match with " "column names of fitted data.") From 7c11ea0f10d07ba2c4819e9ee6b3825408aff6ac Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 11 Feb 2020 20:59:08 +0800 Subject: [PATCH 22/61] Moved _validate_callables into _validate_models --- sklearn/naive_bayes.py | 56 ++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 8bd2cbf3cb226..33a19bf54ec69 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -311,24 +311,23 @@ def _validate_models(self, X): valid_modules = copy.copy(__all__) valid_modules.remove("GeneralNB") - _dict_model = {} - _list_fit_prior = [] - _list_class_prior = [] - - names, _, _ = zip(*self.models) - self._validate_names(names) - - self._validate_column_callables(X) + self._cols = [] + dict_col2model = {} + all_fit_priors = [] + all_class_priors = [] # Check type if not isinstance(self.models, list): raise TypeError( "Expected list but got {}".format(type(self.models))) + names, _, _ = zip(*self.models) + self._validate_names(names) + for model in self.models: - # Check type + # Check type of each entry in list if not isinstance(model, tuple): raise TypeError( "Expected list of tuples " @@ -337,9 +336,10 @@ def _validate_models(self, X): raise ValueError("Expected tuple to have length of 3 " "but got {}".format(len(model))) - name, estimator, cols = model + _, estimator, cols = model - # Check naive bayes estimator + # Check naive bayes estimator for format + # `fit` and `_joint_log_likelihood` attributes if callable(estimator): raise ValueError("Wrong format specified.") if not (hasattr(estimator, "fit") @@ -351,32 +351,39 @@ def _validate_models(self, X): raise ValueError( "Distributions should be one of {}".format(valid_modules)) - # For checking fit_prior later - _class_prior = getattr(estimator, "prior", None) or getattr(estimator, "class_prior", None) - _list_class_prior.append(_class_prior) - - _fit_prior = getattr(estimator, "fit_prior", True) - _list_fit_prior.append(_fit_prior) + # Check naive bayes estimator for attributes + # `prior` and `fit_prior` + class_prior = getattr(estimator, "prior", None) or getattr(estimator, "class_prior", None) + fit_prior = getattr(estimator, "fit_prior", True) + all_class_priors.append(class_prior) + all_fit_priors.append(fit_prior) - # Check the feature + # Check the columns for duplicate models + # and convert to list if callable + if callable(cols): + cols = cols(X) for col in cols: - if col in _dict_model: + if col in dict_col2model: raise ValueError("Duplicate specification of col found.") else: - _dict_model[col] = estimator.__class__.__name__.lower() + dict_col2model[col] = estimator.__class__.__name__.lower() + self._cols.append(cols) - if len(set(_list_class_prior)) != 1: + # Check if class priors are the same throughout all estimators + if 
len(set(all_class_priors)) != 1: raise ValueError("The parameters 'class_prior' or 'prior' " "must be the same values throughout all estimators " "if specified.") - if len(set(_list_fit_prior)) != 1: + # FIXME really? + # Check if class priors are the same throughout all estimators + if len(set(all_fit_priors)) != 1: raise ValueError("The parameter 'fit_prior' " "must be the same values through out all estimators " "if specified.") n_features = X.shape[-1] - n_cols = len(_dict_model) + n_cols = len(dict_col2model) if n_cols != n_features: raise ValueError("Expected {} columns".format(n_features) + " in X but {} were specified.".format(n_cols)) @@ -384,7 +391,7 @@ def _validate_models(self, X): def _validate_column_callables(self, X): """ - Converts callable column specifications. + Preprocess callable column specifications for later use """ self._cols = [] for _, _, cols in self.models: @@ -393,6 +400,7 @@ def _validate_column_callables(self, X): self._cols.append(cols) def _check_X_y(self, X, y): + # Delay further checks on X y to the respective estimators if hasattr(X, "columns"): self._df_cols = X.columns From b067e079acba3f537d1a711ff92475456ec613de Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 11 Feb 2020 21:04:31 +0800 Subject: [PATCH 23/61] Added temporary docstring for callables --- sklearn/naive_bayes.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 33a19bf54ec69..5ee45900bad10 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -165,12 +165,18 @@ class labels known to the classifier >>> import numpy as np >>> import pandas as pd >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB + >>> from sklearn.compose import make_column_selector >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], - ... [2.7, 3.8, 2.3, 1, 0], - ... 
[1.7, 0.1, 4.5, 1, 0]]) + >>> [2.7, 3.8, 2.3, 1, 0], + >>> [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) >>> X_test = np.array([[1.5, 2.3, 5.7, 0, 1]]) + >>> df = pd.DataFrame(X) + >>> df.columns = list("abcde") + >>> df["y"] = [1, 0, 0] + >>> df_test = pd.DataFrame(X_test) + >>> df_test.columns = list("abcde") >>> clf = GeneralNB([ >>> ("gaussian", GaussianNB(), [0, 1, 2]), @@ -182,17 +188,21 @@ class labels known to the classifier >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) 1.0 - >>> df = pd.DataFrame(X) - >>> df.columns = list("abcde") - >>> df["y"] = [1, 0, 0] >>> clf = GeneralNB([ >>> ("gaussian", GaussianNB(), ["a", "b", "c"]), >>> ("bernoulli", BernoulliNB(), ["d", "e"]) >>> ]) >>> clf.fit(df.iloc[:,:-1], df["y"]) + >>> print(clf.predict(df_test)) + [1] + >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) + 1.0 - >>> df_test = pd.DataFrame(X_test) - >>> df_test.columns = list("abcde") + >>> clf = GeneralNB([ + >>> ("gaussian", GaussianNB(), make_column_selector(pattern=r"[abc]")), + >>> ("bernoulli", BernoulliNB(), make_column_selector(pattern=r"[de]")) + >>> ]) + >>> clf.fit(df.iloc[:,:-1], df["y"]) >>> print(clf.predict(df_test)) [1] >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) From 6d510b0388618c9b8536b8d830d9db6a711ebd1f Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 11 Feb 2020 21:06:27 +0800 Subject: [PATCH 24/61] Removed _validate_column_callables --- sklearn/naive_bayes.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 5ee45900bad10..25ee8cce804bd 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -327,7 +327,7 @@ def _validate_models(self, X): all_fit_priors = [] all_class_priors = [] - # Check type + # Check type of `models` parameter if not isinstance(self.models, list): raise TypeError( "Expected list but got {}".format(type(self.models))) @@ -398,16 +398,6 @@ def _validate_models(self, X): raise ValueError("Expected {} columns".format(n_features) + " in X but {} were specified.".format(n_cols)) self.n_features_ = n_features - - def _validate_column_callables(self, X): - """ - Preprocess callable column specifications for later use - """ - self._cols = [] - for _, _, cols in self.models: - if callable(cols): - cols = cols(X) - self._cols.append(cols) def _check_X_y(self, X, y): # Delay further checks on X y to the respective estimators From 48f3bad3fe120a5cf135f3b8dfc1391b2303c7d3 Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 16 Feb 2020 23:34:54 +0800 Subject: [PATCH 25/61] Added docs [WIP] --- doc/modules/naive_bayes.rst | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 01720ba4857ab..f1167286615ab 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -264,7 +264,39 @@ of feature :math:`i`. General Naive Bayes ------------------- -[WIP] +:class:`GeneralNB` implements multiple naive Bayes algorithms across the +features in the dataset. Unlike other naive Bayes algorithms in this module, +:class:`GeneralNB` can assume different distributions for different features. +The naive Bayes algorithms correspond to the probability distributions +that the respective features are assumed to follow. + +A use case for this metaestimator is when the dataset consists of obvious +distributions like the categorical and Gaussian distributions. 
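
For instance, a minimal sketch with the estimators wired up by this
patch series (the two-column toy data and the column split below are
illustrative only):

    >>> import numpy as np
    >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB
    >>> X = np.array([[1.5, 0], [2.7, 1], [1.7, 1]])
    >>> y = np.array([1, 0, 0])
    >>> clf = GeneralNB([("gaussian", GaussianNB(), [0]),
    ...                  ("bernoulli", BernoulliNB(), [1])])
    >>> _ = clf.fit(X, y)
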
We assume +that the features are independent of each other and some features follow +the categorical distributions, while the rest follow Gaussian. + +The posterior distribution is the same, except that the features come from +the different distributions: + +.. math:: + + p(y|x_1, x_2, x_3) = p(y|x_1) p(y|x_2) p(y|x_3) + +where + +.. math:: + + X_1 ~ Categorical(p) + X_2 ~ N(0,1) + X_3 ~ Categorical(4) + +Specifying the different naive Bayes models is similar to that of +ColumnTransformer - you specify a name, the naive Bayes model and the +columns (features) that follow this model. + +As :class:`GeneralNB` is a metaestimator, it follows that the data +requirements of each feature depend on the requirements imposed +by the naive Bayes estimators specified. Out-of-core naive Bayes model fitting ------------------------------------- From be66c63cbffda42a48116c2ea68c15558e5dfafd Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 17 Feb 2020 20:13:07 +0800 Subject: [PATCH 26/61] Minor fixes --- sklearn/naive_bayes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 25ee8cce804bd..4b00f5aad52ee 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -341,7 +341,7 @@ def _validate_models(self, X): if not isinstance(model, tuple): raise TypeError( "Expected list of tuples " - "but got list of {}s".format(type(model))) + "but got list of {}".format(type(model))) if len(model) != 3: raise ValueError("Expected tuple to have length of 3 " "but got {}".format(len(model))) @@ -351,7 +351,7 @@ def _validate_models(self, X): # Check naive bayes estimator for format # `fit` and `_joint_log_likelihood` attributes if callable(estimator): - raise ValueError("Wrong format specified.") + raise ValueError("Estimator should be a callable specified.") if not (hasattr(estimator, "fit") or hasattr(estimator, "_joint_log_likelihood")): raise TypeError("Naive bayes estimator should implement " @@ -363,7 +363,7 @@ def _validate_models(self, X): # Check naive bayes estimator for attributes # `prior` and `fit_prior` - class_prior = getattr(estimator, "prior", None) or getattr(estimator, "class_prior", None) + class_prior = getattr(estimator, "priors", None) or getattr(estimator, "class_prior", None) fit_prior = getattr(estimator, "fit_prior", True) all_class_priors.append(class_prior) all_fit_priors.append(fit_prior) @@ -374,7 +374,7 @@ def _validate_models(self, X): cols = cols(X) for col in cols: if col in dict_col2model: - raise ValueError("Duplicate specification of col found.") + raise ValueError("Duplicate specification of column found.") else: dict_col2model[col] = estimator.__class__.__name__.lower() self._cols.append(cols) From f0e356b75d204753b92eda1b484e8d7b9832fcc5 Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 17 Feb 2020 20:13:25 +0800 Subject: [PATCH 27/61] [WIP] added tests --- sklearn/tests/test_naive_bayes.py | 85 ++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 0f897641204fc..6d789b1e099dc 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -39,47 +39,74 @@ y2 = np.array([1, 1, 2, 2, 3, 3]) -def test_generalnb_correctness(): - X = np.array([[1.5, 2.3, 5.7, 0, 1], - [2.7, 3.8, 2.3, 1, 0], - [1.7, 0.1, 4.5, 1, 0]]) - y = np.array([1, 0, 0]) - clf = GeneralNB() - clf.fit(X, y, [(GaussianNB(), [0, 1, 2]), - (BernoulliNB(), [3, 4])]) - 
print(clf.predict([[1.5, 2.3, 5.7, 0, 1]])) - print(clf.score([[2.7, 3.8, 1, 0, 1]], [0])) +def test_generalnb_models_wrong_values(): + clf = GeneralNB([ + ("gaussian", GaussianNB()), + ("bernoulli", BernoulliNB()) + ]) + assert_raises(ValueError, clf.fit, X, y) + + clf = GeneralNB([ + (5, GaussianNB(), [0], [5]), + (9, BernoulliNB(), [1], [1]) + ]) + assert_raises(ValueError, clf.fit, X, y) -def test_generalnb_distributions_insufficient(): - clf = GeneralNB() - assert_raises(ValueError, clf.fit, X, y, [(GaussianNB(), [0])]) +def test_generalnb_models_wrong_type(): + clf = GeneralNB(( + ("gaussian", GaussianNB(), [0]), + ("bernoulli", BernoulliNB(), [1]) + )) + assert_raises(TypeError, clf.fit, X, y) + + clf = GeneralNB(( + ["gaussian", GaussianNB(), [0]], + ["bernoulli", BernoulliNB(), [1]] + )) + assert_raises(TypeError, clf.fit, X, y) + + clf = GeneralNB([ + ["gaussian", GaussianNB(), [0]], + ["bernoulli", BernoulliNB(), [1]] + ]) + assert_raises(TypeError, clf.fit, X, y) + + clf = GeneralNB([ + (5, GaussianNB(), [0]), + (9, BernoulliNB(), [1]) + ]) + assert_raises(TypeError, clf.fit, X, y) + + clf = GeneralNB([ + ("gaussian", GaussianNB, [0]), + ("bernoulli", BernoulliNB, [1]) + ]) + assert_raises(ValueError, clf.fit, X, y) -def test_generalnb_distributions_duplicate(): - clf = GeneralNB() - assert_raises(ValueError, clf.fit, X, y, [ - (GaussianNB(), [0, 1]), (GaussianNB(), [1])]) +def test_generalnb_models_too_few_cols(): + pass -def test_generalnb_distributions_unknown(): - clf = GeneralNB() - assert_raises(ValueError, clf.fit, X, y, [(GeneralNB(), [0, 1])]) +def test_generalnb_models_unknown(): + pass -def test_generalnb_distributions_wrong_type(): - clf = GeneralNB() - assert_raises(TypeError, clf.fit, X, y, [[GaussianNB(), [0, 1]]]) +def test_generalnb_correctness(): + pass -def test_generalnb_distributions_tuple_too_long(): - clf = GeneralNB() - assert_raises(ValueError, clf.fit, X, y, [(GaussianNB(), [0, 1], [3])]) +def test_generalnb_models_duplicate(): + clf = GeneralNB([ + ("gaussian1", GaussianNB(), [0, 1]), + ("gaussian2", GaussianNB(), [1])] + ) + assert_raises(ValueError, clf.fit, X, y) -def test_generalnb_distributions_wrong_format(): - clf = GeneralNB() - assert_raises(ValueError, clf.fit, X, y, [(GaussianNB, [0, 1])]) +def test_generalnb_different_class_priors(): + pass def test_pickle(): From f9b1dfbf96547f2b23d11fc634a04856b44c46a6 Mon Sep 17 00:00:00 2001 From: raibosome Date: Wed, 19 Feb 2020 18:43:49 +0800 Subject: [PATCH 28/61] Updated docs --- doc/modules/naive_bayes.rst | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index f1167286615ab..bb90e0bdf59ae 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -268,7 +268,14 @@ General Naive Bayes features in the dataset. Unlike other naive Bayes algorithms in this module, :class:`GeneralNB` can assume different distributions for different features. The naive Bayes algorithms correspond to the probability distributions -that the respective features are assumed to follow. +that the respective features are assumed to follow. Bayes' theorem states the following +relationship, given class variable :math:`y` and dependent feature +vector :math:`x_1` through :math:`x_n`, : + +.. math:: + + P(y \mid x_1, \dots, x_n) = \frac{P(y) P(x_1, \dots x_n \mid y)} + {P(x_1, \dots, x_n)} A use case for this metaestimator is when the dataset consists of obvious distributions like the categorical and Gaussian distributions. 
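
Making the composition explicit: with :math:`x_{G_1}, \dots, x_{G_k}`
denoting the groups of features handled by each sub-estimator,
conditional independence between the groups given :math:`y` yields

.. math::

    P(y \mid x_{G_1}, \dots, x_{G_k}) \propto P(y) \prod_{j=1}^{k} P(x_{G_j} \mid y)

which is the identity the metaestimator exploits when it combines the
per-estimator joint log likelihoods.
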
We assume From eeaaf16b1756db2cf1dd59b7e4987f9fe468aeba Mon Sep 17 00:00:00 2001 From: raibosome Date: Wed, 19 Feb 2020 21:19:35 +0800 Subject: [PATCH 29/61] Added GeneralNB module to docs --- doc/modules/classes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 752b41151fca0..20dd2b5673ce2 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1260,6 +1260,7 @@ Model validation naive_bayes.ComplementNB naive_bayes.GaussianNB naive_bayes.MultinomialNB + naive_bayes.GeneralNB .. _neighbors_ref: From 496c069ec72b391368115d5585412b23251eb74f Mon Sep 17 00:00:00 2001 From: raibosome Date: Wed, 19 Feb 2020 21:19:56 +0800 Subject: [PATCH 30/61] Added code snippets to docs --- doc/modules/naive_bayes.rst | 65 ++++++++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index bb90e0bdf59ae..40e554aabb7ac 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -268,7 +268,8 @@ General Naive Bayes features in the dataset. Unlike other naive Bayes algorithms in this module, :class:`GeneralNB` can assume different distributions for different features. The naive Bayes algorithms correspond to the probability distributions -that the respective features are assumed to follow. Bayes' theorem states the following +that the respective features are assumed to follow. Bayes' theorem states +the following relationship, given class variable :math:`y` and dependent feature vector :math:`x_1` through :math:`x_n`, : @@ -293,18 +294,74 @@ where .. math:: - X_1 ~ Categorical(p) - X_2 ~ N(0,1) + X_1 ~ Categorical(p) \\ + X_2 ~ N(0,1) \\ X_3 ~ Categorical(4) Specifying the different naive Bayes models is similar to that of ColumnTransformer - you specify a name, the naive Bayes model and the -columns (features) that follow this model. +column indices or names of the features that follow a distribution +corresponding to this model. As :class:`GeneralNB` is a metaestimator, it follows that the data requirements of each feature depend on the requirements imposed by the naive Bayes estimators specified. +GeneralNB is particularly useful for datasets + + >>> import numpy as np + >>> import pandas as pd + >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB + + >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], + >>> [2.7, 3.8, 2.3, 1, 0], + >>> [1.7, 0.1, 4.5, 1, 0]]) + >>> y = np.array([1, 0, 0]) + >>> X_test = np.array([[1.5, 2.3, 5.7, 0, 1]]) + + >>> clf = GeneralNB([ + >>> ("gaussian", GaussianNB(), [0, 1, 2]), + >>> ("bernoulli", BernoulliNB(), [3, 4]) + >>> ]) + >>> clf.fit(X, y) + >>> print(clf.predict(X_test)) + [1] + >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) + 1.0 + +You can also specify the column names of a pandas DataFrame + + >>> df = pd.DataFrame(X) + >>> df.columns = ["a", "b", "c", "d", "e"] + >>> df["y"] = [1, 0, 0] + >>> df_test = pd.DataFrame(X_test) + >>> df_test.columns = ["a", "b", "c", "d", "e"] + + >>> clf = GeneralNB([ + >>> ("gaussian", GaussianNB(), ["a", "b", "c"]), + >>> ("bernoulli", BernoulliNB(), ["d", "e"]) + >>> ]) + >>> clf.fit(df.iloc[:,:-1], df["y"]) + >>> print(clf.predict(df_test)) + [1] + +You may also select columns from `make_column_selector`. 
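(:func:`~sklearn.compose.make_column_selector` is the helper from
:mod:`sklearn.compose`: it returns a callable that, given ``X``, yields
the matching column names, as the snippet below shows.)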
+ + >>> from sklearn.compose import make_column_selector + >>> clf = GeneralNB([ + >>> ("gaussian", GaussianNB(), make_column_selector(pattern=r"[abc]")), + >>> ("bernoulli", BernoulliNB(), make_column_selector(pattern=r"[de]")) + >>> ]) + >>> clf.fit(df.iloc[:,:-1], df["y"]) + >>> print(clf.predict(df_test)) + [1] + +You can also access the attributes and methods of the fitted estimators using +the `named_models_` computed property. From the previous example, + + >>> clf.named_models_.bernoulli.var_smoothing + 1e-09 + Out-of-core naive Bayes model fitting ------------------------------------- From b2ab2b59af7af62881882341298ac3f83a9ad2cc Mon Sep 17 00:00:00 2001 From: raibosome Date: Wed, 19 Feb 2020 21:20:30 +0800 Subject: [PATCH 31/61] Added more GeneralNB tests --- sklearn/tests/test_naive_bayes.py | 59 +++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 6d789b1e099dc..c368fd457bcf8 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -40,12 +40,23 @@ def test_generalnb_models_wrong_values(): + """Test if wrong specification of models raises ValueError""" + + # Tuple has length less than 3 clf = GeneralNB([ ("gaussian", GaussianNB()), ("bernoulli", BernoulliNB()) ]) assert_raises(ValueError, clf.fit, X, y) + # Tuple has length more than 3 + clf = GeneralNB([ + ("gaussian", GaussianNB(), [0], [5]), + ("bernoulli", BernoulliNB(), [1], [1]) + ]) + assert_raises(ValueError, clf.fit, X, y) + + # Tuple has length more than 3 clf = GeneralNB([ (5, GaussianNB(), [0], [5]), (9, BernoulliNB(), [1], [1]) @@ -54,6 +65,7 @@ def test_generalnb_models_wrong_values(): def test_generalnb_models_wrong_type(): + """Test if wrong specification of models raises TypeError""" clf = GeneralNB(( ("gaussian", GaussianNB(), [0]), ("bernoulli", BernoulliNB(), [1]) @@ -86,18 +98,41 @@ def test_generalnb_models_wrong_type(): def test_generalnb_models_too_few_cols(): - pass + """Test specifying fewer cols than no. 
of cols in X""" + clf = GeneralNB([ + ("gaussian", GaussianNB(), [0]) + ]) + assert_raises(ValueError, clf.fit, X, y) -def test_generalnb_models_unknown(): - pass +# FIXME +def test_generalnb_joint_log_likelihood(): + """Test whether joint log likelihood has been computed correctly""" + clf = GeneralNB([ + ("gaussian", GaussianNB(), [0]), + ("bernoulli", BernoulliNB(), [1])] + ) + assert_raises(ValueError, clf.fit, X, y) + + # Get jll from Gaussian + clf_gnb = GaussianNB() + clf_gnb.fit(X[:,0,None], y) + jll_gnb = clf_gnb._joint_log_likelihood(X[:,1,None]) + clp_gnb = np.log(clf_gnb.class_prior_) + + # Get jll from Bernoulli + clf_bnl = BernoulliNB() + clf_bnl.fit(X[:,1,None], y) + jll_bnl = clf_bnl._joint_log_likelihood(X[:,1,None]) + clp_bnl = clf_bnl.class_log_prior_ + + (jll_gnb - clp_gnb + jll_bnl - clp_bnl) + clp_bnl -def test_generalnb_correctness(): - pass def test_generalnb_models_duplicate(): + """Test if specifying duplicate columns in models raises error""" clf = GeneralNB([ ("gaussian1", GaussianNB(), [0, 1]), ("gaussian2", GaussianNB(), [1])] @@ -106,7 +141,19 @@ def test_generalnb_models_duplicate(): def test_generalnb_different_class_priors(): - pass + clf = GeneralNB([ + ("bernoulli1", BernoulliNB(class_prior=[0.5,0.5]), [0]), + ("bernoulli2", BernoulliNB(class_prior=[0.8,0.2]), [1]) + ]) + assert_raises(ValueError, clf.fit, X, y) + + +def test_generalnb_different_fit_priors(): + clf = GeneralNB([ + ("gaussian", GaussianNB(), [0]), + ("bernoulli", BernoulliNB(fit_prior=False), [1]) + ]) + assert_raises(ValueError, clf.fit, X, y) def test_pickle(): From 78c656af2e17d3721cd7ce10bb35233eab8a07fb Mon Sep 17 00:00:00 2001 From: raibosome Date: Wed, 19 Feb 2020 21:21:20 +0800 Subject: [PATCH 32/61] Update docstrings and raising ValueError --- sklearn/naive_bayes.py | 148 ++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 82 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 4b00f5aad52ee..c017f04f283ad 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -119,13 +119,13 @@ def predict_proba(self, X): class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): - """General Naive Bayes for multiple probability distributions + """Naive Bayes metaclassifier for multiple naive Bayes models + + The General Naive Bayes classifier is a metaestimator that + allows for multiple distributional assumptions on the features + of the data like the Bernoulli, Gaussian, multinomial, and + categorical distributions. - The General Naive Bayes classifier is a metaestimator that - allows for multiple distributional assumptions on the features - of the data, namely the Bernoulli, Gaussian, Multinomial, and - Categorical distributions. - This is made possible by composing a joint probability distribution as the product of independent models or probability distributions. Each constituent distribution is fitted on a subset of features. @@ -142,10 +142,10 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): name : string Like in Pipeline and ColumnTransformer, this allows the distribution and its parameters to be set using ``set_params``. - naive bayes model : Estimator + naive Bayes model : Estimator Estimator must support :term:`fit`, :term:`predict` and :term:`_joint_log_likelihood`. - column(s) : array-like of string or int, slice + column(s) : array-like of string or int, slicec Indexes the data on its second axis. Integers are interpreted as positional columns, while strings can reference DataFrame columns by name. 
A scalar string or int should be used where @@ -154,8 +154,12 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): Attributes ---------- + models_ : list of tuples + Verified list of (name, naive bayes estimator, column(s)) + based on `self.models`. + classes_ : ndarray of shape (n_classes,) - class labels known to the classifier + Class labels known to the classifier. n_features_ : int Number of features of each sample. @@ -163,54 +167,19 @@ class labels known to the classifier Examples -------- >>> import numpy as np - >>> import pandas as pd - >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB - >>> from sklearn.compose import make_column_selector - + >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, Categorical >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], - >>> [2.7, 3.8, 2.3, 1, 0], - >>> [1.7, 0.1, 4.5, 1, 0]]) + ... [2.7, 3.8, 2.3, 1, 0], + ... [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) - >>> X_test = np.array([[1.5, 2.3, 5.7, 0, 1]]) - >>> df = pd.DataFrame(X) - >>> df.columns = list("abcde") - >>> df["y"] = [1, 0, 0] - >>> df_test = pd.DataFrame(X_test) - >>> df_test.columns = list("abcde") - >>> clf = GeneralNB([ - >>> ("gaussian", GaussianNB(), [0, 1, 2]), - >>> ("bernoulli", BernoulliNB(), [3, 4]) + ... ("gaussian", GaussianNB(), [0, 1, 2]), + ... ("categorical", CategoricalNB(), [3, 4]) >>> ]) >>> clf.fit(X, y) - >>> print(clf.predict(X_test)) - [1] - >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) - 1.0 - - >>> clf = GeneralNB([ - >>> ("gaussian", GaussianNB(), ["a", "b", "c"]), - >>> ("bernoulli", BernoulliNB(), ["d", "e"]) - >>> ]) - >>> clf.fit(df.iloc[:,:-1], df["y"]) - >>> print(clf.predict(df_test)) + >>> clf.predict(X[0]) [1] - >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) - 1.0 - - >>> clf = GeneralNB([ - >>> ("gaussian", GaussianNB(), make_column_selector(pattern=r"[abc]")), - >>> ("bernoulli", BernoulliNB(), make_column_selector(pattern=r"[de]")) - >>> ]) - >>> clf.fit(df.iloc[:,:-1], df["y"]) - >>> print(clf.predict(df_test)) - [1] - >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) - 1.0 """ - # TODO consider jll for each estimator - # TODO unify variable names with similar meaning - # ("column", "feature") def __init__(self, models): self.models = models @@ -222,13 +191,13 @@ def __init__(self, models): self._is_fitted = False def fit(self, X, y): - """Fit Gaussian Naive Bayes according to X, y + """Fit X and y to the specified naive Bayes estimators. Parameters ---------- X : array-like, shape (n_samples, n_features) - Training vectors, where n_samples is the number of samples - and n_features is the number of features. + Training sample, where `n_samples` is the number of samples + and `n_features` is the number of features. y : array-like, shape (n_samples,) Target values. @@ -251,15 +220,18 @@ def fit(self, X, y): return self def _joint_log_likelihood(self, X): - """Calculate the posterior log probability of the samples X + """Calculate the posterior log probability of sample X Parameters ---------- - X : ndarray - + X : array-like, shape (n_samples, n_features) + Training sample, where `n_samples` is the number of samples + and `n_features` is the number of features. + Returns ------- jll : ndarray, shape (1, n_classes) + Posterior log probability. 
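            Assembled from the sub-estimators' joint log likelihoods
            with the duplicated class log priors subtracted, so the
            shared prior enters the sum exactly once.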
Raises ------ @@ -273,7 +245,8 @@ def _joint_log_likelihood(self, X): # Obtain the log priors from each fitted estimator all_log_priors = [ nb_model.class_log_prior_ - if hasattr(nb_model, 'class_log_prior_') else np.log(nb_model.class_prior_) + if hasattr(nb_model, 'class_log_prior_') + else np.log(nb_model.class_prior_) for _, nb_model, _ in self.models_] # Ensure class log priors are the same for all estimators @@ -302,6 +275,12 @@ def _joint_log_likelihood(self, X): @property def _models(self): + """ + Internal list of models only containing the name and + estimator, dropping the columns. This is for the implementation + of get_params via BaseComposition._get_params which expects lists + of tuples of length 2. + """ return [(name, model) for name, model, _ in self.models] @_models.setter @@ -327,7 +306,6 @@ def _validate_models(self, X): all_fit_priors = [] all_class_priors = [] - # Check type of `models` parameter if not isinstance(self.models, list): raise TypeError( "Expected list but got {}".format(type(self.models))) @@ -362,34 +340,40 @@ def _validate_models(self, X): "Distributions should be one of {}".format(valid_modules)) # Check naive bayes estimator for attributes - # `prior` and `fit_prior` - class_prior = getattr(estimator, "priors", None) or getattr(estimator, "class_prior", None) + # like `priors`, `class_prior` and `fit_prior` + class_prior = getattr(estimator, "priors", None) or \ + getattr(estimator, "class_prior", None) fit_prior = getattr(estimator, "fit_prior", True) all_class_priors.append(class_prior) all_fit_priors.append(fit_prior) - # Check the columns for duplicate models - # and convert to list if callable + # Check the columns for duplicate models and + # convert to feature if callable if callable(cols): cols = cols(X) for col in cols: if col in dict_col2model: - raise ValueError("Duplicate specification of column found.") + raise ValueError("Duplicate specification of " + "column found.") else: dict_col2model[col] = estimator.__class__.__name__.lower() self._cols.append(cols) # Check if class priors are the same throughout all estimators - if len(set(all_class_priors)) != 1: - raise ValueError("The parameters 'class_prior' or 'prior' " - "must be the same values throughout all estimators " - "if specified.") - - # FIXME really? 
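
(Reviewer note on the replacement checks added below: the first branch
raises whenever any sub-estimator sets ``class_prior``/``priors`` at
all, which makes the second branch unreachable, and that second
branch's ``np.max(np.ptp(...)) < 1e-6`` test is inverted — it raises
when the priors agree rather than when they differ.)
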
- # Check if class priors are the same throughout all estimators - if len(set(all_fit_priors)) != 1: - raise ValueError("The parameter 'fit_prior' " - "must be the same values through out all estimators " + if not all(prior is None for prior in all_class_priors): + raise ValueError("The parameters 'class_prior' or 'priors' " + "must be the same values throughout all " + "estimators if specified.") + if all(prior is not None for prior in all_class_priors): + if np.max(np.ptp(all_class_priors, axis=0)) < 1e-6: + raise ValueError("The parameters 'class_prior' or 'priors' " + "must be the same values throughout all " + "estimators if specified.") + + # Check if `fit_prior`s are the same throughout all estimators + if not all(all_fit_priors): + raise ValueError("The parameter 'fit_prior' must be " + "the same values through out all estimators " "if specified.") n_features = X.shape[-1] @@ -398,31 +382,32 @@ def _validate_models(self, X): raise ValueError("Expected {} columns".format(n_features) + " in X but {} were specified.".format(n_cols)) self.n_features_ = n_features - + def _check_X_y(self, X, y): - # Delay further checks on X y to the respective estimators + # Delay any further checks on X and y to the respective estimators + # Only checks if X is pandas dataframe if hasattr(X, "columns"): self._df_cols = X.columns def _check_X(self, X): - # Check pandas.DataFrame + # Check if X should be a pandas dataframe if self._df_cols is not None: - + if not hasattr(X, "columns"): raise TypeError("X should be a dataframe") - + if not all(self._df_cols == X.columns): raise ValueError("Column names must match with " - "column names of fitted data.") + "column names of fitted data.") return X - + @property def named_models_(self): """Access the fitted estimators by name. Read-only attribute to access any distribution by given name. - Keys are transformer names and values are the fitted transformer + Keys are model names and values are the fitted estimator objects. """ @@ -431,7 +416,6 @@ def named_models_(self): in self.models}) - class GaussianNB(_BaseNB): """ Gaussian Naive Bayes (GaussianNB) From a910ed6f0fb9bb38824c178d76965d91cac794a6 Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 23 Feb 2020 22:33:49 +0800 Subject: [PATCH 33/61] Updated docs --- doc/modules/naive_bayes.rst | 91 +++++++++++++++---------------------- sklearn/naive_bayes.py | 51 +++++++++++---------- 2 files changed, 64 insertions(+), 78 deletions(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 40e554aabb7ac..5113a4c7a2f2e 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -264,100 +264,83 @@ of feature :math:`i`. General Naive Bayes ------------------- -:class:`GeneralNB` implements multiple naive Bayes algorithms across the -features in the dataset. Unlike other naive Bayes algorithms in this module, -:class:`GeneralNB` can assume different distributions for different features. -The naive Bayes algorithms correspond to the probability distributions -that the respective features are assumed to follow. Bayes' theorem states -the following -relationship, given class variable :math:`y` and dependent feature -vector :math:`x_1` through :math:`x_n`, : - -.. math:: - - P(y \mid x_1, \dots, x_n) = \frac{P(y) P(x_1, \dots x_n \mid y)} - {P(x_1, \dots, x_n)} - -A use case for this metaestimator is when the dataset consists of obvious -distributions like the categorical and Gaussian distributions. 
We assume -that the features are independent of each other and some features follow -the categorical distributions, while the rest follow Gaussian. +:class:`GeneralNB` implements multiple naive Bayes models across the +features in the dataset by assuming different distributions for different +features, while maintaining conditional independence between every pair of +features given the value of a class variable. -The posterior distribution is the same, except that the features come from -the different distributions: +A practical use for this metaestimator is when encountering data with +both numerical and categorical features. For example, if we assume that +feature 1 (numerical) follow the Gaussian distribution +and feature 2 (categorical) follows categorical, i.e., .. math:: - p(y|x_1, x_2, x_3) = p(y|x_1) p(y|x_2) p(y|x_3) - -where + X_1 \mid y \sim \text{Normal}(\mu,\sigma^2) .. math:: - X_1 ~ Categorical(p) \\ - X_2 ~ N(0,1) \\ - X_3 ~ Categorical(4) + X_2 \mid y \sim \text{Categorical}(\text{p}) -Specifying the different naive Bayes models is similar to that of -ColumnTransformer - you specify a name, the naive Bayes model and the -column indices or names of the features that follow a distribution -corresponding to this model. +and conditional independence between the two features given :math:`y`, +we can use GeneralNB to model this data. -As :class:`GeneralNB` is a metaestimator, it follows that the data -requirements of each feature depend on the requirements imposed -by the naive Bayes estimators specified. - -GeneralNB is particularly useful for datasets +Here is a code example involving a toy dataset with 5 features. +We first import the relevant libraries: >>> import numpy as np >>> import pandas as pd >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB +The data has 5 features - the first 3 are numerical, the last 2 categorical: + >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], >>> [2.7, 3.8, 2.3, 1, 0], >>> [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) >>> X_test = np.array([[1.5, 2.3, 5.7, 0, 1]]) +Specifying the different naive Bayes models is similar to that of +:class:`ColumnTransformer` or :class:`Pipeline` - you specify +the name, the naive Bayes model and the corresponding columns. +See :mod:`naive_bayes` for a list of our naive Bayes models. + >>> clf = GeneralNB([ >>> ("gaussian", GaussianNB(), [0, 1, 2]), - >>> ("bernoulli", BernoulliNB(), [3, 4]) + >>> ("categorical", CategoricalNB(), [3, 4]) >>> ]) >>> clf.fit(X, y) >>> print(clf.predict(X_test)) [1] - >>> print(clf.score([[2.7, 3.8, 1, 0, 1]],[0])) - 1.0 - -You can also specify the column names of a pandas DataFrame - >>> df = pd.DataFrame(X) - >>> df.columns = ["a", "b", "c", "d", "e"] - >>> df["y"] = [1, 0, 0] - >>> df_test = pd.DataFrame(X_test) - >>> df_test.columns = ["a", "b", "c", "d", "e"] +You can also specify the column names of a pandas DataFrame: + >>> X = pd.DataFrame(X) + >>> X.columns = ["a", "b", "c", "d", "e"] + >>> y = pd.DataFrame(X) + >>> >>> clf = GeneralNB([ >>> ("gaussian", GaussianNB(), ["a", "b", "c"]), - >>> ("bernoulli", BernoulliNB(), ["d", "e"]) + >>> ("categorical", CategoricalNB(), ["d", "e"]) >>> ]) - >>> clf.fit(df.iloc[:,:-1], df["y"]) - >>> print(clf.predict(df_test)) - [1] + >>> clf.fit(X, y) -You may also select columns from `make_column_selector`. +Alternatively, you may also select DataFrame columns using +:func:`sklearn.compose.make_column_selector`. 
>>> from sklearn.compose import make_column_selector >>> clf = GeneralNB([ >>> ("gaussian", GaussianNB(), make_column_selector(pattern=r"[abc]")), - >>> ("bernoulli", BernoulliNB(), make_column_selector(pattern=r"[de]")) + >>> ("categorical", CategoricalNB(), make_column_selector(pattern=r"[de]")) >>> ]) - >>> clf.fit(df.iloc[:,:-1], df["y"]) + >>> clf.fit(X, y) >>> print(clf.predict(df_test)) [1] -You can also access the attributes and methods of the fitted estimators using -the `named_models_` computed property. From the previous example, +Access the attributes and methods of the fitted estimators using +the `named_models_` computed property and the identifiers in the `self.model` +parameters. Below we obtain the parameters of the fitted Bernoulli distribution +using `bernoulli`. >>> clf.named_models_.bernoulli.var_smoothing 1e-09 diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index c017f04f283ad..27117cf2897f9 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -121,48 +121,50 @@ def predict_proba(self, X): class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): """Naive Bayes metaclassifier for multiple naive Bayes models - The General Naive Bayes classifier is a metaestimator that - allows for multiple distributional assumptions on the features - of the data like the Bernoulli, Gaussian, multinomial, and - categorical distributions. - - This is made possible by composing a joint probability distribution - as the product of independent models or probability distributions. - Each constituent distribution is fitted on a subset of features. + The General Naive Bayes classifier is a metaestimator that allows + different features in the data to be modeled with different naive Bayes + models. This is useful for data containing numerical and categorical + features, where numerical features may be modeled as Gaussian distributions + and categorical features as categorical distributions. Read more in the :ref:`User Guide `. Parameters ---------- models : list of tuples - List of (name, naive bayes estimator, column(s)) tuples specifying the - assumptions of distribution on the features to apply - naive Bayes on subsets of the data. + List of (name, naive Bayes model, column(s)) tuples specifying the + naive Bayes models to apply on the corresponding columns. + This is similar to :class:`Pipeline` or :class:`ColumnTransformer`. name : string - Like in Pipeline and ColumnTransformer, this allows the - distribution and its parameters to be set using ``set_params``. + This is a user-defined identifier that allows the models and its + parameters to be retrieved and set. naive Bayes model : Estimator - Estimator must support :term:`fit`, :term:`predict` - and :term:`_joint_log_likelihood`. - column(s) : array-like of string or int, slicec - Indexes the data on its second axis. Integers are interpreted as - positional columns, while strings can reference DataFrame columns - by name. A scalar string or int should be used where - ``transformer`` expects X to be a 1d array-like (vector), - otherwise a 2d array will be passed to the transformer. + The naive Bayes model represents the distribution assumption on + the features. Use our naive Bayes estimators like + :ref:`` and + :ref:``. Custom estimators must support + :term:`fit`, :term:`predict` and `_joint_log_likelihood`. + column(s) : array-like of {string or int}, slice, or callable + Features that correspond to the naive Bayes models. Indexes + the data on its second axis. 
Integers are interpreted as + positional columns, while strings reference DataFrame columns + by name. A callable is passed the input data `X` and can return + any of the above. For example, + :func:`sklearn.compose.make_column_selector` can select multiple + columns by name or dtype. Attributes ---------- models_ : list of tuples - Verified list of (name, naive bayes estimator, column(s)) + Verified list of (name, naive Bayes model, column(s)), based on `self.models`. classes_ : ndarray of shape (n_classes,) Class labels known to the classifier. n_features_ : int - Number of features of each sample. + Number of features in each sample. Examples -------- @@ -191,7 +193,7 @@ def __init__(self, models): self._is_fitted = False def fit(self, X, y): - """Fit X and y to the specified naive Bayes estimators. + """Fit X and y to the naive Bayes estimators. Parameters ---------- @@ -290,6 +292,7 @@ def _models(self, value): in zip(value, self.models)] def get_params(self, deep=True): + """Get parameters for this metaestimator""" return self._get_params('_models', deep=deep) def set_params(self, **kwargs): From f60a8c2d1a15df9e53fb298c5f84eea6e8ad38ca Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 24 Feb 2020 22:06:39 +0800 Subject: [PATCH 34/61] Update docs --- doc/modules/naive_bayes.rst | 67 ++++++++++++++++++++----------------- sklearn/naive_bayes.py | 21 +++++++----- 2 files changed, 49 insertions(+), 39 deletions(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 5113a4c7a2f2e..0d659a528f826 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -270,40 +270,41 @@ features, while maintaining conditional independence between every pair of features given the value of a class variable. A practical use for this metaestimator is when encountering data with -both numerical and categorical features. For example, if we assume that -feature 1 (numerical) follow the Gaussian distribution -and feature 2 (categorical) follows categorical, i.e., +both numerical and categorical features. For example, suppose our data +had 5 features where the first three are numerical and the rest categorical. +We then proceed to assume that numerical features follow the Gaussian +distribution and the categorical features follow the categorical +distribution, i.e., .. math:: - X_1 \mid y \sim \text{Normal}(\mu,\sigma^2) + X_1 \mid y \sim \text{Normal}(\mu_1,\sigma_1^2) \\ + X_2 \mid y \sim \text{Normal}(\mu_2,\sigma_2^2) \\ + X_3 \mid y \sim \text{Normal}(\mu_3,\sigma_3^2) \\ + X_4 \mid y \sim \text{Categorical}(\textbf{p}_4) \\ + X_5 \mid y \sim \text{Categorical}(\textbf{p}_5) -.. math:: - - X_2 \mid y \sim \text{Categorical}(\text{p}) - -and conditional independence between the two features given :math:`y`, -we can use GeneralNB to model this data. - -Here is a code example involving a toy dataset with 5 features. -We first import the relevant libraries: +Let's see how we `GeneralNB` is used with this toy dataset. 
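
Purely as an illustration of these assumptions (using NumPy's
``default_rng`` API, with an arbitrary seed), data of this shape could
be simulated as:

    >>> import numpy as np
    >>> rng = np.random.default_rng(0)
    >>> X_sim = np.hstack([rng.normal(size=(3, 3)),
    ...                    rng.integers(0, 2, size=(3, 2))])
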
We first import +the libraries and prepare the data: >>> import numpy as np >>> import pandas as pd - >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, BernoulliNB - -The data has 5 features - the first 3 are numerical, the last 2 categorical: - + >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, CategoricalNB + >>> >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], >>> [2.7, 3.8, 2.3, 1, 0], >>> [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) >>> X_test = np.array([[1.5, 2.3, 5.7, 0, 1]]) -Specifying the different naive Bayes models is similar to that of -:class:`ColumnTransformer` or :class:`Pipeline` - you specify -the name, the naive Bayes model and the corresponding columns. -See :mod:`naive_bayes` for a list of our naive Bayes models. +In the `GeneralNB` constructor, +define a name (for easy access of the fitted estimators later) +and the corresponding columns for every naive Bayes model. +Below we defined two tuples, one for the `GaussianNB()` and +one for the `CategoricalNB()` model. +This manner of specification is similar to that of *transformers* in +:class:`ColumnTransformer ` or +*pipeline* in :class:`Pipeline `. >>> clf = GeneralNB([ >>> ("gaussian", GaussianNB(), [0, 1, 2]), @@ -313,12 +314,13 @@ See :mod:`naive_bayes` for a list of our naive Bayes models. >>> print(clf.predict(X_test)) [1] -You can also specify the column names of a pandas DataFrame: +Besides specifying a list of ints, you can also indicate column +names explicitly if the `X` and `y` data are pandas `DataFrame`s: >>> X = pd.DataFrame(X) >>> X.columns = ["a", "b", "c", "d", "e"] >>> y = pd.DataFrame(X) - >>> + ... >>> clf = GeneralNB([ >>> ("gaussian", GaussianNB(), ["a", "b", "c"]), >>> ("categorical", CategoricalNB(), ["d", "e"]) @@ -326,7 +328,7 @@ You can also specify the column names of a pandas DataFrame: >>> clf.fit(X, y) Alternatively, you may also select DataFrame columns using -:func:`sklearn.compose.make_column_selector`. +:func:`sklearn.compose.make_column_selector`: >>> from sklearn.compose import make_column_selector >>> clf = GeneralNB([ @@ -337,13 +339,18 @@ Alternatively, you may also select DataFrame columns using >>> print(clf.predict(df_test)) [1] -Access the attributes and methods of the fitted estimators using -the `named_models_` computed property and the identifiers in the `self.model` -parameters. Below we obtain the parameters of the fitted Bernoulli distribution -using `bernoulli`. +Finally, you can access the attributes of the fitted estimators using +the :meth:`named_models_ ` +property and the previously defined names. +Below we obtain the `class_count_` attribute from the fitted +categorical distribution, where `"categorical"` comes from the previously +defined `model` parameter in the constructor. + + >>> clf.named_models_.categorical.class_count_ + array([2., 1.]) - >>> clf.named_models_.bernoulli.var_smoothing - 1e-09 +Apart from these two naive Bayes models, you may also use other combinations +of naive Bayes models found on this page to fit your dataset. Out-of-core naive Bayes model fitting ------------------------------------- diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 27117cf2897f9..5b78ef0a0fad3 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -134,25 +134,28 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): models : list of tuples List of (name, naive Bayes model, column(s)) tuples specifying the naive Bayes models to apply on the corresponding columns. 
- This is similar to :class:`Pipeline` or :class:`ColumnTransformer`. + This is similar to :class:`Pipeline ` + or :class:`ColumnTransformer `. name : string This is a user-defined identifier that allows the models and its - parameters to be retrieved and set. + parameters to be retrieved and set later using :meth:`get_params_` + and :meth:`set_params_`. naive Bayes model : Estimator The naive Bayes model represents the distribution assumption on the features. Use our naive Bayes estimators like - :ref:`` and - :ref:``. Custom estimators must support - :term:`fit`, :term:`predict` and `_joint_log_likelihood`. + :class:`GaussianNB ` and + :class:`CategoricalNB `. + Custom estimators must support :term:`fit`, :term:`predict` + and `_joint_log_likelihood`. column(s) : array-like of {string or int}, slice, or callable Features that correspond to the naive Bayes models. Indexes the data on its second axis. Integers are interpreted as positional columns, while strings reference DataFrame columns - by name. A callable is passed the input data `X` and can return - any of the above. For example, - :func:`sklearn.compose.make_column_selector` can select multiple - columns by name or dtype. + by name. A callable is passed the input data `X` and must return + one of the above. For example, + :func:`compose.make_column_selector ` + can select multiple columns by name or dtype. Attributes ---------- From 024724f0676109710aabcc95756977e630a2857b Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 24 Feb 2020 22:44:38 +0800 Subject: [PATCH 35/61] Minor housekeeping --- sklearn/naive_bayes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 5b78ef0a0fad3..3f5fe5668ce4b 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -265,7 +265,7 @@ def _joint_log_likelihood(self, X): # Obtain the jll of each fitted estimator jlls = [nb_model._joint_log_likelihood( np.array(_safe_indexing(X, cols, axis=1))) - for (_, nb_model, cols) in self.models_] + for _, nb_model, cols in self.models_] # Stack these jlls to give us # the shape (estimator, sample, class) From c8254fe4a6bdd5bf63d878e897083fbc747e70ae Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 24 Feb 2020 22:44:49 +0800 Subject: [PATCH 36/61] Update tests --- sklearn/tests/test_naive_bayes.py | 44 +++++++++++++++++++------------ 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index c368fd457bcf8..1f29e3a76d1f5 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -41,7 +41,7 @@ def test_generalnb_models_wrong_values(): """Test if wrong specification of models raises ValueError""" - + # Tuple has length less than 3 clf = GeneralNB([ ("gaussian", GaussianNB()), @@ -66,12 +66,13 @@ def test_generalnb_models_wrong_values(): def test_generalnb_models_wrong_type(): """Test if wrong specification of models raises TypeError""" + clf = GeneralNB(( ("gaussian", GaussianNB(), [0]), ("bernoulli", BernoulliNB(), [1]) )) assert_raises(TypeError, clf.fit, X, y) - + clf = GeneralNB(( ["gaussian", GaussianNB(), [0]], ["bernoulli", BernoulliNB(), [1]] @@ -105,50 +106,59 @@ def test_generalnb_models_too_few_cols(): assert_raises(ValueError, clf.fit, X, y) - -# FIXME def test_generalnb_joint_log_likelihood(): """Test whether joint log likelihood has been computed correctly""" + + # Get jll from GeneralNB clf = GeneralNB([ - ("gaussian", GaussianNB(), [0]), + ("gaussian", 
GaussianNB(), [0]), ("bernoulli", BernoulliNB(), [1])] ) - assert_raises(ValueError, clf.fit, X, y) + clf.fit(X, y) + jll = clf._joint_log_likelihood(X) - # Get jll from Gaussian + # Get jll from GaussianNB clf_gnb = GaussianNB() - clf_gnb.fit(X[:,0,None], y) - jll_gnb = clf_gnb._joint_log_likelihood(X[:,1,None]) + clf_gnb.fit(X[:, 0, None], y) + jll_gnb = clf_gnb._joint_log_likelihood(X[:, 0, None]) clp_gnb = np.log(clf_gnb.class_prior_) - # Get jll from Bernoulli + # Get jll from BernoulliNB clf_bnl = BernoulliNB() - clf_bnl.fit(X[:,1,None], y) - jll_bnl = clf_bnl._joint_log_likelihood(X[:,1,None]) + clf_bnl.fit(X[:, 1, None], y) + jll_bnl = clf_bnl._joint_log_likelihood(X[:, 1, None]) clp_bnl = clf_bnl.class_log_prior_ - (jll_gnb - clp_gnb + jll_bnl - clp_bnl) + clp_bnl - + expected_jll = (jll_gnb - clp_gnb + jll_bnl - clp_bnl) + clp_bnl + assert_array_almost_equal(jll, expected_jll) def test_generalnb_models_duplicate(): """Test if specifying duplicate columns in models raises error""" clf = GeneralNB([ - ("gaussian1", GaussianNB(), [0, 1]), + ("gaussian1", GaussianNB(), [0, 1]), ("gaussian2", GaussianNB(), [1])] ) assert_raises(ValueError, clf.fit, X, y) def test_generalnb_different_class_priors(): + """ + Test if specifying different priors across naive + Bayes models will raise error + """ clf = GeneralNB([ - ("bernoulli1", BernoulliNB(class_prior=[0.5,0.5]), [0]), - ("bernoulli2", BernoulliNB(class_prior=[0.8,0.2]), [1]) + ("bernoulli1", BernoulliNB(class_prior=[0.5, 0.5]), [0]), + ("bernoulli2", BernoulliNB(class_prior=[0.8, 0.2]), [1]) ]) assert_raises(ValueError, clf.fit, X, y) def test_generalnb_different_fit_priors(): + """ + Test if specifying different fit_priors across naive + Bayes models will raise error + """ clf = GeneralNB([ ("gaussian", GaussianNB(), [0]), ("bernoulli", BernoulliNB(fit_prior=False), [1]) From d7817cb1041c2522d48fcf912c4782c8c98c8eeb Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 25 Feb 2020 19:44:05 +0800 Subject: [PATCH 37/61] Perform _check_X before calculating likelihood --- sklearn/naive_bayes.py | 46 ++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 3f5fe5668ce4b..ee2c68ae16cee 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -123,7 +123,7 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): The General Naive Bayes classifier is a metaestimator that allows different features in the data to be modeled with different naive Bayes - models. This is useful for data containing numerical and categorical + models. This is useful for data containing numerical and categorical features, where numerical features may be modeled as Gaussian distributions and categorical features as categorical distributions. @@ -138,23 +138,24 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): or :class:`ColumnTransformer `. name : string - This is a user-defined identifier that allows the models and its + This is a user-defined identifier that allows the models and its parameters to be retrieved and set later using :meth:`get_params_` and :meth:`set_params_`. naive Bayes model : Estimator - The naive Bayes model represents the distribution assumption on - the features. Use our naive Bayes estimators like + The naive Bayes model represents the distribution assumption on + the features. Use our naive Bayes estimators like :class:`GaussianNB ` and - :class:`CategoricalNB `. + :class:`CategoricalNB `. 
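
The combination rule exercised by `test_generalnb_joint_log_likelihood` above
can be verified by hand. A minimal NumPy sketch with made-up log values,
assuming the class log prior is shared across sub-models, as GeneralNB
requires:

    import numpy as np

    log_prior = np.log([0.5, 0.5])          # assumed shared class log prior
    jll_gauss = np.array([[-1.2, -3.4]])    # log P(c) + log P(x_gauss | c)
    jll_bern = np.array([[-0.7, -2.1]])     # log P(c) + log P(x_bern | c)

    # Each per-block jll already includes the class log prior, so it is
    # subtracted from every block and added back exactly once:
    combined = (jll_gauss - log_prior) + (jll_bern - log_prior) + log_prior

which matches `expected_jll = (jll_gnb - clp_gnb + jll_bnl - clp_bnl) + clp_bnl`
in the test.
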
Custom estimators must support :term:`fit`, :term:`predict` and `_joint_log_likelihood`. column(s) : array-like of {string or int}, slice, or callable - Features that correspond to the naive Bayes models. Indexes - the data on its second axis. Integers are interpreted as - positional columns, while strings reference DataFrame columns + Features that correspond to the naive Bayes models. Indexes + the data on its second axis. Integers are interpreted as + positional columns, while strings reference DataFrame columns by name. A callable is passed the input data `X` and must return - one of the above. For example, - :func:`compose.make_column_selector ` + one of the above. For example, + :func:`compose.make_column_selector + ` can select multiple columns by name or dtype. Attributes @@ -172,18 +173,22 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): Examples -------- >>> import numpy as np - >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, Categorical + >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, CategoricalNB >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], ... [2.7, 3.8, 2.3, 1, 0], ... [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) - >>> clf = GeneralNB([ - ... ("gaussian", GaussianNB(), [0, 1, 2]), - ... ("categorical", CategoricalNB(), [3, 4]) - >>> ]) + >>> clf = GeneralNB( + ... [("gaussian", GaussianNB(), [0, 1, 2]), + ... ("categorical", CategoricalNB(), [3, 4])]) >>> clf.fit(X, y) - >>> clf.predict(X[0]) - [1] + GeneralNB(models=[('gaussian', GaussianNB(), + [0, 1, 2]), + ('categorical', + CategoricalNB(), + [3, 4])]) + >>> clf.predict(X[:1,]) + array([1]) """ def __init__(self, models): @@ -264,7 +269,8 @@ def _joint_log_likelihood(self, X): # Obtain the jll of each fitted estimator jlls = [nb_model._joint_log_likelihood( - np.array(_safe_indexing(X, cols, axis=1))) + np.array(_safe_indexing( + nb_model._check_X(X), cols, axis=1))) for _, nb_model, cols in self.models_] # Stack these jlls to give us @@ -373,8 +379,8 @@ def _validate_models(self, X): if all(prior is not None for prior in all_class_priors): if np.max(np.ptp(all_class_priors, axis=0)) < 1e-6: raise ValueError("The parameters 'class_prior' or 'priors' " - "must be the same values throughout all " - "estimators if specified.") + "must be the same values throughout all " + "estimators if specified.") # Check if `fit_prior`s are the same throughout all estimators if not all(all_fit_priors): From 1d991c3a660ad989388c9b61706214a80a7489cb Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 25 Feb 2020 21:08:07 +0800 Subject: [PATCH 38/61] Cast to DataFrame --- sklearn/naive_bayes.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index ee2c68ae16cee..f7e064a4f6b7e 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -20,6 +20,7 @@ from abc import ABCMeta, abstractmethod import copy import numpy as np +import pandas as pd from .base import BaseEstimator, ClassifierMixin from .exceptions import NotFittedError @@ -182,11 +183,8 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): ... [("gaussian", GaussianNB(), [0, 1, 2]), ... 
("categorical", CategoricalNB(), [3, 4])]) >>> clf.fit(X, y) - GeneralNB(models=[('gaussian', GaussianNB(), - [0, 1, 2]), - ('categorical', - CategoricalNB(), - [3, 4])]) + GeneralNB(models=[('gaussian', GaussianNB(...), [0, 1, 2]), + ('categorical', CategoricalNB(...), [3, 4])]) >>> clf.predict(X[:1,]) array([1]) """ @@ -268,10 +266,15 @@ def _joint_log_likelihood(self, X): "must be the same.") # Obtain the jll of each fitted estimator - jlls = [nb_model._joint_log_likelihood( - np.array(_safe_indexing( - nb_model._check_X(X), cols, axis=1))) - for _, nb_model, cols in self.models_] + jlls = [] + for _, nb_model, cols in self.models_: + X_ = nb_model._check_X(X) + # If X is DataFrame, cast X_ back to DataFrame + if self._df_cols is not None: + X_ = pd.DataFrame(X_, columns=self._df_cols) + X_ = np.array(_safe_indexing(X_, cols, axis=1)) + jll = nb_model._joint_log_likelihood(X_) + jlls.append(jll) # Stack these jlls to give us # the shape (estimator, sample, class) @@ -402,7 +405,7 @@ def _check_X_y(self, X, y): self._df_cols = X.columns def _check_X(self, X): - # Check if X should be a pandas dataframe + # Check if X is a pandas dataframe if self._df_cols is not None: if not hasattr(X, "columns"): From 13dc706ee81be2cfa020d71bb5e8d235c8c1fca1 Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 25 Feb 2020 21:08:23 +0800 Subject: [PATCH 39/61] Fix formatting --- doc/modules/naive_bayes.rst | 41 ++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 0d659a528f826..8a4f871e06eaf 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -292,8 +292,8 @@ the libraries and prepare the data: >>> from sklearn.naive_bayes import GeneralNB, GaussianNB, CategoricalNB >>> >>> X = np.array([[1.5, 2.3, 5.7, 0, 1], - >>> [2.7, 3.8, 2.3, 1, 0], - >>> [1.7, 0.1, 4.5, 1, 0]]) + ... [2.7, 3.8, 2.3, 1, 0], + ... [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) >>> X_test = np.array([[1.5, 2.3, 5.7, 0, 1]]) @@ -307,36 +307,45 @@ This manner of specification is similar to that of *transformers* in *pipeline* in :class:`Pipeline `. >>> clf = GeneralNB([ - >>> ("gaussian", GaussianNB(), [0, 1, 2]), - >>> ("categorical", CategoricalNB(), [3, 4]) - >>> ]) + ... ("gaussian", GaussianNB(), [0, 1, 2]), + ... ("categorical", CategoricalNB(), [3, 4]) + ... ]) >>> clf.fit(X, y) + GeneralNB(models=[('gaussian', GaussianNB(...), [0, 1, 2]), + ('categorical', CategoricalNB(...), [3, 4])]) >>> print(clf.predict(X_test)) [1] -Besides specifying a list of ints, you can also indicate column +Besides specifying a list of integers, you can also indicate columns names explicitly if the `X` and `y` data are pandas `DataFrame`s: >>> X = pd.DataFrame(X) >>> X.columns = ["a", "b", "c", "d", "e"] - >>> y = pd.DataFrame(X) - ... + >>> y = pd.DataFrame(y) + >>> >>> clf = GeneralNB([ - >>> ("gaussian", GaussianNB(), ["a", "b", "c"]), - >>> ("categorical", CategoricalNB(), ["d", "e"]) - >>> ]) + ... ("gaussian", GaussianNB(), ["a", "b", "c"]), + ... ("categorical", CategoricalNB(), ["d", "e"]) + ... ]) >>> clf.fit(X, y) + GeneralNB(models=[('gaussian', GaussianNB(...), ['a', 'b', 'c']), + ('categorical', CategoricalNB(...), ['d', 'e'])]) Alternatively, you may also select DataFrame columns using -:func:`sklearn.compose.make_column_selector`: +:func:`sklearn.compose.make_column_selector` as follows. Note that +X and y must be DataFrames. 
>>> from sklearn.compose import make_column_selector >>> clf = GeneralNB([ - >>> ("gaussian", GaussianNB(), make_column_selector(pattern=r"[abc]")), - >>> ("categorical", CategoricalNB(), make_column_selector(pattern=r"[de]")) - >>> ]) + ... ("gaussian", GaussianNB(), + ... make_column_selector(pattern=r"[abc]")), + ... ("categorical", CategoricalNB(), + ... make_column_selector(pattern=r"[de]")) + ... ]) >>> clf.fit(X, y) - >>> print(clf.predict(df_test)) + GeneralNB(models=[('gaussian', GaussianNB(...), ...), + ('categorical', CategoricalNB(...), ...)]) + >>> print(clf.predict(X.iloc[:1,])) [1] Finally, you can access the attributes of the fitted estimators using From f174bcf4989550a0a06c50c8139cf5223c63790e Mon Sep 17 00:00:00 2001 From: raibosome Date: Tue, 25 Feb 2020 21:44:55 +0800 Subject: [PATCH 40/61] Update test --- sklearn/tests/test_naive_bayes.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 1f29e3a76d1f5..1d862be5404e0 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -109,13 +109,8 @@ def test_generalnb_models_too_few_cols(): def test_generalnb_joint_log_likelihood(): """Test whether joint log likelihood has been computed correctly""" - # Get jll from GeneralNB - clf = GeneralNB([ - ("gaussian", GaussianNB(), [0]), - ("bernoulli", BernoulliNB(), [1])] - ) - clf.fit(X, y) - jll = clf._joint_log_likelihood(X) + X = np.array([[-2, 0], [-1, 1], [-1, 0], [1, 1], [1, 0], [2, 1]]) + y = np.array([1, 1, 1, 2, 2, 2]) # Get jll from GaussianNB clf_gnb = GaussianNB() @@ -129,6 +124,14 @@ def test_generalnb_joint_log_likelihood(): jll_bnl = clf_bnl._joint_log_likelihood(X[:, 1, None]) clp_bnl = clf_bnl.class_log_prior_ + # Get jll from GeneralNB + clf = GeneralNB([ + ("gaussian", GaussianNB(), [0]), + ("bernoulli", BernoulliNB(), [1])] + ) + clf.fit(X, y) + jll = clf._joint_log_likelihood(X) + expected_jll = (jll_gnb - clp_gnb + jll_bnl - clp_bnl) + clp_bnl assert_array_almost_equal(jll, expected_jll) From 6fa37b1f47862e9f5d671b90a15fade3ad6cfb22 Mon Sep 17 00:00:00 2001 From: raibosome Date: Sat, 7 Mar 2020 22:48:06 +0800 Subject: [PATCH 41/61] Added FIXMEs --- sklearn/naive_bayes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index f7e064a4f6b7e..7f34b9e797038 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -265,14 +265,16 @@ def _joint_log_likelihood(self, X): raise ValueError("Class priors for every estimator " "must be the same.") + # FIXME # Obtain the jll of each fitted estimator jlls = [] for _, nb_model, cols in self.models_: - X_ = nb_model._check_X(X) + X_ = X.copy() # If X is DataFrame, cast X_ back to DataFrame if self._df_cols is not None: X_ = pd.DataFrame(X_, columns=self._df_cols) X_ = np.array(_safe_indexing(X_, cols, axis=1)) + X_ = nb_model._check_X(X_) jll = nb_model._joint_log_likelihood(X_) jlls.append(jll) @@ -369,7 +371,7 @@ def _validate_models(self, X): for col in cols: if col in dict_col2model: raise ValueError("Duplicate specification of " - "column found.") + f"column {col} found.") else: dict_col2model[col] = estimator.__class__.__name__.lower() self._cols.append(cols) From 0300f3da88fe33315e4663651bb22ad8acdb9e43 Mon Sep 17 00:00:00 2001 From: raibosome Date: Wed, 18 Mar 2020 23:57:24 +0800 Subject: [PATCH 42/61] Trigger CI/CD --- doc/modules/naive_bayes.rst | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index 8a4f871e06eaf..e310387c5d23d 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -316,7 +316,7 @@ This manner of specification is similar to that of *transformers* in >>> print(clf.predict(X_test)) [1] -Besides specifying a list of integers, you can also indicate columns +Besides specifying a list of integers, you can also indicate column names explicitly if the `X` and `y` data are pandas `DataFrame`s: >>> X = pd.DataFrame(X) From 58b8e31062bac088cd5a12fcb5d758414c9e91dd Mon Sep 17 00:00:00 2001 From: raibosome Date: Thu, 19 Mar 2020 00:03:40 +0800 Subject: [PATCH 43/61] Trigger CI/CD --- doc/modules/naive_bayes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index e310387c5d23d..d8e0e44934ee4 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -316,6 +316,7 @@ This manner of specification is similar to that of *transformers* in >>> print(clf.predict(X_test)) [1] + Besides specifying a list of integers, you can also indicate column names explicitly if the `X` and `y` data are pandas `DataFrame`s: From 1ba9a81eadb0dc3f67f24641d90f34048eeb0d84 Mon Sep 17 00:00:00 2001 From: raibosome Date: Thu, 19 Mar 2020 19:25:55 +0800 Subject: [PATCH 44/61] No need to ensure model is internal --- sklearn/naive_bayes.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index adc870efdd903..d7494ef1be324 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -352,9 +352,6 @@ def _validate_models(self, X): raise TypeError("Naive bayes estimator should implement " "the fit and _joint_log_likelihood methods. " "{} doesn't.".format(type(estimator))) - if estimator.__class__.__name__ not in valid_modules: - raise ValueError( - "Distributions should be one of {}".format(valid_modules)) # Check naive bayes estimator for attributes # like `priors`, `class_prior` and `fit_prior` From 2428f368e1b877ce38f72ae64b20f61855d6a071 Mon Sep 17 00:00:00 2001 From: raibosome Date: Thu, 19 Mar 2020 19:30:56 +0800 Subject: [PATCH 45/61] Fix typo --- sklearn/naive_bayes.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index d7494ef1be324..1768cd5b67436 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -140,8 +140,8 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): name : string This is a user-defined identifier that allows the models and its - parameters to be retrieved and set later using :meth:`get_params_` - and :meth:`set_params_`. + parameters to be retrieved and set later using :meth:`get_params` + and :meth:`set_params`. naive Bayes model : Estimator The naive Bayes model represents the distribution assumption on the features. Use our naive Bayes estimators like @@ -352,6 +352,9 @@ def _validate_models(self, X): raise TypeError("Naive bayes estimator should implement " "the fit and _joint_log_likelihood methods. 
" "{} doesn't.".format(type(estimator))) + if estimator.__class__.__name__ not in valid_modules: + raise ValueError( + "Distributions should be one of {}".format(valid_modules)) # Check naive bayes estimator for attributes # like `priors`, `class_prior` and `fit_prior` From d58505ae76ae891cfab915d0b3a6ccff7ce4beea Mon Sep 17 00:00:00 2001 From: raibosome Date: Thu, 19 Mar 2020 19:52:54 +0800 Subject: [PATCH 46/61] Update docs --- doc/modules/naive_bayes.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/modules/naive_bayes.rst b/doc/modules/naive_bayes.rst index d8e0e44934ee4..817317f39d1cc 100644 --- a/doc/modules/naive_bayes.rst +++ b/doc/modules/naive_bayes.rst @@ -303,8 +303,7 @@ and the corresponding columns for every naive Bayes model. Below we defined two tuples, one for the `GaussianNB()` and one for the `CategoricalNB()` model. This manner of specification is similar to that of *transformers* in -:class:`ColumnTransformer ` or -*pipeline* in :class:`Pipeline `. +:class:`ColumnTransformer `. >>> clf = GeneralNB([ ... ("gaussian", GaussianNB(), [0, 1, 2]), From 3d38d7163b7713eabbb13aacce2c1c16317c7701 Mon Sep 17 00:00:00 2001 From: raibosome Date: Thu, 19 Mar 2020 19:53:18 +0800 Subject: [PATCH 47/61] Mention that models_ attribute is fitted --- sklearn/naive_bayes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 1768cd5b67436..02be510e6c863 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -162,7 +162,7 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): Attributes ---------- models_ : list of tuples - Verified list of (name, naive Bayes model, column(s)), + List of (name, fitted naive Bayes model, column(s)), based on `self.models`. classes_ : ndarray of shape (n_classes,) From c3b68a05e8d63948a1ea365dc60e3c77aaeadce0 Mon Sep 17 00:00:00 2001 From: raibosome Date: Fri, 20 Mar 2020 08:28:16 +0800 Subject: [PATCH 48/61] Remove restriction to use sklearn's naive bayes --- sklearn/naive_bayes.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 02be510e6c863..f4173a6cc0d74 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -352,9 +352,6 @@ def _validate_models(self, X): raise TypeError("Naive bayes estimator should implement " "the fit and _joint_log_likelihood methods. 
" "{} doesn't.".format(type(estimator))) - if estimator.__class__.__name__ not in valid_modules: - raise ValueError( - "Distributions should be one of {}".format(valid_modules)) # Check naive bayes estimator for attributes # like `priors`, `class_prior` and `fit_prior` From b854b05947c77442a746b5ad3545d164e4a8b366 Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 4 May 2020 21:04:34 +0800 Subject: [PATCH 49/61] Remove attributes initialised as None --- sklearn/naive_bayes.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index c258574911c3d..53b9605249130 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -194,11 +194,6 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): def __init__(self, models): self.models = models - self.models_ = None - self.classes_ = None - self.n_features_ = None - self._cols = None - self._df_cols = None self._is_fitted = False def fit(self, X, y): From 2c1cf1b1515864e2dc76500784b70da30bda4bba Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 4 May 2020 21:05:57 +0800 Subject: [PATCH 50/61] Add comment for `callable` check --- sklearn/naive_bayes.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 53b9605249130..eb490574300b5 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -341,10 +341,12 @@ def _validate_models(self, X): _, estimator, cols = model - # Check naive bayes estimator for format - # `fit` and `_joint_log_likelihood` attributes + # Check if user specified say `GaussianNB()` instead of `GaussianNB` if callable(estimator): raise ValueError("Estimator should be a callable specified.") + + # Check naive bayes estimator for format + # `fit` and `_joint_log_likelihood` attributes if not (hasattr(estimator, "fit") or hasattr(estimator, "_joint_log_likelihood")): raise TypeError("Naive bayes estimator should implement " From 927d40154348fbc5e80ccb2ea070f780b981ded0 Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 4 May 2020 21:08:27 +0800 Subject: [PATCH 51/61] Move prior checking to fit --- sklearn/naive_bayes.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index eb490574300b5..8c0ba7b96217b 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -221,6 +221,21 @@ def fit(self, X, y): for (name, nb_model, _), cols in zip(self.models, self._cols)] + # Obtain the log priors from each fitted estimator + all_log_priors = [ + nb_model.class_log_prior_ + if hasattr(nb_model, 'class_log_prior_') + else np.log(nb_model.class_prior_) + for _, nb_model, _ in self.models_] + + # Ensure class log priors are the same for all estimators + all_log_priors = np.hstack([all_log_priors]) + if np.max(np.ptp(all_log_priors, axis=0)) < 1e-6: + log_prior = all_log_priors[0] + else: + raise ValueError("Class priors for every estimator " + "must be the same.") + self._is_fitted = True return self @@ -248,21 +263,6 @@ def _joint_log_likelihood(self, X): raise NotFittedError("Call the fit() method first " "before calling predict().") - # Obtain the log priors from each fitted estimator - all_log_priors = [ - nb_model.class_log_prior_ - if hasattr(nb_model, 'class_log_prior_') - else np.log(nb_model.class_prior_) - for _, nb_model, _ in self.models_] - - # Ensure class log priors are the same for all estimators - all_log_priors = np.hstack([all_log_priors]) - if 
np.max(np.ptp(all_log_priors, axis=0)) < 1e-6: - log_prior = all_log_priors[0] - else: - raise ValueError("Class priors for every estimator " - "must be the same.") - # FIXME # Obtain the jll of each fitted estimator jlls = [] From 1196267317de836ce56dab1fa81407d6d69ba4d1 Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 4 May 2020 21:09:42 +0800 Subject: [PATCH 52/61] Add class_prior and fit_prior to constructor --- sklearn/naive_bayes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 8c0ba7b96217b..05583733512fc 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -192,7 +192,8 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): array([1]) """ - def __init__(self, models): + # FIXME add `class_prior` and `fit_prior` + def __init__(self, models, class_prior, fit_prior): self.models = models self._is_fitted = False From 15b0876e8fb9c47b721ec1a8ba5f6daf62ddd85a Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 10 May 2020 19:49:25 +0800 Subject: [PATCH 53/61] Remove class_prior and fit_prior in _validate_models --- sklearn/naive_bayes.py | 58 ++++++++++++++++-------------------------- 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 05583733512fc..c44eb757f22b5 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -30,13 +30,14 @@ from .preprocessing import label_binarize from .utils.extmath import safe_sparse_dot from .utils import check_X_y, check_array, deprecated, Bunch, _safe_indexing -from .utils.fixes import logsumexp +# from .utils.fixes import logsumexp from .utils.validation import _check_sample_weight from .utils.metaestimators import _BaseComposition from .utils.multiclass import _check_partial_fit_first_call from .utils.validation import check_is_fitted, check_non_negative, column_or_1d from .utils.validation import _check_sample_weight from .utils.validation import _deprecate_positional_args +import ipdb __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', 'CategoricalNB', 'GeneralNB'] @@ -182,8 +183,8 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): ... [2.7, 3.8, 2.3, 1, 0], ... [1.7, 0.1, 4.5, 1, 0]]) >>> y = np.array([1, 0, 0]) - >>> clf = GeneralNB( - ... [("gaussian", GaussianNB(), [0, 1, 2]), + >>> clf = GeneralNB(models=[ + ... ("gaussian", GaussianNB(), [0, 1, 2]), ... 
("categorical", CategoricalNB(), [3, 4])]) >>> clf.fit(X, y) GeneralNB(models=[('gaussian', GaussianNB(...), [0, 1, 2]), @@ -192,9 +193,10 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): array([1]) """ - # FIXME add `class_prior` and `fit_prior` - def __init__(self, models, class_prior, fit_prior): + def __init__(self, *, models, fit_prior=False, class_prior=None): self.models = models + self.fit_prior = fit_prior + self.class_prior = class_prior self._is_fitted = False def fit(self, X, y): @@ -213,9 +215,19 @@ def fit(self, X, y): self : object """ self._validate_models(X) - self._check_X_y(X, y) + # self._check_X_y(X, y) + + # self.classes_ = np.unique(y) + + for i in range(len(self.models)): + if hasattr(self.models[i][1], "fit_prior"): + self.models[i][1].fit_prior = self.fit_prior + self.models[i][1].class_prior = self.class_prior + else: + self.models[i][1].priors = self.class_prior - self.classes_ = np.unique(y) + ipdb.set_trace() + self.models_ = [ (name, nb_model.fit(_safe_indexing(X, cols, axis=1), y), cols) @@ -239,7 +251,8 @@ def fit(self, X, y): self._is_fitted = True - return self + def _update_attributes(self): + self.models def _joint_log_likelihood(self, X): """Calculate the posterior log probability of sample X @@ -319,8 +332,6 @@ def _validate_models(self, X): self._cols = [] dict_col2model = {} - all_fit_priors = [] - all_class_priors = [] if not isinstance(self.models, list): raise TypeError( @@ -344,7 +355,7 @@ def _validate_models(self, X): # Check if user specified say `GaussianNB()` instead of `GaussianNB` if callable(estimator): - raise ValueError("Estimator should be a callable specified.") + raise ValueError("Estimator should be a callable.") # Check naive bayes estimator for format # `fit` and `_joint_log_likelihood` attributes @@ -354,14 +365,6 @@ def _validate_models(self, X): "the fit and _joint_log_likelihood methods. 
" "{} doesn't.".format(type(estimator))) - # Check naive bayes estimator for attributes - # like `priors`, `class_prior` and `fit_prior` - class_prior = getattr(estimator, "priors", None) or \ - getattr(estimator, "class_prior", None) - fit_prior = getattr(estimator, "fit_prior", True) - all_class_priors.append(class_prior) - all_fit_priors.append(fit_prior) - # Check the columns for duplicate models and # convert to feature if callable if callable(cols): @@ -374,23 +377,6 @@ def _validate_models(self, X): dict_col2model[col] = estimator.__class__.__name__.lower() self._cols.append(cols) - # Check if class priors are the same throughout all estimators - if not all(prior is None for prior in all_class_priors): - raise ValueError("The parameters 'class_prior' or 'priors' " - "must be the same values throughout all " - "estimators if specified.") - if all(prior is not None for prior in all_class_priors): - if np.max(np.ptp(all_class_priors, axis=0)) < 1e-6: - raise ValueError("The parameters 'class_prior' or 'priors' " - "must be the same values throughout all " - "estimators if specified.") - - # Check if `fit_prior`s are the same throughout all estimators - if not all(all_fit_priors): - raise ValueError("The parameter 'fit_prior' must be " - "the same values through out all estimators " - "if specified.") - n_features = X.shape[-1] n_cols = len(dict_col2model) if n_cols != n_features: From 3551542eee180259bbd6c4fdc9effa461f2c6f36 Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 10 May 2020 19:50:01 +0800 Subject: [PATCH 54/61] Remove unused variables --- sklearn/naive_bayes.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index c44eb757f22b5..570b8dafe0403 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -327,9 +327,6 @@ def set_params(self, **kwargs): def _validate_models(self, X): - valid_modules = copy.copy(__all__) - valid_modules.remove("GeneralNB") - self._cols = [] dict_col2model = {} From e695c29508f0cf24f82df41f68102e6a51d70df9 Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 10 May 2020 19:51:53 +0800 Subject: [PATCH 55/61] Remove checking of all_log_priors --- sklearn/naive_bayes.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 570b8dafe0403..1d4c925f05eeb 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -234,26 +234,8 @@ def fit(self, X, y): for (name, nb_model, _), cols in zip(self.models, self._cols)] - # Obtain the log priors from each fitted estimator - all_log_priors = [ - nb_model.class_log_prior_ - if hasattr(nb_model, 'class_log_prior_') - else np.log(nb_model.class_prior_) - for _, nb_model, _ in self.models_] - - # Ensure class log priors are the same for all estimators - all_log_priors = np.hstack([all_log_priors]) - if np.max(np.ptp(all_log_priors, axis=0)) < 1e-6: - log_prior = all_log_priors[0] - else: - raise ValueError("Class priors for every estimator " - "must be the same.") - self._is_fitted = True - def _update_attributes(self): - self.models - def _joint_log_likelihood(self, X): """Calculate the posterior log probability of sample X From d250bed619163e7ef8d6870511497243008e1031 Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 10 May 2020 19:59:52 +0800 Subject: [PATCH 56/61] Add comments in fit() method --- sklearn/naive_bayes.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 
1d4c925f05eeb..08a3faa691d5a 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -215,10 +215,12 @@ def fit(self, X, y): self : object """ self._validate_models(X) - # self._check_X_y(X, y) - - # self.classes_ = np.unique(y) + self._check_X_y(X, y) + # Apply self.fit_prior and self.class_prior to + # all the models specified by user at index 1. + # Continuous models like GaussianNB do not + # have an attribute equivalent to fit_prior for i in range(len(self.models)): if hasattr(self.models[i][1], "fit_prior"): self.models[i][1].fit_prior = self.fit_prior @@ -226,9 +228,10 @@ def fit(self, X, y): else: self.models[i][1].priors = self.class_prior - ipdb.set_trace() - + self.classes_ = np.unique(y) + # Create an attribute that is a verified version + # of the user-specified self.models self.models_ = [ (name, nb_model.fit(_safe_indexing(X, cols, axis=1), y), cols) for (name, nb_model, _), cols From c6cfeff330deeb42b13589648941b372060218b7 Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 10 May 2020 20:29:20 +0800 Subject: [PATCH 57/61] Remove custom NotFittedError exception --- sklearn/naive_bayes.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 08a3faa691d5a..2c93ceeeb4064 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -37,7 +37,7 @@ from .utils.validation import check_is_fitted, check_non_negative, column_or_1d from .utils.validation import _check_sample_weight from .utils.validation import _deprecate_positional_args -import ipdb +# import ipdb __all__ = ['BernoulliNB', 'GaussianNB', 'MultinomialNB', 'ComplementNB', 'CategoricalNB', 'GeneralNB'] @@ -237,8 +237,6 @@ def fit(self, X, y): for (name, nb_model, _), cols in zip(self.models, self._cols)] - self._is_fitted = True - def _joint_log_likelihood(self, X): """Calculate the posterior log probability of sample X @@ -252,15 +250,7 @@ def _joint_log_likelihood(self, X): ------- jll : ndarray, shape (1, n_classes) Posterior log probability. - - Raises - ------ - NotFittedError - If estimators have not been fitted """ - if not self._is_fitted: - raise NotFittedError("Call the fit() method first " - "before calling predict().") # FIXME # Obtain the jll of each fitted estimator From 47f8207f59982213ba04bdd332f79c95cb6dec25 Mon Sep 17 00:00:00 2001 From: raibosome Date: Sun, 10 May 2020 21:01:14 +0800 Subject: [PATCH 58/61] Refactor _validate_models and add comments --- sklearn/naive_bayes.py | 57 ++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 2c93ceeeb4064..544e34d4c4322 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -302,16 +302,41 @@ def set_params(self, **kwargs): def _validate_models(self, X): - self._cols = [] - dict_col2model = {} - + # Check type of self.models if not isinstance(self.models, list): raise TypeError( "Expected list but got {}".format(type(self.models))) + # Check names in self.models names, _, _ = zip(*self.models) self._validate_names(names) + # Check columns in self.models for duplicates + # convert to feature if callable + self._cols = [] + dict_col2model = {} + if callable(cols): + cols = cols(X) + for col in cols: + if col in dict_col2model: + raise ValueError("Duplicate specification of " + f"column {col} found.") + else: + dict_col2model[col] = estimator.__class__.__name__.lower() + self._cols.append(cols) + + # This checks if the no. 
of columns in the dataset + # matches the columns specified + # TODO: Lift this restriction and use a `remainder` parameter + n_features = X.shape[-1] + n_cols_specified = len(dict_col2model) + if n_cols_specified != n_features: + raise ValueError("Expected {} columns ".format(n_features) + + "in X but {} ".format(n_cols_specified) + + "were specified.") + self.n_features_ = n_features + + # Lastly, check the estimators in self.models for model in self.models: # Check type of each entry in list @@ -323,13 +348,14 @@ def _validate_models(self, X): raise ValueError("Expected tuple to have length of 3 " "but got {}".format(len(model))) - _, estimator, cols = model + _, estimator, _ = model - # Check if user specified say `GaussianNB()` instead of `GaussianNB` + # Check if user specified say `GaussianNB()` instead of + # `GaussianNB` if callable(estimator): raise ValueError("Estimator should be a callable.") - # Check naive bayes estimator for format + # Check naive bayes estimator for # `fit` and `_joint_log_likelihood` attributes if not (hasattr(estimator, "fit") or hasattr(estimator, "_joint_log_likelihood")): @@ -337,25 +363,6 @@ def _validate_models(self, X): "the fit and _joint_log_likelihood methods. " "{} doesn't.".format(type(estimator))) - # Check the columns for duplicate models and - # convert to feature if callable - if callable(cols): - cols = cols(X) - for col in cols: - if col in dict_col2model: - raise ValueError("Duplicate specification of " - f"column {col} found.") - else: - dict_col2model[col] = estimator.__class__.__name__.lower() - self._cols.append(cols) - - n_features = X.shape[-1] - n_cols = len(dict_col2model) - if n_cols != n_features: - raise ValueError("Expected {} columns".format(n_features) + - " in X but {} were specified.".format(n_cols)) - self.n_features_ = n_features - def _check_X_y(self, X, y): # Delay any further checks on X and y to the respective estimators # Only checks if X is pandas dataframe From 5c717ee6236c1ecceac39697a953a1de73ef74a9 Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 11 May 2020 21:14:25 +0800 Subject: [PATCH 59/61] Remove self._is_fitted --- sklearn/naive_bayes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 544e34d4c4322..ef0119abafaa3 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -197,7 +197,6 @@ def __init__(self, *, models, fit_prior=False, class_prior=None): self.models = models self.fit_prior = fit_prior self.class_prior = class_prior - self._is_fitted = False def fit(self, X, y): """Fit X and y to the naive Bayes estimators. 
From fc2ff64727a32f347fd9951647f42fe260284a87 Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 11 May 2020 21:24:36 +0800 Subject: [PATCH 60/61] Reposition marker for positional args --- sklearn/naive_bayes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index ef0119abafaa3..a795c63b02857 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -193,7 +193,7 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): array([1]) """ - def __init__(self, *, models, fit_prior=False, class_prior=None): + def __init__(self, models, *, fit_prior=False, class_prior=None): self.models = models self.fit_prior = fit_prior self.class_prior = class_prior From 23d8cdfc2f0e1d78a4f986b2c580ac3b77a99fdc Mon Sep 17 00:00:00 2001 From: raibosome Date: Mon, 11 May 2020 21:39:41 +0800 Subject: [PATCH 61/61] Add `remainder` parameter and docstring --- sklearn/naive_bayes.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index a795c63b02857..10f84b46352d6 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -163,6 +163,14 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): ` can select multiple columns by name or dtype. + remainder : Estimator, default=None + By default, only the specified columns in `models` are + used for fitting, and the non-specified columns are dropped + (default of ``None``). + By specifying ``remainder=GaussianNB()`` for example, all remaining + columns that are not specified in `models` will be modelled using + Gaussian Naive Bayes. + Attributes ---------- models_ : list of tuples @@ -193,8 +201,13 @@ class GeneralNB(_BaseNB, _BaseComposition, ClassifierMixin): array([1]) """ - def __init__(self, models, *, fit_prior=False, class_prior=None): + def __init__(self, + models, *, + remainder=None, + fit_prior=False, + class_prior=None): self.models = models + self.remainder = remainder self.fit_prior = fit_prior self.class_prior = class_prior @@ -216,9 +229,9 @@ def fit(self, X, y): self._validate_models(X) self._check_X_y(X, y) - # Apply self.fit_prior and self.class_prior to - # all the models specified by user at index 1. - # Continuous models like GaussianNB do not + # Apply self.fit_prior and self.class_prior to + # all the models specified by user at index 1. + # Continuous models like GaussianNB do not # have an attribute equivalent to fit_prior for i in range(len(self.models)): if hasattr(self.models[i][1], "fit_prior"): @@ -319,13 +332,13 @@ def _validate_models(self, X): for col in cols: if col in dict_col2model: raise ValueError("Duplicate specification of " - f"column {col} found.") + f"column {col} found.") else: dict_col2model[col] = estimator.__class__.__name__.lower() self._cols.append(cols) - # This checks if the no. of columns in the dataset - # matches the columns specified + # This checks if the no. 
of columns in the dataset
+        # matches the columns specified
         # TODO: Lift this restriction and use a `remainder` parameter
         n_features = X.shape[-1]
         n_cols_specified = len(dict_col2model)
@@ -334,7 +347,7 @@ def _validate_models(self, X):
                 "in X but {} ".format(n_cols_specified) +
                 "were specified.")
         self.n_features_ = n_features

         # Lastly, check the estimators in self.models
         for model in self.models:

@@ -349,7 +362,7 @@ def _validate_models(self, X):

             _, estimator, _ = model

-            # Check if user specified say `GaussianNB()` instead of
-            # `GaussianNB`
+            # Check if the user passed the class `GaussianNB` instead
+            # of an instance like `GaussianNB()`
             if callable(estimator):
-                raise ValueError("Estimator should be a callable.")
+                raise ValueError("Estimator should be an instance "
+                                 "such as GaussianNB(), not a class.")
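
The new `remainder` parameter is documented above, but the TODO in
`_validate_models` shows that the full-column-coverage restriction has not
been lifted yet. A sketch of the intended usage once it is wired up,
hypothetical behaviour inferred from the docstring:

    import numpy as np
    from sklearn.naive_bayes import CategoricalNB, GaussianNB, GeneralNB

    X = np.array([[1.5, 2.3, 5.7, 0, 1],
                  [2.7, 3.8, 2.3, 1, 0],
                  [1.7, 0.1, 4.5, 1, 0]])
    y = np.array([1, 0, 0])

    # Columns 3 and 4 are left unspecified; remainder=CategoricalNB()
    # would model them instead of dropping them.
    clf = GeneralNB([("gaussian", GaussianNB(), [0, 1, 2])],
                    remainder=CategoricalNB())
    clf.fit(X, y)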