diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index be64a0299f858..29580366e0f36 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -225,6 +225,11 @@ Support for Python 3.4 and below has been officially dropped. with the document and the caller functions. :issue:`6463` by :user:`movelikeriver `. +- |Fix| :func:`ensemble.partial_dependence` now takes sample weights into + account for the partial dependence computation when the + gradient boosting model has been trained with sample weights. + :issue:`13193` by :user:`Samuel O. Ronsin `. + :mod:`sklearn.externals` ........................ diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index 7536af8edd547..d0c0eeed49592 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -342,15 +342,15 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X, # push left child node_stack[stack_size] = root_node + current_node.left_child current_weight = weight_stack[stack_size] - left_sample_frac = root_node[current_node.left_child].n_node_samples / \ - current_node.n_node_samples + left_sample_frac = root_node[current_node.left_child].weighted_n_node_samples / \ + current_node.weighted_n_node_samples if left_sample_frac <= 0.0 or left_sample_frac >= 1.0: - raise ValueError("left_sample_frac:%f, " - "n_samples current: %d, " - "n_samples left: %d" + raise ValueError("left_sample_frac:%f, " + "weighted_n_node_samples current: %d, " + "weighted_n_node_samples left: %d" % (left_sample_frac, - current_node.n_node_samples, - root_node[current_node.left_child].n_node_samples)) + current_node.weighted_n_node_samples, + root_node[current_node.left_child].weighted_n_node_samples)) weight_stack[stack_size] = current_weight * left_sample_frac stack_size +=1 diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index 5bdb563199ebf..2321d455aa4e3 
100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -4,7 +4,7 @@ import pytest import numpy as np -from numpy.testing import assert_array_equal +from numpy.testing import assert_array_equal, assert_allclose from sklearn.utils.testing import assert_raises from sklearn.utils.testing import if_matplotlib @@ -18,8 +18,7 @@ # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] -T = [[-1, -1], [2, 2], [3, 2]] -true_result = [-1, 1, 1] +sample_weight = [1, 1, 1, 2, 2, 2] # also load the boston dataset boston = datasets.load_boston() @@ -47,6 +46,24 @@ def test_partial_dependence_classifier(): assert axes is None assert_array_equal(pdp, pdp_2) + # with trivial (no-op) sample weights + clf.fit(X, y, sample_weight=np.ones(len(y))) + + pdp_w, axes_w = partial_dependence(clf, [0], X=X, grid_resolution=5) + + assert pdp_w.shape == (1, 4) + assert axes_w[0].shape[0] == 4 + assert_allclose(pdp_w, pdp) + + # with non-trivial sample weights + clf.fit(X, y, sample_weight=sample_weight) + + pdp_w2, axes_w2 = partial_dependence(clf, [0], X=X, grid_resolution=5) + + assert pdp_w2.shape == (1, 4) + assert axes_w2[0].shape[0] == 4 + assert np.all(np.abs(pdp_w2 - pdp_w) / np.abs(pdp_w) > 0.1) + def test_partial_dependence_multiclass(): # Test partial dependence for multi-class classifier @@ -76,6 +93,31 @@ def test_partial_dependence_regressor(): assert axes[0].shape[0] == grid_resolution +def test_partial_dependence_sample_weight(): + # Test near perfect correlation between partial dependence and diagonal + # when sample weights emphasize y = x predictions + N = 1000 + rng = np.random.RandomState(123456) + mask = rng.randint(2, size=N, dtype=bool) + + x = rng.rand(N) + # set y = x on mask and y = -x outside + y = x.copy() + y[~mask] = -y[~mask] + X = np.c_[mask, x] + # sample weights to emphasize data points where y = x + sample_weight = np.ones(N) + sample_weight[mask] = 
1000. + + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(X, y, sample_weight=sample_weight) + + grid = np.arange(0, 1, 0.01) + pdp = partial_dependence(clf, [1], grid=grid) + + assert np.corrcoef(np.ravel(pdp[0]), grid)[0, 1] > 0.99 + + def test_partial_dependecy_input(): # Test input validation of partial dependence. clf = GradientBoostingClassifier(n_estimators=10, random_state=1)