scikit-learn · NicolasHug · Apr 5, 2019 · Feb 18, 2019 · Feb 28, 2019 · Feb 28, 2019
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
@@ -225,6 +225,11 @@ Support for Python 3.4 and below has been officially dropped.
   with the document and the caller functions.
   :issue:`6463` by :user:`movelikeriver <movelikeriver>`.
 
+- |Fix| :func:`ensemble.partial_dependence` now takes sample weights into
+  account for the partial dependence computation when the
+  gradient boosting model has been trained with sample weights.
+  :issue:`13193` by :user:`Samuel O. Ronsin <samronsin>`.
+
 :mod:`sklearn.externals`
 ........................
 

diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx
@@ -342,15 +342,15 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X,
                     # push left child
                     node_stack[stack_size] = root_node + current_node.left_child
                     current_weight = weight_stack[stack_size]
-                    left_sample_frac = root_node[current_node.left_child].n_node_samples / \
-                                       <double>current_node.n_node_samples
+                    left_sample_frac = root_node[current_node.left_child].weighted_n_node_samples / \
+                                       current_node.weighted_n_node_samples
                     if left_sample_frac <= 0.0 or left_sample_frac >= 1.0:
-                        raise ValueError("left_sample_frac:%f, "
-                                         "n_samples current: %d, "
-                                         "n_samples left: %d"
+                        raise ValueError("left_sample_frac:%f, "
+                                         "weighted_n_node_samples current: %f, "
+                                         "weighted_n_node_samples left: %f"
                                          % (left_sample_frac,
-                                            current_node.n_node_samples,
-                                            root_node[current_node.left_child].n_node_samples))
+                                            current_node.weighted_n_node_samples,
+                                            root_node[current_node.left_child].weighted_n_node_samples))
                     weight_stack[stack_size] = current_weight * left_sample_frac
                     stack_size +=1
 

diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py
@@ -4,7 +4,7 @@
 import pytest
 
 import numpy as np
-from numpy.testing import assert_array_equal
+from numpy.testing import assert_array_equal, assert_allclose
 
 from sklearn.utils.testing import assert_raises
 from sklearn.utils.testing import if_matplotlib
@@ -18,8 +18,7 @@
 # toy sample
 X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
 y = [-1, -1, -1, 1, 1, 1]
-T = [[-1, -1], [2, 2], [3, 2]]
-true_result = [-1, 1, 1]
+sample_weight = [1, 1, 1, 2, 2, 2]
 
 # also load the boston dataset
 boston = datasets.load_boston()
@@ -47,6 +46,24 @@ def test_partial_dependence_classifier():
     assert axes is None
     assert_array_equal(pdp, pdp_2)
 
+    # with trivial (no-op) sample weights
+    clf.fit(X, y, sample_weight=np.ones(len(y)))
+
+    pdp_w, axes_w = partial_dependence(clf, [0], X=X, grid_resolution=5)
+
+    assert pdp_w.shape == (1, 4)
+    assert axes_w[0].shape[0] == 4
+    assert_allclose(pdp_w, pdp)
+
+    # with non-trivial sample weights
+    clf.fit(X, y, sample_weight=sample_weight)
+
+    pdp_w2, axes_w2 = partial_dependence(clf, [0], X=X, grid_resolution=5)
+
+    assert pdp_w2.shape == (1, 4)
+    assert axes_w2[0].shape[0] == 4
+    assert np.all(np.abs(pdp_w2 - pdp_w) / np.abs(pdp_w) > 0.1)
+
 
 def test_partial_dependence_multiclass():
     # Test partial dependence for multi-class classifier
@@ -76,6 +93,31 @@ def test_partial_dependence_regressor():
     assert axes[0].shape[0] == grid_resolution
 
 
+def test_partial_dependence_sample_weight():
+    # Check that partial dependence honours the training sample weights: the
+    # y = x samples get 1000x the weight, so the PDP on x must track the grid.
+    N = 1000
+    rng = np.random.RandomState(123456)
+    mask = rng.randint(2, size=N, dtype=bool)
+
+    x = rng.rand(N)
+    # y = x where mask is True, y = -x elsewhere
+    y = x.copy()
+    y[~mask] = -y[~mask]
+    X = np.c_[mask, x]  # column 0: mask, column 1: x
+    # weight the y = x rows 1000 times more than the y = -x rows
+    sample_weight = np.ones(N)
+    sample_weight[mask] = 1000.
+
+    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
+    clf.fit(X, y, sample_weight=sample_weight)
+
+    grid = np.arange(0, 1, 0.01)
+    pdp = partial_dependence(clf, [1], grid=grid)  # feature 1 is x
+
+    assert np.corrcoef(np.ravel(pdp[0]), grid)[0, 1] > 0.99
+
+
 def test_partial_dependecy_input():
     # Test input validation of partial dependence.
     clf = GradientBoostingClassifier(n_estimators=10, random_state=1)