From 57f9a6f1c1a47d91f049dd716a03edaf42a9a036 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 19 Oct 2015 07:54:45 -0700 Subject: [PATCH 01/23] general partial dependence plots --- sklearn/partial_dependence.py | 165 ++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 sklearn/partial_dependence.py diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py new file mode 100644 index 0000000000000..f971978dbd7c7 --- /dev/null +++ b/sklearn/partial_dependence.py @@ -0,0 +1,165 @@ +"""Partial dependence plots for tree ensembles. """ + +# Authors: Peter Prettenhofer +# License: BSD 3 clause + +from itertools import count +import numbers + +import numpy as np +from scipy.stats.mstats import mquantiles + +from .utils.extmath import cartesian +from .externals.joblib import Parallel, delayed +from .externals import six +from .externals.six.moves import map, range, zip +from .utils import check_array +from .tree._tree import DTYPE + +from .base import ClassifierMixin, RegressorMixin +from .ensemble._gradient_boosting import _partial_dependence_tree +from .ensemble.gradient_boosting import BaseGradientBoosting + + +def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): + """Generate a grid of points based on the ``percentiles of ``X``. + + The grid is generated by placing ``grid_resolution`` equally + spaced points between the ``percentiles`` of each column + of ``X``. + + Parameters + ---------- + X : ndarray + The data + percentiles : tuple of floats + The percentiles which are used to construct the extreme + values of the grid axes. + grid_resolution : int + The number of equally spaced points that are placed + on the grid. + + Returns + ------- + grid : ndarray + All data points on the grid; ``grid.shape[1] == X.shape[1]`` + and ``grid.shape[0] == grid_resolution * X.shape[1]``. + axes : seq of ndarray + The axes with which the grid has been created. + """ + if len(percentiles) != 2: + raise ValueError('percentile must be tuple of len 2') + if not all(0. <= x <= 1. for x in percentiles): + raise ValueError('percentile values must be in [0, 1]') + + axes = [] + for col in range(X.shape[1]): + uniques = np.unique(X[:, col]) + if uniques.shape[0] < grid_resolution: + # feature has low resolution use unique vals + axis = uniques + else: + emp_percentiles = mquantiles(X, prob=percentiles, axis=0) + # create axis based on percentiles and grid resolution + axis = np.linspace(emp_percentiles[0, col], + emp_percentiles[1, col], + num=grid_resolution, endpoint=True) + axes.append(axis) + + return cartesian(axes), axes + + +def partial_dependence(gbrt, target_variables, grid=None, X=None, + percentiles=(0.05, 0.95), grid_resolution=100): + """Partial dependence of ``target_variables``. + + Partial dependence plots show the dependence between the joint values + of the ``target_variables`` and the function represented + by the ``gbrt``. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + gbrt : BaseGradientBoosting + A fitted gradient boosting model. + target_variables : array-like, dtype=int + The target features for which the partial dependecy should be + computed (size should be smaller than 3 for visual renderings). + grid : array-like, shape=(n_points, len(target_variables)) + The grid of ``target_variables`` values for which the + partial dependecy should be evaluated (either ``grid`` or ``X`` + must be specified). + X : array-like, shape=(n_samples, n_features) + The data on which ``gbrt`` was trained. 
It is used to generate + a ``grid`` for the ``target_variables``. The ``grid`` comprises + ``grid_resolution`` equally spaced points between the two + ``percentiles``. + percentiles : (low, high), default=(0.05, 0.95) + The lower and upper percentile used create the extreme values + for the ``grid``. Only if ``X`` is not None. + grid_resolution : int, default=100 + The number of equally spaced points on the ``grid``. + + Returns + ------- + pdp : array, shape=(n_classes, n_points) + The partial dependence function evaluated on the ``grid``. + For regression and binary classification ``n_classes==1``. + axes : seq of ndarray or None + The axes with which the grid has been created or None if + the grid has been given. + + Examples + -------- + >>> samples = [[0, 0, 2], [1, 0, 0]] + >>> labels = [0, 1] + >>> from sklearn.ensemble import GradientBoostingClassifier + >>> gb = GradientBoostingClassifier(random_state=0).fit(samples, labels) + >>> kwargs = dict(X=samples, percentiles=(0, 1), grid_resolution=2) + >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP + (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) + """ + if not isinstance(gbrt, BaseGradientBoosting): + raise ValueError('gbrt has to be an instance of BaseGradientBoosting') + if gbrt.estimators_.shape[0] == 0: + raise ValueError('Call %s.fit before partial_dependence' % + gbrt.__class__.__name__) + if (grid is None and X is None) or (grid is not None and X is not None): + raise ValueError('Either grid or X must be specified') + + target_variables = np.asarray(target_variables, dtype=np.int32, + order='C').ravel() + + if any([not (0 <= fx < gbrt.n_features) for fx in target_variables]): + raise ValueError('target_variables must be in [0, %d]' + % (gbrt.n_features - 1)) + + if X is not None: + X = check_array(X, dtype=DTYPE, order='C') + grid, axes = _grid_from_X(X[:, target_variables], percentiles, + grid_resolution) + else: + assert grid is not None + # dont return axes if grid is given + axes = None + # grid must be 2d + if grid.ndim == 1: + grid = grid[:, np.newaxis] + if grid.ndim != 2: + raise ValueError('grid must be 2d but is %dd' % grid.ndim) + + grid = np.asarray(grid, dtype=DTYPE, order='C') + assert grid.shape[1] == target_variables.shape[0] + + n_trees_per_stage = gbrt.estimators_.shape[1] + n_estimators = gbrt.estimators_.shape[0] + pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, + order='C') + for stage in range(n_estimators): + for k in range(n_trees_per_stage): + tree = gbrt.estimators_[stage, k].tree_ + _partial_dependence_tree(tree, grid, target_variables, + gbrt.learning_rate, pdp[k]) + + return pdp, axes From 9a09888e27fec5929195954d327c3a74f84b542c Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 20 Oct 2015 19:34:36 -0700 Subject: [PATCH 02/23] add init --- sklearn/__init__.py | 7 ++++--- sklearn/partial_dependence.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index dbb7862d8839e..1fa157fb68d4d 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -71,9 +71,10 @@ 'isotonic', 'kernel_approximation', 'kernel_ridge', 'learning_curve', 'linear_model', 'manifold', 'metrics', 'mixture', 'model_selection', 'multiclass', 'multioutput', - 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', - 'preprocessing', 'random_projection', 'semi_supervised', - 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', + 'naive_bayes', 'neighbors', 'neural_network', + 'partial_dependence', 'pipeline', 
'preprocessing', + 'random_projection', 'semi_supervised', 'svm', 'tree', + 'discriminant_analysis', 'impute', 'compose', # Non-modules: 'clone', 'get_config', 'set_config', 'config_context'] diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index f971978dbd7c7..7646d51743cf4 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -1,4 +1,4 @@ -"""Partial dependence plots for tree ensembles. """ +"""Partial dependence plots for regressors and classifiers. """ # Authors: Peter Prettenhofer # License: BSD 3 clause From e714e16d56fd2e0a7125a78f1793378bdb58d408 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sun, 1 Nov 2015 11:36:15 -0800 Subject: [PATCH 03/23] implement exact and estimated methods --- sklearn/partial_dependence.py | 461 ++++++++++++++++++++++++++++++++-- 1 file changed, 435 insertions(+), 26 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 7646d51743cf4..0afdfe06cf819 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -1,6 +1,7 @@ -"""Partial dependence plots for regressors and classifiers. """ +"""Partial dependence plots for regression and classification models. """ # Authors: Peter Prettenhofer +# Trevor Stephens # License: BSD 3 clause from itertools import count @@ -19,6 +20,10 @@ from .base import ClassifierMixin, RegressorMixin from .ensemble._gradient_boosting import _partial_dependence_tree from .ensemble.gradient_boosting import BaseGradientBoosting +from .ensemble.forest import ForestRegressor + + +__all__ = ['partial_dependence', 'plot_partial_dependence'] def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): @@ -69,29 +74,147 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def partial_dependence(gbrt, target_variables, grid=None, X=None, - percentiles=(0.05, 0.95), grid_resolution=100): +def _exact_partial_dependence(est, target_variables, grid, X): + """Calculate the partial dependence of ``target_variables``. + + The function will be calculated by calling the ``predict_proba`` method of + ``est`` for classification or ``predict`` for regression on ``X`` for every + point in the grid. + + Parameters + ---------- + est : BaseEstimator + A fitted classification or regression model. + target_variables : array-like, dtype=int + The target features for which the partial dependency should be + computed (size should be smaller than 3 for visual renderings). + grid : array-like, shape=(n_points, len(target_variables)) + The grid of ``target_variables`` values for which the + partial dependency should be evaluated (either ``grid`` or ``X`` + must be specified). + X : array-like, shape=(n_samples, n_features) + The data on which ``est`` was trained. + + Returns + ------- + pdp : array, shape=(n_classes, n_points) + The partial dependence function evaluated on the ``grid``. + For regression and binary classification ``n_classes==1``. 
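+
+    Notes
+    -----
+    Per grid point this amounts to the following sketch (hypothetical toy
+    data; the regressor branch only, ignoring the log-probability handling
+    used for classifiers):
+
+    >>> import numpy as np
+    >>> from sklearn.ensemble import RandomForestRegressor
+    >>> X = np.array([[0., 1.], [1., 0.], [2., 1.]])
+    >>> est = RandomForestRegressor(random_state=0).fit(X, [0., 1., 2.])
+    >>> X_eval = X.copy()
+    >>> X_eval[:, 0] = 1.5  # clamp the target feature to one grid value
+    >>> pdp_point = np.mean(est.predict(X_eval))  # average over all rows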
+ """ + n_samples = X.shape[0] + pdp = [] + for row in range(grid.shape[0]): + X_eval = X.copy() + for i, variable in enumerate(target_variables): + X_eval[:, variable] = np.repeat(grid[row, i], n_samples) + if isinstance(est, RegressorMixin): + try: + pdp.append(np.mean(est.predict(X_eval))) + except: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + elif isinstance(est, ClassifierMixin): + try: + pdp_row = est.predict_proba(X_eval) + except: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + pdp_row = np.log(np.clip(pdp_row, 1e-16, 1)) + pdp_row = np.subtract(pdp_row, + np.mean(pdp_row, 1)[:, np.newaxis]) + pdp.append(np.mean(pdp_row, 0)) + else: + raise ValueError('est must be a fitted regressor or classifier ' + 'model.') + pdp = np.array(pdp).transpose() + if pdp.shape[0] == 2: + # Binary classification + pdp = pdp[1, :][np.newaxis] + elif len(pdp.shape) == 1: + # Regression + pdp = pdp[np.newaxis] + return pdp + + +def _estimated_partial_dependence(est, target_variables, grid, X): + """Calculate the partial dependence of ``target_variables``. + + The function will be calculated by calling the ``predict_proba`` method of + ``est`` for classification or ``predict`` for regression on the mean of + ``X``. + + Parameters + ---------- + est : BaseEstimator + A fitted classification or regression model. + target_variables : array-like, dtype=int + The target features for which the partial dependency should be + computed (size should be smaller than 3 for visual renderings). + grid : array-like, shape=(n_points, len(target_variables)) + The grid of ``target_variables`` values for which the + partial dependency should be evaluated (either ``grid`` or ``X`` + must be specified). + X : array-like, shape=(n_samples, n_features) + The data on which ``est`` was trained. + + Returns + ------- + pdp : array, shape=(n_classes, n_points) + The partial dependence function evaluated on the ``grid``. + For regression and binary classification ``n_classes==1``. + """ + n_samples = grid.shape[0] + X_eval = np.tile(X.mean(0), [n_samples, 1]) + for i, variable in enumerate(target_variables): + X_eval[:, variable] = grid[:, i] + if isinstance(est, RegressorMixin): + try: + pdp = est.predict(X_eval) + except: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + pdp = pdp[np.newaxis] + elif isinstance(est, ClassifierMixin): + try: + pdp = est.predict_proba(X_eval) + except: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + pdp = np.log(np.clip(pdp, 1e-16, 1)) + pdp = np.subtract(pdp, np.mean(pdp, 1)[:, np.newaxis]) + pdp = pdp.transpose() + else: + raise ValueError('est must be a fitted regressor or classifier model.') + if pdp.shape[0] == 2: + # Binary classification + pdp = pdp[1, :][np.newaxis] + return pdp + + +def partial_dependence(est, target_variables, grid=None, X=None, + percentiles=(0.05, 0.95), grid_resolution=100, + method=None): """Partial dependence of ``target_variables``. Partial dependence plots show the dependence between the joint values of the ``target_variables`` and the function represented - by the ``gbrt``. + by the ``est``. Read more in the :ref:`User Guide `. Parameters ---------- - gbrt : BaseGradientBoosting - A fitted gradient boosting model. + est : BaseEstimator + A fitted classification or regression model. 
    target_variables : array-like, dtype=int
-        The target features for which the partial dependecy should be
+        The target features for which the partial dependency should be
         computed (size should be smaller than 3 for visual renderings).
     grid : array-like, shape=(n_points, len(target_variables))
         The grid of ``target_variables`` values for which the
-        partial dependecy should be evaluated (either ``grid`` or ``X``
+        partial dependency should be evaluated (either ``grid`` or ``X``
         must be specified).
     X : array-like, shape=(n_samples, n_features)
-        The data on which ``gbrt`` was trained. It is used to generate
+        The data on which ``est`` was trained. It is used to generate
         a ``grid`` for the ``target_variables``. The ``grid`` comprises
         ``grid_resolution`` equally spaced points between the two
         ``percentiles``.
@@ -100,6 +223,22 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None,
         for the ``grid``. Only if ``X`` is not None.
     grid_resolution : int, default=100
         The number of equally spaced points on the ``grid``.
+    method : {'recursion', 'exact', 'estimated', None}, optional (default=None)
+        The method to use to calculate the partial dependence function:
+
+        - If 'recursion', the underlying trees of ``est`` will be recursed to
+          calculate the function. Only supported for BaseGradientBoosting and
+          ForestRegressor.
+        - If 'exact', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or ``predict``
+          for regression on ``X`` for every point in the grid. To speed up this
+          method, you can use a subset of ``X`` or a coarser grid.
+        - If 'estimated', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or ``predict``
+          for regression on the mean of ``X``.
+        - If None, then 'recursion' will be used if ``est`` is
+          BaseGradientBoosting or ForestRegressor, and 'exact' used for other
+          estimators.

     Returns
     -------
     pdp : array, shape=(n_classes, n_points)
         The partial dependence function evaluated on the ``grid``.
         For regression and binary classification ``n_classes==1``.
     axes : seq of ndarray or None
         The axes with which the grid has been created or None if
         the grid has been given.

     Examples
     --------
     >>> samples = [[0, 0, 2], [1, 0, 0]]
     >>> labels = [0, 1]
     >>> from sklearn.ensemble import GradientBoostingClassifier
     >>> gb = GradientBoostingClassifier(random_state=0).fit(samples, labels)
     >>> kwargs = dict(X=samples, percentiles=(0, 1), grid_resolution=2)
     >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP
     (array([[-4.52..., 4.52...]]), [array([ 0., 1.])])
     """
-    if not isinstance(gbrt, BaseGradientBoosting):
-        raise ValueError('gbrt has to be an instance of BaseGradientBoosting')
-    if gbrt.estimators_.shape[0] == 0:
-        raise ValueError('Call %s.fit before partial_dependence' %
-                         gbrt.__class__.__name__)
+    if method is None:
+        if isinstance(est, (BaseGradientBoosting, ForestRegressor)):
+            method = 'recursion'
+        else:
+            method = 'exact'
+    if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and
+            method == 'recursion'):
+        raise ValueError('est has to be an instance of BaseGradientBoosting or'
+                         ' ForestRegressor for the "recursion" method. 
Try ' + 'using method="exact" or "estimated".') + if (method != 'recursion' and + not hasattr(est, 'predict_proba') and + isinstance(est, ClassifierMixin)): + raise ValueError('est requires a predict_proba method for ' + 'method="exact" or "estimated".') + if method == 'recursion': + if len(est.estimators_) == 0: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + if isinstance(est, BaseGradientBoosting): + n_features = est.n_features + else: + n_features = est.n_features_ + elif X is None: + raise ValueError('X is required for method="exact" or "estimated".') + else: + n_features = X.shape[1] if (grid is None and X is None) or (grid is not None and X is not None): raise ValueError('Either grid or X must be specified') target_variables = np.asarray(target_variables, dtype=np.int32, order='C').ravel() - if any([not (0 <= fx < gbrt.n_features) for fx in target_variables]): + if any([not (0 <= fx < n_features) for fx in target_variables]): raise ValueError('target_variables must be in [0, %d]' - % (gbrt.n_features - 1)) + % (n_features - 1)) if X is not None: X = check_array(X, dtype=DTYPE, order='C') @@ -141,7 +302,7 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, grid_resolution) else: assert grid is not None - # dont return axes if grid is given + # don't return axes if grid is given axes = None # grid must be 2d if grid.ndim == 1: @@ -152,14 +313,262 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, grid = np.asarray(grid, dtype=DTYPE, order='C') assert grid.shape[1] == target_variables.shape[0] - n_trees_per_stage = gbrt.estimators_.shape[1] - n_estimators = gbrt.estimators_.shape[0] - pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, - order='C') - for stage in range(n_estimators): - for k in range(n_trees_per_stage): - tree = gbrt.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, target_variables, - gbrt.learning_rate, pdp[k]) + if method == 'recursion': + if isinstance(est, BaseGradientBoosting): + n_trees_per_stage = est.estimators_.shape[1] + n_estimators = est.estimators_.shape[0] + learning_rate = est.learning_rate + else: + n_trees_per_stage = 1 + n_estimators = len(est.estimators_) + learning_rate = 1. + pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, + order='C') + for stage in range(n_estimators): + for k in range(n_trees_per_stage): + if isinstance(est, BaseGradientBoosting): + tree = est.estimators_[stage, k].tree_ + else: + tree = est.estimators_[stage].tree_ + _partial_dependence_tree(tree, grid, target_variables, + learning_rate, pdp[k]) + if isinstance(est, ForestRegressor): + pdp /= n_estimators + elif method == 'exact': + pdp = _exact_partial_dependence(est, target_variables, grid, X) + elif method == 'estimated': + pdp = _estimated_partial_dependence(est, target_variables, grid, X) + else: + raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' + '"estimated", or None.' % method) return pdp, axes + + +def plot_partial_dependence(gbrt, X, features, feature_names=None, + label=None, n_cols=3, grid_resolution=100, + percentiles=(0.05, 0.95), n_jobs=1, + verbose=0, ax=None, line_kw=None, + contour_kw=None, **fig_kw): + """Partial dependence plots for ``features``. + + The ``len(features)`` plots are arranged in a grid with ``n_cols`` + columns. Two-way partial dependence plots are plotted as contour + plots. + + Read more in the :ref:`User Guide `. 
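+
+    For a model f and a single target feature x_j, the quantity plotted is
+    Friedman's partial dependence, here approximated by averaging over the
+    complement features of the training data (a sketch of the estimate, not
+    the exact implementation):
+
+        pd(x_j) = (1 / n) * sum_i f(x_j, x_complement_i)
+
+    where x_complement_i holds the values of the remaining features for
+    sample i.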
+ + Parameters + ---------- + gbrt : BaseGradientBoosting + A fitted gradient boosting model. + X : array-like, shape=(n_samples, n_features) + The data on which ``gbrt`` was trained. + features : seq of tuples or ints + If seq[i] is an int or a tuple with one int value, a one-way + PDP is created; if seq[i] is a tuple of two ints, a two-way + PDP is created. + feature_names : seq of str + Name of each feature; feature_names[i] holds + the name of the feature with index i. + label : object + The class label for which the PDPs should be computed. + Only if gbrt is a multi-class model. Must be in ``gbrt.classes_``. + n_cols : int + The number of columns in the grid plot (default: 3). + percentiles : (low, high), default=(0.05, 0.95) + The lower and upper percentile used to create the extreme values + for the PDP axes. + grid_resolution : int, default=100 + The number of equally spaced points on the axes. + n_jobs : int + The number of CPUs to use to compute the PDs. -1 means 'all CPUs'. + Defaults to 1. + verbose : int + Verbose output during PD computations. Defaults to 0. + ax : Matplotlib axis object, default None + An axis object onto which the plots will be drawn. + line_kw : dict + Dict with keywords passed to the ``pylab.plot`` call. + For one-way partial dependence plots. + contour_kw : dict + Dict with keywords passed to the ``pylab.plot`` call. + For two-way partial dependence plots. + fig_kw : dict + Dict with keywords passed to the figure() call. + Note that all keywords not recognized above will be automatically + included here. + + Returns + ------- + fig : figure + The Matplotlib Figure object. + axs : seq of Axis objects + A seq of Axis objects, one for each subplot. + + Examples + -------- + >>> from sklearn.datasets import make_friedman1 + >>> from sklearn.ensemble import GradientBoostingRegressor + >>> X, y = make_friedman1() + >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y) + >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP + ... 
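+
+    For a multi-class classifier a ``label`` from ``gbrt.classes_`` has to
+    be chosen as well (an illustrative call, assuming iris-like data):
+
+    >>> from sklearn.datasets import load_iris
+    >>> from sklearn.ensemble import GradientBoostingClassifier
+    >>> iris = load_iris()
+    >>> gbc = GradientBoostingClassifier(n_estimators=10, random_state=0)
+    >>> gbc = gbc.fit(iris.data, iris.target)
+    >>> fig, axs = plot_partial_dependence(gbc, iris.data, [0, 1],
+    ...                                    label=0)  #doctest: +SKIP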
+ """ + import matplotlib.pyplot as plt + from matplotlib import transforms + from matplotlib.ticker import MaxNLocator + from matplotlib.ticker import ScalarFormatter + + if not isinstance(gbrt, BaseGradientBoosting): + raise ValueError('gbrt has to be an instance of BaseGradientBoosting') + if gbrt.estimators_.shape[0] == 0: + raise ValueError('Call %s.fit before partial_dependence' % + gbrt.__class__.__name__) + + # set label_idx for multi-class GBRT + if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: + if label is None: + raise ValueError('label is not given for multi-class PDP') + label_idx = np.searchsorted(gbrt.classes_, label) + if gbrt.classes_[label_idx] != label: + raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) + else: + # regression and binary classification + label_idx = 0 + + X = check_array(X, dtype=DTYPE, order='C') + if gbrt.n_features != X.shape[1]: + raise ValueError('X.shape[1] does not match gbrt.n_features') + + if line_kw is None: + line_kw = {'color': 'green'} + if contour_kw is None: + contour_kw = {} + + # convert feature_names to list + if feature_names is None: + # if not feature_names use fx indices as name + feature_names = [str(i) for i in range(gbrt.n_features)] + elif isinstance(feature_names, np.ndarray): + feature_names = feature_names.tolist() + + def convert_feature(fx): + if isinstance(fx, six.string_types): + try: + fx = feature_names.index(fx) + except ValueError: + raise ValueError('Feature %s not in feature_names' % fx) + return fx + + # convert features into a seq of int tuples + tmp_features = [] + for fxs in features: + if isinstance(fxs, (numbers.Integral,) + six.string_types): + fxs = (fxs,) + try: + fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) + except TypeError: + raise ValueError('features must be either int, str, or tuple ' + 'of int/str') + if not (1 <= np.size(fxs) <= 2): + raise ValueError('target features must be either one or two') + + tmp_features.append(fxs) + + features = tmp_features + + names = [] + try: + for fxs in features: + l = [] + # explicit loop so "i" is bound for exception below + for i in fxs: + l.append(feature_names[i]) + names.append(l) + except IndexError: + raise ValueError('features[i] must be in [0, n_features) ' + 'but was %d' % i) + + # compute PD functions + pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( + delayed(partial_dependence)(gbrt, fxs, X=X, + grid_resolution=grid_resolution, + percentiles=percentiles) + for fxs in features) + + # get global min and max values of PD grouped by plot type + pdp_lim = {} + for pdp, axes in pd_result: + min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() + n_fx = len(axes) + old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) + min_pd = min(min_pd, old_min_pd) + max_pd = max(max_pd, old_max_pd) + pdp_lim[n_fx] = (min_pd, max_pd) + + # create contour levels for two-way plots + if 2 in pdp_lim: + Z_level = np.linspace(*pdp_lim[2], num=8) + + if ax is None: + fig = plt.figure(**fig_kw) + else: + fig = ax.get_figure() + fig.clear() + + n_cols = min(n_cols, len(features)) + n_rows = int(np.ceil(len(features) / float(n_cols))) + axs = [] + for i, fx, name, (pdp, axes) in zip(count(), features, names, + pd_result): + ax = fig.add_subplot(n_rows, n_cols, i + 1) + + if len(axes) == 1: + ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) + else: + # make contour plot + assert len(axes) == 2 + XX, YY = np.meshgrid(axes[0], axes[1]) + Z = pdp[label_idx].reshape(list(map(np.size, axes))).T + CS = 
ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, + colors='k') + ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], + vmin=Z_level[0], alpha=0.75, **contour_kw) + ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) + + # plot data deciles + axes labels + deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transData, + ax.transAxes) + ylim = ax.get_ylim() + ax.vlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_xlabel(name[0]) + ax.set_ylim(ylim) + + # prevent x-axis ticks from overlapping + ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) + tick_formatter = ScalarFormatter() + tick_formatter.set_powerlimits((-3, 4)) + ax.xaxis.set_major_formatter(tick_formatter) + + if len(axes) > 1: + # two-way PDP - y-axis deciles + labels + deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) + trans = transforms.blended_transform_factory(ax.transAxes, + ax.transData) + xlim = ax.get_xlim() + ax.hlines(deciles, [0], 0.05, transform=trans, color='k') + ax.set_ylabel(name[1]) + # hline erases xlim + ax.set_xlim(xlim) + else: + ax.set_ylabel('Partial dependence') + + if len(axes) == 1: + ax.set_ylim(pdp_lim[1]) + axs.append(ax) + + fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, + hspace=0.3) + return fig, axs From 19ed28ebf670c1be1aa64a1c9a089336f7f21d86 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Thu, 5 Nov 2015 22:08:18 -0800 Subject: [PATCH 04/23] support for Pipeline and GridSearchCV type estimators --- sklearn/partial_dependence.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 0afdfe06cf819..262f909ff100b 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -17,7 +17,6 @@ from .utils import check_array from .tree._tree import DTYPE -from .base import ClassifierMixin, RegressorMixin from .ensemble._gradient_boosting import _partial_dependence_tree from .ensemble.gradient_boosting import BaseGradientBoosting from .ensemble.forest import ForestRegressor @@ -74,7 +73,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _exact_partial_dependence(est, target_variables, grid, X): +def _exact_partial_dependence(est, target_variables, grid, X, ouput=None): """Calculate the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of @@ -94,6 +93,8 @@ def _exact_partial_dependence(est, target_variables, grid, X): must be specified). X : array-like, shape=(n_samples, n_features) The data on which ``est`` was trained. + output : int, optional (default=None) + The output index to use for multi-output estimators. 
Returns ------- @@ -107,13 +108,13 @@ def _exact_partial_dependence(est, target_variables, grid, X): X_eval = X.copy() for i, variable in enumerate(target_variables): X_eval[:, variable] = np.repeat(grid[row, i], n_samples) - if isinstance(est, RegressorMixin): + if est._estimator_type == 'regressor': try: pdp.append(np.mean(est.predict(X_eval))) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - elif isinstance(est, ClassifierMixin): + elif est._estimator_type == 'classifier': try: pdp_row = est.predict_proba(X_eval) except: @@ -136,7 +137,7 @@ def _exact_partial_dependence(est, target_variables, grid, X): return pdp -def _estimated_partial_dependence(est, target_variables, grid, X): +def _estimated_partial_dependence(est, target_variables, grid, X, ouput=None): """Calculate the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of @@ -156,6 +157,8 @@ def _estimated_partial_dependence(est, target_variables, grid, X): must be specified). X : array-like, shape=(n_samples, n_features) The data on which ``est`` was trained. + output : int, optional (default=None) + The output index to use for multi-output estimators. Returns ------- @@ -167,14 +170,14 @@ def _estimated_partial_dependence(est, target_variables, grid, X): X_eval = np.tile(X.mean(0), [n_samples, 1]) for i, variable in enumerate(target_variables): X_eval[:, variable] = grid[:, i] - if isinstance(est, RegressorMixin): + if est._estimator_type == 'regressor': try: pdp = est.predict(X_eval) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) pdp = pdp[np.newaxis] - elif isinstance(est, ClassifierMixin): + elif est._estimator_type == 'classifier': try: pdp = est.predict_proba(X_eval) except: @@ -191,7 +194,7 @@ def _estimated_partial_dependence(est, target_variables, grid, X): return pdp -def partial_dependence(est, target_variables, grid=None, X=None, +def partial_dependence(est, target_variables, grid=None, X=None, output=None, percentiles=(0.05, 0.95), grid_resolution=100, method=None): """Partial dependence of ``target_variables``. @@ -218,6 +221,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, a ``grid`` for the ``target_variables``. The ``grid`` comprises ``grid_resolution`` equally spaced points between the two ``percentiles``. + output : int, optional (default=None) + The output index to use for multi-output estimators. percentiles : (low, high), default=(0.05, 0.95) The lower and upper percentile used create the extreme values for the ``grid``. Only if ``X`` is not None. @@ -269,11 +274,12 @@ def partial_dependence(est, target_variables, grid=None, X=None, raise ValueError('est has to be an instance of BaseGradientBoosting or' ' ForestRegressor for the "recursion" method. 
Try ' 'using method="exact" or "estimated".') - if (method != 'recursion' and - not hasattr(est, 'predict_proba') and - isinstance(est, ClassifierMixin)): + if (not hasattr(est, '_estimator_type') or + est._estimator_type not in ('classifier', 'regressor')): + raise ValueError('est must be a fitted regressor or classifier model.') + if method != 'recursion' and est._estimator_type == 'classifier': raise ValueError('est requires a predict_proba method for ' - 'method="exact" or "estimated".') + 'method="exact" or "estimated" for classification.') if method == 'recursion': if len(est.estimators_) == 0: raise ValueError('Call %s.fit before partial_dependence' % From 6fafc5e585369268fd1b100b470e3ad8687a1c9e Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 16 Nov 2015 14:56:11 -0800 Subject: [PATCH 05/23] add multioutput support --- sklearn/partial_dependence.py | 39 ++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 262f909ff100b..9dacd3d29631b 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -73,7 +73,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _exact_partial_dependence(est, target_variables, grid, X, ouput=None): +def _exact_partial_dependence(est, target_variables, grid, X, output=None): """Calculate the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of @@ -110,16 +110,29 @@ def _exact_partial_dependence(est, target_variables, grid, X, ouput=None): X_eval[:, variable] = np.repeat(grid[row, i], n_samples) if est._estimator_type == 'regressor': try: - pdp.append(np.mean(est.predict(X_eval))) + pdp_row = est.predict(X_eval) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if pdp_row.ndim != 1 and pdp_row.shape[1] != 1: + # Multi-output + if not 0 <= output < pdp_row.shape[1]: + raise ValueError('Valid output must be specified for ' + 'multi-output models.') + pdp_row = pdp_row[:, output] + pdp.append(np.mean(pdp_row)) elif est._estimator_type == 'classifier': try: pdp_row = est.predict_proba(X_eval) except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if isinstance(pdp_row, list): + # Multi-output + if not 0 <= output < len(pdp_row): + raise ValueError('Valid output must be specified for ' + 'multi-output models.') + pdp_row = pdp_row[output] pdp_row = np.log(np.clip(pdp_row, 1e-16, 1)) pdp_row = np.subtract(pdp_row, np.mean(pdp_row, 1)[:, np.newaxis]) @@ -137,7 +150,7 @@ def _exact_partial_dependence(est, target_variables, grid, X, ouput=None): return pdp -def _estimated_partial_dependence(est, target_variables, grid, X, ouput=None): +def _estimated_partial_dependence(est, target_variables, grid, X, output=None): """Calculate the partial dependence of ``target_variables``. 
The function will be calculated by calling the ``predict_proba`` method of @@ -176,6 +189,15 @@ def _estimated_partial_dependence(est, target_variables, grid, X, ouput=None): except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if pdp.ndim != 1 and pdp.shape[1] == 1: + # Column output + pdp = pdp.ravel() + if pdp.ndim != 1 and pdp.shape[1] != 1: + # Multi-output + if not 0 <= output < pdp.shape[1]: + raise ValueError('Valid output must be specified for ' + 'multi-output models.') + pdp = pdp[:, output] pdp = pdp[np.newaxis] elif est._estimator_type == 'classifier': try: @@ -183,6 +205,12 @@ def _estimated_partial_dependence(est, target_variables, grid, X, ouput=None): except: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if isinstance(pdp, list): + # Multi-output + if not 0 <= output < len(pdp): + raise ValueError('Valid output must be specified for ' + 'multi-output models.') + pdp = pdp[output] pdp = np.log(np.clip(pdp, 1e-16, 1)) pdp = np.subtract(pdp, np.mean(pdp, 1)[:, np.newaxis]) pdp = pdp.transpose() @@ -341,9 +369,10 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if isinstance(est, ForestRegressor): pdp /= n_estimators elif method == 'exact': - pdp = _exact_partial_dependence(est, target_variables, grid, X) + pdp = _exact_partial_dependence(est, target_variables, grid, X, output) elif method == 'estimated': - pdp = _estimated_partial_dependence(est, target_variables, grid, X) + pdp = _estimated_partial_dependence(est, target_variables, grid, X, + output) else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' % method) From 152a190927d60a90a9c59c8f1f343b3d092758e1 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 22 Jul 2017 15:54:14 +1000 Subject: [PATCH 06/23] rebase and catch up to #6762, #7673, #7846 --- sklearn/partial_dependence.py | 37 +++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 9dacd3d29631b..3faa3f54f04b9 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -20,6 +20,7 @@ from .ensemble._gradient_boosting import _partial_dependence_tree from .ensemble.gradient_boosting import BaseGradientBoosting from .ensemble.forest import ForestRegressor +from .exceptions import NotFittedError __all__ = ['partial_dependence', 'plot_partial_dependence'] @@ -111,7 +112,7 @@ def _exact_partial_dependence(est, target_variables, grid, X, output=None): if est._estimator_type == 'regressor': try: pdp_row = est.predict(X_eval) - except: + except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if pdp_row.ndim != 1 and pdp_row.shape[1] != 1: @@ -124,7 +125,7 @@ def _exact_partial_dependence(est, target_variables, grid, X, output=None): elif est._estimator_type == 'classifier': try: pdp_row = est.predict_proba(X_eval) - except: + except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if isinstance(pdp_row, list): @@ -186,7 +187,7 @@ def _estimated_partial_dependence(est, target_variables, grid, X, output=None): if est._estimator_type == 'regressor': try: pdp = est.predict(X_eval) - except: + except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if pdp.ndim != 1 and pdp.shape[1] == 1: @@ -202,7 +203,7 @@ def 
_estimated_partial_dependence(est, target_variables, grid, X, output=None): elif est._estimator_type == 'classifier': try: pdp = est.predict_proba(X_eval) - except: + except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) if isinstance(pdp, list): @@ -252,7 +253,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, output : int, optional (default=None) The output index to use for multi-output estimators. percentiles : (low, high), default=(0.05, 0.95) - The lower and upper percentile used create the extreme values + The lower and upper percentile used to create the extreme values for the ``grid``. Only if ``X`` is not None. grid_resolution : int, default=100 The number of equally spaced points on the ``grid``. @@ -312,10 +313,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if len(est.estimators_) == 0: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - if isinstance(est, BaseGradientBoosting): - n_features = est.n_features - else: - n_features = est.n_features_ + n_features = est.n_features_ elif X is None: raise ValueError('X is required for method="exact" or "estimated".') else: @@ -399,10 +397,14 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, A fitted gradient boosting model. X : array-like, shape=(n_samples, n_features) The data on which ``gbrt`` was trained. - features : seq of tuples or ints + features : seq of ints, strings, or tuples of ints or strings If seq[i] is an int or a tuple with one int value, a one-way PDP is created; if seq[i] is a tuple of two ints, a two-way PDP is created. + If feature_names is specified and seq[i] is an int, seq[i] + must be < len(feature_names). + If seq[i] is a string, feature_names must be specified, and + seq[i] must be in feature_names. feature_names : seq of str Name of each feature; feature_names[i] holds the name of the feature with index i. @@ -424,10 +426,10 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, ax : Matplotlib axis object, default None An axis object onto which the plots will be drawn. line_kw : dict - Dict with keywords passed to the ``pylab.plot`` call. + Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. For one-way partial dependence plots. contour_kw : dict - Dict with keywords passed to the ``pylab.plot`` call. + Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. For two-way partial dependence plots. fig_kw : dict Dict with keywords passed to the figure() call. 
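        (For instance, an illustrative call such as
        ``plot_partial_dependence(gbrt, X, [0], figsize=(8, 6))`` would route
        ``figsize`` through ``fig_kw`` to the ``figure()`` call.)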
@@ -473,8 +475,8 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, label_idx = 0 X = check_array(X, dtype=DTYPE, order='C') - if gbrt.n_features != X.shape[1]: - raise ValueError('X.shape[1] does not match gbrt.n_features') + if gbrt.n_features_ != X.shape[1]: + raise ValueError('X.shape[1] does not match gbrt.n_features_') if line_kw is None: line_kw = {'color': 'green'} @@ -484,7 +486,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, # convert feature_names to list if feature_names is None: # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(gbrt.n_features)] + feature_names = [str(i) for i in range(gbrt.n_features_)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() @@ -522,8 +524,9 @@ def convert_feature(fx): l.append(feature_names[i]) names.append(l) except IndexError: - raise ValueError('features[i] must be in [0, n_features) ' - 'but was %d' % i) + raise ValueError('All entries of features must be less than ' + 'len(feature_names) = {0}, got {1}.' + .format(len(feature_names), i)) # compute PD functions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( From 2cdc5ea15d072601b21b96da23763c2168ec4169 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Wed, 26 Jul 2017 17:18:55 +1000 Subject: [PATCH 07/23] catch up on #9434 --- sklearn/partial_dependence.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 3faa3f54f04b9..b39d924294ebb 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -58,13 +58,13 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): raise ValueError('percentile values must be in [0, 1]') axes = [] + emp_percentiles = mquantiles(X, prob=percentiles, axis=0) for col in range(X.shape[1]): uniques = np.unique(X[:, col]) if uniques.shape[0] < grid_resolution: # feature has low resolution use unique vals axis = uniques else: - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) # create axis based on percentiles and grid resolution axis = np.linspace(emp_percentiles[0, col], emp_percentiles[1, col], From ba1f8daa9c245287d058db91ee63cc3667609b37 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Wed, 26 Jul 2017 18:44:48 +1000 Subject: [PATCH 08/23] initial update of plot_partial_dependence --- sklearn/partial_dependence.py | 72 ++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index b39d924294ebb..da33ec1202b79 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -378,9 +378,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, return pdp, axes -def plot_partial_dependence(gbrt, X, features, feature_names=None, +def plot_partial_dependence(est, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, - percentiles=(0.05, 0.95), n_jobs=1, + method=None, percentiles=(0.05, 0.95), n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots for ``features``. @@ -393,10 +393,10 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, Parameters ---------- - gbrt : BaseGradientBoosting - A fitted gradient boosting model. + est : BaseEstimator + A fitted classification or regression model. X : array-like, shape=(n_samples, n_features) - The data on which ``gbrt`` was trained. 
+        The data on which ``est`` was trained.
     features : seq of ints, strings, or tuples of ints or strings
         If seq[i] is an int or a tuple with one int value, a one-way
         PDP is created; if seq[i] is a tuple of two ints, a two-way
         PDP is created.
         If feature_names is specified and seq[i] is an int, seq[i]
         must be < len(feature_names).
         If seq[i] is a string, feature_names must be specified, and
         seq[i] must be in feature_names.
     feature_names : seq of str
         Name of each feature; feature_names[i] holds
         the name of the feature with index i.
     label : object
         The class label for which the PDPs should be computed.
-        Only if gbrt is a multi-class model. Must be in ``gbrt.classes_``.
+        Only if est is a multi-class model. Must be in ``est.classes_``.
     n_cols : int
         The number of columns in the grid plot (default: 3).
     percentiles : (low, high), default=(0.05, 0.95)
         The lower and upper percentile used to create the extreme values
         for the PDP axes.
     grid_resolution : int, default=100
         The number of equally spaced points on the axes.
+    method : {'recursion', 'exact', 'estimated', None}, optional (default=None)
+        The method to use to calculate the partial dependence function:
+
+        - If 'recursion', the underlying trees of ``est`` will be recursed to
+          calculate the function. Only supported for BaseGradientBoosting and
+          ForestRegressor.
+        - If 'exact', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or ``predict``
+          for regression on ``X`` for every point in the grid. To speed up this
+          method, you can use a subset of ``X`` or a coarser grid.
+        - If 'estimated', the function will be calculated by calling the
+          ``predict_proba`` method of ``est`` for classification or ``predict``
+          for regression on the mean of ``X``.
+        - If None, then 'recursion' will be used if ``est`` is
+          BaseGradientBoosting or ForestRegressor, and 'exact' used for other
+          estimators.
     n_jobs : int
         The number of CPUs to use to compute the PDs. -1 means 'all CPUs'.
         Defaults to 1.
     verbose : int
         Verbose output during PD computations. Defaults to 0.
     ax : Matplotlib axis object, default None
         An axis object onto which the plots will be drawn.
     line_kw : dict
         Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
         For one-way partial dependence plots.
     contour_kw : dict
         Dict with keywords passed to the ``matplotlib.pyplot.plot`` call.
         For two-way partial dependence plots.
     fig_kw : dict
         Dict with keywords passed to the figure() call.
         Note that all keywords not recognized above will be automatically
         included here.

     Returns
     -------
     fig : figure
         The Matplotlib Figure object.
     axs : seq of Axis objects
         A seq of Axis objects, one for each subplot.

     Examples
     --------
     >>> from sklearn.datasets import make_friedman1
     >>> from sklearn.ensemble import GradientBoostingRegressor
     >>> X, y = make_friedman1()
     >>> clf = GradientBoostingRegressor(n_estimators=10).fit(X, y)
     >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP
     ...
     """
     import matplotlib.pyplot as plt
     from matplotlib import transforms
     from matplotlib.ticker import MaxNLocator
     from matplotlib.ticker import ScalarFormatter

-    if not isinstance(gbrt, BaseGradientBoosting):
-        raise ValueError('gbrt has to be an instance of BaseGradientBoosting')
-    if gbrt.estimators_.shape[0] == 0:
-        raise ValueError('Call %s.fit before partial_dependence' %
-                         gbrt.__class__.__name__)
+    if method is None:
+        if isinstance(est, (BaseGradientBoosting, ForestRegressor)):
+            method = 'recursion'
+        else:
+            method = 'exact'
+    if (not isinstance(est, (BaseGradientBoosting, ForestRegressor)) and
+            method == 'recursion'):
+        raise ValueError('est has to be an instance of BaseGradientBoosting or'
+                         ' ForestRegressor for the "recursion" method. 
Try ' + 'using method="exact" or "estimated".') + if (not hasattr(est, '_estimator_type') or + est._estimator_type not in ('classifier', 'regressor')): + raise ValueError('est must be a fitted regressor or classifier model.') + if method != 'recursion' and est._estimator_type == 'classifier': + raise ValueError('est requires a predict_proba method for ' + 'method="exact" or "estimated" for classification.') + if method == 'recursion': + if len(est.estimators_) == 0: + raise ValueError('Call %s.fit before partial_dependence' % + est.__class__.__name__) + n_features = est.n_features_ + elif X is None: + raise ValueError('X is required for method="exact" or "estimated".') + else: + n_features = X.shape[1] # set label_idx for multi-class GBRT - if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: + if hasattr(est, 'classes_') and np.size(est.classes_) > 2: if label is None: raise ValueError('label is not given for multi-class PDP') - label_idx = np.searchsorted(gbrt.classes_, label) - if gbrt.classes_[label_idx] != label: + label_idx = np.searchsorted(est.classes_, label) + if est.classes_[label_idx] != label: raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) else: # regression and binary classification label_idx = 0 X = check_array(X, dtype=DTYPE, order='C') - if gbrt.n_features_ != X.shape[1]: - raise ValueError('X.shape[1] does not match gbrt.n_features_') + if est.n_features_ != X.shape[1]: + raise ValueError('X.shape[1] does not match est.n_features_') if line_kw is None: line_kw = {'color': 'green'} @@ -486,7 +522,7 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, # convert feature_names to list if feature_names is None: # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(gbrt.n_features_)] + feature_names = [str(i) for i in range(est.n_features_)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() @@ -530,7 +566,7 @@ def convert_feature(fx): # compute PD functions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(gbrt, fxs, X=X, + delayed(partial_dependence)(est, fxs, X=X, method=method, grid_resolution=grid_resolution, percentiles=percentiles) for fxs in features) From 1b1d8f0304d250e84bdf61625f5bf8309a073115 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 12 Aug 2017 15:21:40 +1000 Subject: [PATCH 09/23] deprecate ensemble.partial_dependence --- sklearn/ensemble/partial_dependence.py | 294 ++---------------- .../ensemble/tests/test_partial_dependence.py | 37 +++ sklearn/partial_dependence.py | 13 +- sklearn/tests/test_partial_dependence.py | 206 ++++++++++++ 4 files changed, 283 insertions(+), 267 deletions(-) create mode 100644 sklearn/tests/test_partial_dependence.py diff --git a/sklearn/ensemble/partial_dependence.py b/sklearn/ensemble/partial_dependence.py index e8bfc2110bb90..63d397d86c8a0 100644 --- a/sklearn/ensemble/partial_dependence.py +++ b/sklearn/ensemble/partial_dependence.py @@ -3,70 +3,9 @@ # Authors: Peter Prettenhofer # License: BSD 3 clause -from itertools import count -import numbers - -import numpy as np -from scipy.stats.mstats import mquantiles - -from ..utils.extmath import cartesian -from ..externals.joblib import Parallel, delayed -from ..externals import six -from ..externals.six.moves import map, range, zip -from ..utils import check_array -from ..utils.validation import check_is_fitted -from ..tree._tree import DTYPE - -from ._gradient_boosting import _partial_dependence_tree -from .gradient_boosting 
import BaseGradientBoosting - - -def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): - """Generate a grid of points based on the ``percentiles of ``X``. - - The grid is generated by placing ``grid_resolution`` equally - spaced points between the ``percentiles`` of each column - of ``X``. - - Parameters - ---------- - X : ndarray - The data - percentiles : tuple of floats - The percentiles which are used to construct the extreme - values of the grid axes. - grid_resolution : int - The number of equally spaced points that are placed - on the grid. - - Returns - ------- - grid : ndarray - All data points on the grid; ``grid.shape[1] == X.shape[1]`` - and ``grid.shape[0] == grid_resolution * X.shape[1]``. - axes : seq of ndarray - The axes with which the grid has been created. - """ - if len(percentiles) != 2: - raise ValueError('percentile must be tuple of len 2') - if not all(0. <= x <= 1. for x in percentiles): - raise ValueError('percentile values must be in [0, 1]') - - axes = [] - emp_percentiles = mquantiles(X, prob=percentiles, axis=0) - for col in range(X.shape[1]): - uniques = np.unique(X[:, col]) - if uniques.shape[0] < grid_resolution: - # feature has low resolution use unique vals - axis = uniques - else: - # create axis based on percentiles and grid resolution - axis = np.linspace(emp_percentiles[0, col], - emp_percentiles[1, col], - num=grid_resolution, endpoint=True) - axes.append(axis) - - return cartesian(axes), axes +import warnings +from ..partial_dependence import partial_dependence as new_pd +from ..partial_dependence import plot_partial_dependence as new_ppd def partial_dependence(gbrt, target_variables, grid=None, X=None, @@ -120,47 +59,17 @@ def partial_dependence(gbrt, target_variables, grid=None, X=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') - if (grid is None and X is None) or (grid is not None and X is not None): - raise ValueError('Either grid or X must be specified') - - target_variables = np.asarray(target_variables, dtype=np.int32, - order='C').ravel() - - if any([not (0 <= fx < gbrt.n_features_) for fx in target_variables]): - raise ValueError('target_variables must be in [0, %d]' - % (gbrt.n_features_ - 1)) - - if X is not None: - X = check_array(X, dtype=DTYPE, order='C') - grid, axes = _grid_from_X(X[:, target_variables], percentiles, - grid_resolution) - else: - assert grid is not None - # dont return axes if grid is given - axes = None - # grid must be 2d - if grid.ndim == 1: - grid = grid[:, np.newaxis] - if grid.ndim != 2: - raise ValueError('grid must be 2d but is %dd' % grid.ndim) - - grid = np.asarray(grid, dtype=DTYPE, order='C') - assert grid.shape[1] == target_variables.shape[0] - - n_trees_per_stage = gbrt.estimators_.shape[1] - n_estimators = gbrt.estimators_.shape[0] - pdp = np.zeros((n_trees_per_stage, grid.shape[0],), dtype=np.float64, - order='C') - for stage in range(n_estimators): - for k in range(n_trees_per_stage): - tree = gbrt.estimators_[stage, k].tree_ - _partial_dependence_tree(tree, grid, target_variables, - gbrt.learning_rate, pdp[k]) - - return pdp, axes + warnings.warn("The function ensemble.partial_dependence has been moved to " + "partial_dependence in 0.20 and will be removed in 0.22.", + DeprecationWarning) + return new_pd(est=gbrt, + target_variables=target_variables, + 
grid=grid, + X=X, + output=None, + percentiles=percentiles, + grid_resolution=grid_resolution, + method='recursion') def plot_partial_dependence(gbrt, X, features, feature_names=None, @@ -237,159 +146,22 @@ def plot_partial_dependence(gbrt, X, features, feature_names=None, >>> fig, axs = plot_partial_dependence(clf, X, [0, (0, 1)]) #doctest: +SKIP ... """ - import matplotlib.pyplot as plt - from matplotlib import transforms - from matplotlib.ticker import MaxNLocator - from matplotlib.ticker import ScalarFormatter - - if not isinstance(gbrt, BaseGradientBoosting): - raise ValueError('gbrt has to be an instance of BaseGradientBoosting') - check_is_fitted(gbrt, 'estimators_') - - # set label_idx for multi-class GBRT - if hasattr(gbrt, 'classes_') and np.size(gbrt.classes_) > 2: - if label is None: - raise ValueError('label is not given for multi-class PDP') - label_idx = np.searchsorted(gbrt.classes_, label) - if gbrt.classes_[label_idx] != label: - raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) - else: - # regression and binary classification - label_idx = 0 - - X = check_array(X, dtype=DTYPE, order='C') - if gbrt.n_features_ != X.shape[1]: - raise ValueError('X.shape[1] does not match gbrt.n_features_') - - if line_kw is None: - line_kw = {'color': 'green'} - if contour_kw is None: - contour_kw = {} - - # convert feature_names to list - if feature_names is None: - # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(gbrt.n_features_)] - elif isinstance(feature_names, np.ndarray): - feature_names = feature_names.tolist() - - def convert_feature(fx): - if isinstance(fx, six.string_types): - try: - fx = feature_names.index(fx) - except ValueError: - raise ValueError('Feature %s not in feature_names' % fx) - return fx - - # convert features into a seq of int tuples - tmp_features = [] - for fxs in features: - if isinstance(fxs, (numbers.Integral,) + six.string_types): - fxs = (fxs,) - try: - fxs = np.array([convert_feature(fx) for fx in fxs], dtype=np.int32) - except TypeError: - raise ValueError('features must be either int, str, or tuple ' - 'of int/str') - if not (1 <= np.size(fxs) <= 2): - raise ValueError('target features must be either one or two') - - tmp_features.append(fxs) - - features = tmp_features - - names = [] - try: - for fxs in features: - l = [] - # explicit loop so "i" is bound for exception below - for i in fxs: - l.append(feature_names[i]) - names.append(l) - except IndexError: - raise ValueError('All entries of features must be less than ' - 'len(feature_names) = {0}, got {1}.' 
- .format(len(feature_names), i)) - - # compute PD functions - pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(gbrt, fxs, X=X, - grid_resolution=grid_resolution, - percentiles=percentiles) - for fxs in features) - - # get global min and max values of PD grouped by plot type - pdp_lim = {} - for pdp, axes in pd_result: - min_pd, max_pd = pdp[label_idx].min(), pdp[label_idx].max() - n_fx = len(axes) - old_min_pd, old_max_pd = pdp_lim.get(n_fx, (min_pd, max_pd)) - min_pd = min(min_pd, old_min_pd) - max_pd = max(max_pd, old_max_pd) - pdp_lim[n_fx] = (min_pd, max_pd) - - # create contour levels for two-way plots - if 2 in pdp_lim: - Z_level = np.linspace(*pdp_lim[2], num=8) - - if ax is None: - fig = plt.figure(**fig_kw) - else: - fig = ax.get_figure() - fig.clear() - - n_cols = min(n_cols, len(features)) - n_rows = int(np.ceil(len(features) / float(n_cols))) - axs = [] - for i, fx, name, (pdp, axes) in zip(count(), features, names, - pd_result): - ax = fig.add_subplot(n_rows, n_cols, i + 1) - - if len(axes) == 1: - ax.plot(axes[0], pdp[label_idx].ravel(), **line_kw) - else: - # make contour plot - assert len(axes) == 2 - XX, YY = np.meshgrid(axes[0], axes[1]) - Z = pdp[label_idx].reshape(list(map(np.size, axes))).T - CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5, - colors='k') - ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1], - vmin=Z_level[0], alpha=0.75, **contour_kw) - ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True) - - # plot data deciles + axes labels - deciles = mquantiles(X[:, fx[0]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transData, - ax.transAxes) - ylim = ax.get_ylim() - ax.vlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_xlabel(name[0]) - ax.set_ylim(ylim) - - # prevent x-axis ticks from overlapping - ax.xaxis.set_major_locator(MaxNLocator(nbins=6, prune='lower')) - tick_formatter = ScalarFormatter() - tick_formatter.set_powerlimits((-3, 4)) - ax.xaxis.set_major_formatter(tick_formatter) - - if len(axes) > 1: - # two-way PDP - y-axis deciles + labels - deciles = mquantiles(X[:, fx[1]], prob=np.arange(0.1, 1.0, 0.1)) - trans = transforms.blended_transform_factory(ax.transAxes, - ax.transData) - xlim = ax.get_xlim() - ax.hlines(deciles, [0], 0.05, transform=trans, color='k') - ax.set_ylabel(name[1]) - # hline erases xlim - ax.set_xlim(xlim) - else: - ax.set_ylabel('Partial dependence') - - if len(axes) == 1: - ax.set_ylim(pdp_lim[1]) - axs.append(ax) - - fig.subplots_adjust(bottom=0.15, top=0.7, left=0.1, right=0.95, wspace=0.4, - hspace=0.3) - return fig, axs + warnings.warn("The function ensemble.plot_partial_dependence has been " + "moved to partial_dependence in 0.20 and will be removed " + "in 0.22.", + DeprecationWarning) + return new_ppd(est=gbrt, + X=X, + features=features, + feature_names=feature_names, + label=label, + n_cols=n_cols, + grid_resolution=grid_resolution, + method='recursion', + percentiles=percentiles, + n_jobs=n_jobs, + verbose=verbose, + ax=ax, + line_kw=line_kw, + contour_kw=contour_kw, + **fig_kw) diff --git a/sklearn/ensemble/tests/test_partial_dependence.py b/sklearn/ensemble/tests/test_partial_dependence.py index cec7efc46f03b..3a45ade617f9e 100644 --- a/sklearn/ensemble/tests/test_partial_dependence.py +++ b/sklearn/ensemble/tests/test_partial_dependence.py @@ -12,6 +12,8 @@ from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor from sklearn import datasets +from 
sklearn.utils.testing import ignore_warnings +from sklearn.utils.testing import assert_warns_message # toy sample @@ -27,6 +29,7 @@ iris = datasets.load_iris() +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_classifier(): # Test partial dependence for classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -47,6 +50,7 @@ def test_partial_dependence_classifier(): assert_array_equal(pdp, pdp_2) +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_multiclass(): # Test partial dependence for multi-class classifier clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -62,6 +66,7 @@ def test_partial_dependence_multiclass(): assert axes[0].shape[0] == grid_resolution +@ignore_warnings(category=DeprecationWarning) def test_partial_dependence_regressor(): # Test partial dependence for regressor clf = GradientBoostingRegressor(n_estimators=10, random_state=1) @@ -75,6 +80,7 @@ def test_partial_dependence_regressor(): assert axes[0].shape[0] == grid_resolution +@ignore_warnings(category=DeprecationWarning) def test_partial_dependecy_input(): # Test input validation of partial dependence. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -103,6 +109,7 @@ def test_partial_dependecy_input(): assert_raises(ValueError, partial_dependence, clf, [0], grid=grid) +@ignore_warnings(category=DeprecationWarning) @if_matplotlib def test_plot_partial_dependence(): # Test partial dependence plot function. @@ -136,6 +143,7 @@ def test_plot_partial_dependence(): @if_matplotlib +@ignore_warnings(category=DeprecationWarning) def test_plot_partial_dependence_input(): # Test partial dependence plot function input checks. clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -171,6 +179,7 @@ def test_plot_partial_dependence_input(): @if_matplotlib +@ignore_warnings(category=DeprecationWarning) def test_plot_partial_dependence_multiclass(): # Test partial dependence plot function on multi-class input. 
clf = GradientBoostingClassifier(n_estimators=10, random_state=1) @@ -204,3 +213,31 @@ def test_plot_partial_dependence_multiclass(): assert_raises(ValueError, plot_partial_dependence, clf, iris.data, [0, 1], grid_resolution=grid_resolution) + + +def test_warning_raised_for_partial_dependence(): + # Test that running the old partial_dependence function warns + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(boston.data, boston.target) + grid_resolution = 25 + + assert_warns_message(DeprecationWarning, "The function " + "ensemble.partial_dependence has been moved to " + "partial_dependence in 0.20 and will be removed in " + "0.22.", partial_dependence, clf, [0], X=boston.data, + grid_resolution=grid_resolution) + + +@if_matplotlib +def test_warning_raised_for_plot_partial_dependence(): + # Test that running the old partial_dependence function warns + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(boston.data, boston.target) + grid_resolution = 25 + + assert_warns_message(DeprecationWarning, "The function " + "ensemble.plot_partial_dependence has been moved to " + "partial_dependence in 0.20 and will be removed in " + "0.22.", plot_partial_dependence, clf, boston.data, + [0, 1, (0, 1)], grid_resolution=grid_resolution, + feature_names=boston.feature_names) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index da33ec1202b79..f4966b35f056d 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -15,6 +15,7 @@ from .externals import six from .externals.six.moves import map, range, zip from .utils import check_array +from .utils.validation import check_is_fitted from .tree._tree import DTYPE from .ensemble._gradient_boosting import _partial_dependence_tree @@ -310,9 +311,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') if method == 'recursion': - if len(est.estimators_) == 0: - raise ValueError('Call %s.fit before partial_dependence' % - est.__class__.__name__) + check_is_fitted(est, 'estimators_', msg='Call %s.fit before ' + 'partial_dependence' % + est.__class__.__name__) n_features = est.n_features_ elif X is None: raise ValueError('X is required for method="exact" or "estimated".') @@ -490,9 +491,9 @@ def plot_partial_dependence(est, X, features, feature_names=None, raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') if method == 'recursion': - if len(est.estimators_) == 0: - raise ValueError('Call %s.fit before partial_dependence' % - est.__class__.__name__) + check_is_fitted(est, 'estimators_', msg='Call %s.fit before ' + 'partial_dependence' % + est.__class__.__name__) n_features = est.n_features_ elif X is None: raise ValueError('X is required for method="exact" or "estimated".') diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py new file mode 100644 index 0000000000000..13a12f4f809c8 --- /dev/null +++ b/sklearn/tests/test_partial_dependence.py @@ -0,0 +1,206 @@ +""" +Testing for the partial dependence module. 
+""" + +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.utils.testing import assert_raises +from sklearn.utils.testing import if_matplotlib +from sklearn.partial_dependence import partial_dependence +from sklearn.partial_dependence import plot_partial_dependence +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.ensemble import GradientBoostingRegressor +from sklearn import datasets + + +# toy sample +X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] +y = [-1, -1, -1, 1, 1, 1] +T = [[-1, -1], [2, 2], [3, 2]] +true_result = [-1, 1, 1] + +# also load the boston dataset +boston = datasets.load_boston() + +# also load the iris dataset +iris = datasets.load_iris() + + +def test_partial_dependence_classifier(): + # Test partial dependence for classifier + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(X, y) + + pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5) + + # only 4 grid points instead of 5 because only 4 unique X[:,0] vals + assert pdp.shape == (1, 4) + assert axes[0].shape[0] == 4 + + # now with our own grid + X_ = np.asarray(X) + grid = np.unique(X_[:, 0]) + pdp_2, axes = partial_dependence(clf, [0], grid=grid) + + assert axes is None + assert_array_equal(pdp, pdp_2) + + +def test_partial_dependence_multiclass(): + # Test partial dependence for multi-class classifier + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, iris.target) + + grid_resolution = 25 + n_classes = clf.n_classes_ + pdp, axes = partial_dependence( + clf, [0], X=iris.data, grid_resolution=grid_resolution) + + assert pdp.shape == (n_classes, grid_resolution) + assert len(axes) == 1 + assert axes[0].shape[0] == grid_resolution + + +def test_partial_dependence_regressor(): + # Test partial dependence for regressor + clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(boston.data, boston.target) + + grid_resolution = 25 + pdp, axes = partial_dependence( + clf, [0], X=boston.data, grid_resolution=grid_resolution) + + assert pdp.shape == (1, grid_resolution) + assert axes[0].shape[0] == grid_resolution + + +def test_partial_dependecy_input(): + # Test input validation of partial dependence. + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(X, y) + + assert_raises(ValueError, partial_dependence, + clf, [0], grid=None, X=None) + + assert_raises(ValueError, partial_dependence, + clf, [0], grid=[0, 1], X=X) + + # first argument must be an instance of BaseGradientBoosting + assert_raises(ValueError, partial_dependence, + {}, [0], X=X) + + # Gradient boosting estimator must be fit + assert_raises(ValueError, partial_dependence, + GradientBoostingClassifier(), [0], X=X) + + assert_raises(ValueError, partial_dependence, clf, [-1], X=X) + + assert_raises(ValueError, partial_dependence, clf, [100], X=X) + + # wrong ndim for grid + grid = np.random.rand(10, 2, 1) + assert_raises(ValueError, partial_dependence, clf, [0], grid=grid) + + +@if_matplotlib +def test_plot_partial_dependence(): + # Test partial dependence plot function. 
+ clf = GradientBoostingRegressor(n_estimators=10, random_state=1) + clf.fit(boston.data, boston.target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, boston.data, [0, 1, (0, 1)], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + # check with str features and array feature names + fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=boston.feature_names) + + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + # check with list feature_names + feature_names = boston.feature_names.tolist() + fig, axs = plot_partial_dependence(clf, boston.data, ['CRIM', 'ZN', + ('CRIM', 'ZN')], + grid_resolution=grid_resolution, + feature_names=feature_names) + assert len(axs) == 3 + assert all(ax.has_data for ax in axs) + + +@if_matplotlib +def test_plot_partial_dependence_input(): + # Test partial dependence plot function input checks. + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + + # not fitted yet + assert_raises(ValueError, plot_partial_dependence, + clf, X, [0]) + + clf.fit(X, y) + + assert_raises(ValueError, plot_partial_dependence, + clf, np.array(X)[:, :0], [0]) + + # first argument must be an instance of BaseGradientBoosting + assert_raises(ValueError, plot_partial_dependence, + {}, X, [0]) + + # must be larger than -1 + assert_raises(ValueError, plot_partial_dependence, + clf, X, [-1]) + + # too large feature value + assert_raises(ValueError, plot_partial_dependence, + clf, X, [100]) + + # str feature but no feature_names + assert_raises(ValueError, plot_partial_dependence, + clf, X, ['foobar']) + + # not valid features value + assert_raises(ValueError, plot_partial_dependence, + clf, X, [{'foo': 'bar'}]) + + +@if_matplotlib +def test_plot_partial_dependence_multiclass(): + # Test partial dependence plot function on multi-class input. 
+ clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, iris.target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], + label=0, + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + # now with symbol labels + target = iris.target_names[iris.target] + clf = GradientBoostingClassifier(n_estimators=10, random_state=1) + clf.fit(iris.data, target) + + grid_resolution = 25 + fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], + label='setosa', + grid_resolution=grid_resolution) + assert len(axs) == 2 + assert all(ax.has_data for ax in axs) + + # label not in gbrt.classes_ + assert_raises(ValueError, plot_partial_dependence, + clf, iris.data, [0, 1], label='foobar', + grid_resolution=grid_resolution) + + # label not provided + assert_raises(ValueError, plot_partial_dependence, + clf, iris.data, [0, 1], + grid_resolution=grid_resolution) From 9095305f5ebaea0a51c0aff8797773e5ae6d661d Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 15 Aug 2017 22:29:24 +1000 Subject: [PATCH 10/23] refactor estimated and exact functions to _predict --- sklearn/partial_dependence.py | 173 ++++++++++++---------------------- 1 file changed, 59 insertions(+), 114 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index f4966b35f056d..cbc56fb775643 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -75,8 +75,8 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _exact_partial_dependence(est, target_variables, grid, X, output=None): - """Calculate the partial dependence of ``target_variables``. +def _predict(est, X_eval, method, output=None): + """Calculate part of the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on ``X`` for every @@ -86,142 +86,68 @@ def _exact_partial_dependence(est, target_variables, grid, X, output=None): ---------- est : BaseEstimator A fitted classification or regression model. - target_variables : array-like, dtype=int - The target features for which the partial dependency should be - computed (size should be smaller than 3 for visual renderings). - grid : array-like, shape=(n_points, len(target_variables)) - The grid of ``target_variables`` values for which the - partial dependency should be evaluated (either ``grid`` or ``X`` - must be specified). - X : array-like, shape=(n_samples, n_features) - The data on which ``est`` was trained. - output : int, optional (default=None) - The output index to use for multi-output estimators. - - Returns - ------- - pdp : array, shape=(n_classes, n_points) - The partial dependence function evaluated on the ``grid``. - For regression and binary classification ``n_classes==1``. 
- """ - n_samples = X.shape[0] - pdp = [] - for row in range(grid.shape[0]): - X_eval = X.copy() - for i, variable in enumerate(target_variables): - X_eval[:, variable] = np.repeat(grid[row, i], n_samples) - if est._estimator_type == 'regressor': - try: - pdp_row = est.predict(X_eval) - except NotFittedError: - raise ValueError('Call %s.fit before partial_dependence' % - est.__class__.__name__) - if pdp_row.ndim != 1 and pdp_row.shape[1] != 1: - # Multi-output - if not 0 <= output < pdp_row.shape[1]: - raise ValueError('Valid output must be specified for ' - 'multi-output models.') - pdp_row = pdp_row[:, output] - pdp.append(np.mean(pdp_row)) - elif est._estimator_type == 'classifier': - try: - pdp_row = est.predict_proba(X_eval) - except NotFittedError: - raise ValueError('Call %s.fit before partial_dependence' % - est.__class__.__name__) - if isinstance(pdp_row, list): - # Multi-output - if not 0 <= output < len(pdp_row): - raise ValueError('Valid output must be specified for ' - 'multi-output models.') - pdp_row = pdp_row[output] - pdp_row = np.log(np.clip(pdp_row, 1e-16, 1)) - pdp_row = np.subtract(pdp_row, - np.mean(pdp_row, 1)[:, np.newaxis]) - pdp.append(np.mean(pdp_row, 0)) - else: - raise ValueError('est must be a fitted regressor or classifier ' - 'model.') - pdp = np.array(pdp).transpose() - if pdp.shape[0] == 2: - # Binary classification - pdp = pdp[1, :][np.newaxis] - elif len(pdp.shape) == 1: - # Regression - pdp = pdp[np.newaxis] - return pdp - - -def _estimated_partial_dependence(est, target_variables, grid, X, output=None): - """Calculate the partial dependence of ``target_variables``. - - The function will be calculated by calling the ``predict_proba`` method of - ``est`` for classification or ``predict`` for regression on the mean of - ``X``. + X_eval : array-like, shape=(n_samples, n_features) + The data on which the partial dependence of ``est`` should be + predicted. + method : {'exact', 'estimated'} + The method to use to calculate the partial dependence function: - Parameters - ---------- - est : BaseEstimator - A fitted classification or regression model. - target_variables : array-like, dtype=int - The target features for which the partial dependency should be - computed (size should be smaller than 3 for visual renderings). - grid : array-like, shape=(n_points, len(target_variables)) - The grid of ``target_variables`` values for which the - partial dependency should be evaluated (either ``grid`` or ``X`` - must be specified). - X : array-like, shape=(n_samples, n_features) - The data on which ``est`` was trained. + - If 'exact', the function will be calculated by calling the + ``predict_proba`` method of ``est`` for classification or ``predict`` + for regression on ``X``for every point in the grid. To speed up this + method, you can use a subset of ``X`` or a more coarse grid. + - If 'estimated', the function will be calculated by calling the + ``predict_proba`` method of ``est`` for classification or ``predict`` + for regression on the mean of ``X``. output : int, optional (default=None) The output index to use for multi-output estimators. Returns ------- - pdp : array, shape=(n_classes, n_points) + out : array, shape=(n_classes, n_points) The partial dependence function evaluated on the ``grid``. For regression and binary classification ``n_classes==1``. 
""" - n_samples = grid.shape[0] - X_eval = np.tile(X.mean(0), [n_samples, 1]) - for i, variable in enumerate(target_variables): - X_eval[:, variable] = grid[:, i] if est._estimator_type == 'regressor': try: - pdp = est.predict(X_eval) + out = est.predict(X_eval) except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - if pdp.ndim != 1 and pdp.shape[1] == 1: + if out.ndim != 1 and out.shape[1] == 1: # Column output - pdp = pdp.ravel() - if pdp.ndim != 1 and pdp.shape[1] != 1: + out = out.ravel() + if out.ndim != 1 and out.shape[1] != 1: # Multi-output - if not 0 <= output < pdp.shape[1]: + if not 0 <= output < out.shape[1]: raise ValueError('Valid output must be specified for ' 'multi-output models.') - pdp = pdp[:, output] - pdp = pdp[np.newaxis] + out = out[:, output] + if method == 'exact': + return np.mean(out) + else: + return out[np.newaxis] elif est._estimator_type == 'classifier': try: - pdp = est.predict_proba(X_eval) + out = est.predict_proba(X_eval) except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - if isinstance(pdp, list): + if isinstance(out, list): # Multi-output - if not 0 <= output < len(pdp): + if not 0 <= output < len(out): raise ValueError('Valid output must be specified for ' 'multi-output models.') - pdp = pdp[output] - pdp = np.log(np.clip(pdp, 1e-16, 1)) - pdp = np.subtract(pdp, np.mean(pdp, 1)[:, np.newaxis]) - pdp = pdp.transpose() + out = out[output] + out = np.log(np.clip(out, 1e-16, 1)) + out = np.subtract(out, np.mean(out, 1)[:, np.newaxis]) + if method == 'exact': + return np.mean(out, 0) + else: + return out.transpose() else: - raise ValueError('est must be a fitted regressor or classifier model.') - if pdp.shape[0] == 2: - # Binary classification - pdp = pdp[1, :][np.newaxis] - return pdp + raise ValueError('est must be a fitted regressor or classifier ' + 'model.') def partial_dependence(est, target_variables, grid=None, X=None, output=None, @@ -368,10 +294,29 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if isinstance(est, ForestRegressor): pdp /= n_estimators elif method == 'exact': - pdp = _exact_partial_dependence(est, target_variables, grid, X, output) + n_samples = X.shape[0] + pdp = [] + for row in range(grid.shape[0]): + X_eval = X.copy() + for i, variable in enumerate(target_variables): + X_eval[:, variable] = np.repeat(grid[row, i], n_samples) + pdp.append(_predict(est, X_eval, method, output=None)) + pdp = np.array(pdp).transpose() + if pdp.shape[0] == 2: + # Binary classification + pdp = pdp[1, :][np.newaxis] + elif len(pdp.shape) == 1: + # Regression + pdp = pdp[np.newaxis] elif method == 'estimated': - pdp = _estimated_partial_dependence(est, target_variables, grid, X, - output) + n_samples = grid.shape[0] + X_eval = np.tile(X.mean(0), [n_samples, 1]) + for i, variable in enumerate(target_variables): + X_eval[:, variable] = grid[:, i] + pdp = _predict(est, X_eval, method, output=None) + if pdp.shape[0] == 2: + # Binary classification + pdp = pdp[1, :][np.newaxis] else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' 
% method) From 3fc1727ecdb62002bbfba454fb11a09decda8479 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 15 Aug 2017 22:36:33 +1000 Subject: [PATCH 11/23] make "auto" the default rather than None for method --- sklearn/partial_dependence.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index cbc56fb775643..bc3d7a351db6c 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -152,7 +152,7 @@ def _predict(est, X_eval, method, output=None): def partial_dependence(est, target_variables, grid=None, X=None, output=None, percentiles=(0.05, 0.95), grid_resolution=100, - method=None): + method='auto'): """Partial dependence of ``target_variables``. Partial dependence plots show the dependence between the joint values @@ -184,7 +184,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, for the ``grid``. Only if ``X`` is not None. grid_resolution : int, default=100 The number of equally spaced points on the ``grid``. - method : {'recursion', 'exact', 'estimated', None}, optional (default=None) + method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto' The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to @@ -197,7 +197,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, - If 'estimated', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on the mean of ``X``. - - If None, then 'recursion' will be used if ``est`` is + - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. @@ -220,7 +220,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ - if method is None: + if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): method = 'recursion' else: @@ -326,7 +326,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, def plot_partial_dependence(est, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, - method=None, percentiles=(0.05, 0.95), n_jobs=1, + method='auto', percentiles=(0.05, 0.95), n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots for ``features``. @@ -364,7 +364,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, for the PDP axes. grid_resolution : int, default=100 The number of equally spaced points on the axes. - method : {'recursion', 'exact', 'estimated', None}, optional (default=None) + method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto' The method to use to calculate the partial dependence function: - If 'recursion', the underlying trees of ``est`` will be recursed to @@ -377,7 +377,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, - If 'estimated', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` for regression on the mean of ``X``. - - If None, then 'recursion' will be used if ``est`` is + - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. 
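The 'auto' dispatch described above amounts to a small conditional. A minimal sketch of the selection rule (``resolve_method`` is a hypothetical helper introduced here for illustration; ``BaseGradientBoosting`` and ``ForestRegressor`` are the classes imported elsewhere in this series):

    def resolve_method(est, method='auto'):
        # Tree ensembles with a native C traversal take the fast
        # 'recursion' path; everything else falls back to averaging
        # real predictions via 'exact'.
        if method == 'auto':
            if isinstance(est, (BaseGradientBoosting, ForestRegressor)):
                return 'recursion'
            return 'exact'
        return method
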
n_jobs : int @@ -419,7 +419,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, from matplotlib.ticker import MaxNLocator from matplotlib.ticker import ScalarFormatter - if method is None: + if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): method = 'recursion' else: From 259ec9946927d331586def5b6ce4417f7d6e8fe8 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Wed, 16 Aug 2017 21:49:33 +1000 Subject: [PATCH 12/23] some more refactoring --- sklearn/partial_dependence.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index bc3d7a351db6c..a3af46443a0cb 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -114,19 +114,12 @@ def _predict(est, X_eval, method, output=None): except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) - if out.ndim != 1 and out.shape[1] == 1: - # Column output - out = out.ravel() if out.ndim != 1 and out.shape[1] != 1: # Multi-output if not 0 <= output < out.shape[1]: raise ValueError('Valid output must be specified for ' 'multi-output models.') out = out[:, output] - if method == 'exact': - return np.mean(out) - else: - return out[np.newaxis] elif est._estimator_type == 'classifier': try: out = est.predict_proba(X_eval) @@ -141,13 +134,10 @@ def _predict(est, X_eval, method, output=None): out = out[output] out = np.log(np.clip(out, 1e-16, 1)) out = np.subtract(out, np.mean(out, 1)[:, np.newaxis]) - if method == 'exact': - return np.mean(out, 0) - else: - return out.transpose() else: raise ValueError('est must be a fitted regressor or classifier ' 'model.') + return out def partial_dependence(est, target_variables, grid=None, X=None, output=None, @@ -308,6 +298,10 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, elif len(pdp.shape) == 1: # Regression pdp = pdp[np.newaxis] + if est._estimator_type == 'regressor': + pdp = np.mean(pdp) + else: + pdp = np.mean(pdp, 0) elif method == 'estimated': n_samples = grid.shape[0] X_eval = np.tile(X.mean(0), [n_samples, 1]) @@ -317,6 +311,10 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if pdp.shape[0] == 2: # Binary classification pdp = pdp[1, :][np.newaxis] + if est._estimator_type == 'regressor': + pdp = pdp[np.newaxis] + else: + pdp = pdp.transpose() else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' 
% method) From cbc20af0448f219f73cbebd90f39af3b20571de4 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Mon, 21 Aug 2017 18:52:01 +1000 Subject: [PATCH 13/23] avoid namespace collision --- sklearn/partial_dependence.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index a3af46443a0cb..2dba915cb98a4 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -18,9 +18,6 @@ from .utils.validation import check_is_fitted from .tree._tree import DTYPE -from .ensemble._gradient_boosting import _partial_dependence_tree -from .ensemble.gradient_boosting import BaseGradientBoosting -from .ensemble.forest import ForestRegressor from .exceptions import NotFittedError @@ -210,6 +207,12 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, >>> partial_dependence(gb, [0], **kwargs) # doctest: +SKIP (array([[-4.52..., 4.52...]]), [array([ 0., 1.])]) """ + # TODO: The pattern below required to avoid a namespace collision. + # TODO: Move below imports to module level import at 0.22 release. + from .ensemble._gradient_boosting import _partial_dependence_tree + from .ensemble.gradient_boosting import BaseGradientBoosting + from .ensemble.forest import ForestRegressor + if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): method = 'recursion' @@ -223,7 +226,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if (not hasattr(est, '_estimator_type') or est._estimator_type not in ('classifier', 'regressor')): raise ValueError('est must be a fitted regressor or classifier model.') - if method != 'recursion' and est._estimator_type == 'classifier': + if (method != 'recursion' and est._estimator_type == 'classifier' and + not hasattr(est, 'predict_proba')): raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') if method == 'recursion': @@ -416,6 +420,10 @@ def plot_partial_dependence(est, X, features, feature_names=None, from matplotlib import transforms from matplotlib.ticker import MaxNLocator from matplotlib.ticker import ScalarFormatter + # TODO: The pattern below required to avoid a namespace collision. + # TODO: Move below imports to module level import at 0.22 release. 
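The imports that follow are deferred to call time, per the TODOs above. A minimal illustration of why a function-level import sidesteps an import-time cycle (file names are invented for the example; the cycle here presumably runs between this module and the deprecated ``ensemble.partial_dependence`` shim introduced earlier in the series):

    # a.py -- stands in for sklearn/partial_dependence.py
    def f():
        from b import B  # resolved at call time, after b has fully loaded
        return B()

    # b.py -- stands in for the ensemble side of the cycle
    import a             # safe: a.py no longer needs b at import time
    class B(object):
        pass
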
+ from .ensemble.gradient_boosting import BaseGradientBoosting + from .ensemble.forest import ForestRegressor if method == 'auto': if isinstance(est, (BaseGradientBoosting, ForestRegressor)): @@ -455,7 +463,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, label_idx = 0 X = check_array(X, dtype=DTYPE, order='C') - if est.n_features_ != X.shape[1]: + if hasattr(est, 'n_features_') and est.n_features_ != X.shape[1]: raise ValueError('X.shape[1] does not match est.n_features_') if line_kw is None: From 63da115c613b4a6fbaf787bc51832863ad1b85fb Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 22 Aug 2017 21:12:43 +1000 Subject: [PATCH 14/23] fix output shapes of all estimators --- sklearn/partial_dependence.py | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 2dba915cb98a4..2582affad5714 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -72,7 +72,7 @@ def _grid_from_X(X, percentiles=(0.05, 0.95), grid_resolution=100): return cartesian(axes), axes -def _predict(est, X_eval, method, output=None): +def _predict(est, X_eval, output=None): """Calculate part of the partial dependence of ``target_variables``. The function will be calculated by calling the ``predict_proba`` method of @@ -86,16 +86,6 @@ def _predict(est, X_eval, method, output=None): X_eval : array-like, shape=(n_samples, n_features) The data on which the partial dependence of ``est`` should be predicted. - method : {'exact', 'estimated'} - The method to use to calculate the partial dependence function: - - - If 'exact', the function will be calculated by calling the - ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on ``X``for every point in the grid. To speed up this - method, you can use a subset of ``X`` or a more coarse grid. - - If 'estimated', the function will be calculated by calling the - ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on the mean of ``X``. output : int, optional (default=None) The output index to use for multi-output estimators. 
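For orientation, the two multi-output return shapes that ``output`` disambiguates can be seen with any multi-output estimator. A runnable sketch (random forests are used purely as convenient multi-output models; none of this is part of the patch):

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

    rng = np.random.RandomState(0)
    X_demo = rng.rand(20, 4)
    y_multi = rng.randint(0, 2, size=(20, 2))   # two output columns

    clf = RandomForestClassifier(n_estimators=5, random_state=0)
    clf.fit(X_demo, y_multi)
    proba = clf.predict_proba(X_demo)
    # Classification: predict_proba returns a Python list with one
    # (n_samples, n_classes) array per output, so ``output`` indexes
    # the list.
    assert isinstance(proba, list) and len(proba) == 2

    reg = RandomForestRegressor(n_estimators=5, random_state=0)
    reg.fit(X_demo, y_multi.astype(float))
    pred = reg.predict(X_demo)
    # Regression: predict returns an (n_samples, n_outputs) array, so
    # ``output`` is a column index.
    assert pred.shape == (20, 2)
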
@@ -111,6 +101,9 @@ def _predict(est, X_eval, method, output=None): except NotFittedError: raise ValueError('Call %s.fit before partial_dependence' % est.__class__.__name__) + if out.ndim != 1 and out.shape[1] == 1: + # Column output + out = out.ravel() if out.ndim != 1 and out.shape[1] != 1: # Multi-output if not 0 <= output < out.shape[1]: @@ -294,7 +287,11 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, X_eval = X.copy() for i, variable in enumerate(target_variables): X_eval[:, variable] = np.repeat(grid[row, i], n_samples) - pdp.append(_predict(est, X_eval, method, output=None)) + pdp_row = _predict(est, X_eval, output=output) + if est._estimator_type == 'regressor': + pdp.append(np.mean(pdp_row)) + else: + pdp.append(np.mean(pdp_row, 0)) pdp = np.array(pdp).transpose() if pdp.shape[0] == 2: # Binary classification @@ -302,23 +299,17 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, elif len(pdp.shape) == 1: # Regression pdp = pdp[np.newaxis] - if est._estimator_type == 'regressor': - pdp = np.mean(pdp) - else: - pdp = np.mean(pdp, 0) elif method == 'estimated': n_samples = grid.shape[0] X_eval = np.tile(X.mean(0), [n_samples, 1]) for i, variable in enumerate(target_variables): X_eval[:, variable] = grid[:, i] - pdp = _predict(est, X_eval, method, output=None) - if pdp.shape[0] == 2: + pdp = _predict(est, X_eval, output=output) + if pdp.shape[1] == 2: # Binary classification - pdp = pdp[1, :][np.newaxis] + pdp = pdp[:, 1][np.newaxis] if est._estimator_type == 'regressor': pdp = pdp[np.newaxis] - else: - pdp = pdp.transpose() else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' % method) From 8f7d2b0a9e44de42c5d019fab6302aba9a934baa Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Tue, 22 Aug 2017 22:33:42 +1000 Subject: [PATCH 15/23] add tests to ensure all estimators output same shape --- sklearn/partial_dependence.py | 3 +- sklearn/tests/test_partial_dependence.py | 57 ++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 2582affad5714..b9401cec77dd7 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -305,7 +305,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, for i, variable in enumerate(target_variables): X_eval[:, variable] = grid[:, i] pdp = _predict(est, X_eval, output=output) - if pdp.shape[1] == 2: + if est._estimator_type == 'classifier' and pdp.shape[1] == 2: # Binary classification pdp = pdp[:, 1][np.newaxis] if est._estimator_type == 'regressor': @@ -313,7 +313,6 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, else: raise ValueError('method "%s" is invalid. Use "recursion", "exact", ' '"estimated", or None.' 
% method) - return pdp, axes diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 13a12f4f809c8..ce3d8a99a05eb 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -7,24 +7,73 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import if_matplotlib +from sklearn.utils.testing import all_estimators +from sklearn.utils.testing import ignore_warnings from sklearn.partial_dependence import partial_dependence from sklearn.partial_dependence import plot_partial_dependence from sklearn.ensemble import GradientBoostingClassifier from sklearn.ensemble import GradientBoostingRegressor +from sklearn.ensemble.gradient_boosting import BaseGradientBoosting +from sklearn.ensemble.forest import ForestRegressor from sklearn import datasets - # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] T = [[-1, -1], [2, 2], [3, 2]] true_result = [-1, 1, 1] -# also load the boston dataset +# Load the boston, iris & breast cancer datasets boston = datasets.load_boston() - -# also load the iris dataset iris = datasets.load_iris() +breast_cancer = datasets.load_breast_cancer() + + +@ignore_warnings() +def test_output_shape_classifier(): + # Test that partial_dependence has same output shape for all classifiers + for name, Estimator in all_estimators(): + clf = Estimator() + if (not hasattr(clf, '_estimator_type') or + 'MultiTask' in name or + clf._estimator_type != 'classifier' or + not hasattr(clf, 'predict_proba')): + continue + clf.fit(breast_cancer.data, breast_cancer.target) + for method in ['recursion', 'exact', 'estimated']: + if (method == 'recursion' and not + (isinstance(clf, BaseGradientBoosting) or + isinstance(clf, ForestRegressor))): + continue + pdp, axes = partial_dependence(clf, + target_variables=[1], + X=breast_cancer.data, + method=method, + grid_resolution=20) + assert(pdp.shape == (1, 20)) + + +@ignore_warnings() +def test_output_shape_regressor(): + # Test that partial_dependence has same output shape for all regressors + for name, Estimator in all_estimators(): + clf = Estimator() + if (not hasattr(clf, '_estimator_type') or + 'MultiTask' in name or + clf._estimator_type != 'regressor'): + continue + clf.fit(boston.data, boston.target) + for method in ['recursion', 'exact', 'estimated']: + if (method == 'recursion' and not + (isinstance(clf, BaseGradientBoosting) or + isinstance(clf, ForestRegressor))): + continue + pdp, axes = partial_dependence(clf, + target_variables=[1], + X=boston.data, + method=method, + grid_resolution=20) + assert(pdp.shape == (1, 20)) def test_partial_dependence_classifier(): From 6fc3a497e861f72695d69dd1ee5d677445888161 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Wed, 30 Aug 2017 20:25:18 +1000 Subject: [PATCH 16/23] quick fixes --- sklearn/partial_dependence.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index b9401cec77dd7..7e6635e4120e1 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -428,7 +428,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, if (not hasattr(est, '_estimator_type') or est._estimator_type not in ('classifier', 'regressor')): raise ValueError('est must be a fitted regressor or classifier model.') - if method != 'recursion' and est._estimator_type == 'classifier': + if (method != 'recursion' and est._estimator_type == 'classifier' 
and + not hasattr(est, 'predict_proba')): raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') if method == 'recursion': @@ -464,7 +465,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, # convert feature_names to list if feature_names is None: # if not feature_names use fx indices as name - feature_names = [str(i) for i in range(est.n_features_)] + feature_names = [str(i) for i in range(n_features)] elif isinstance(feature_names, np.ndarray): feature_names = feature_names.tolist() From b1f8bfcddd49e7eb7ba71e30b7d4bb8b24ea9056 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Fri, 1 Sep 2017 20:31:12 +1000 Subject: [PATCH 17/23] fix docstring, test fails --- sklearn/partial_dependence.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 7e6635e4120e1..91d7ac234bc7c 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -176,7 +176,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, method, you can use a subset of ``X`` or a more coarse grid. - If 'estimated', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on the mean of ``X``. + for regression on the median of ``X``. - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. @@ -301,7 +301,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, pdp = pdp[np.newaxis] elif method == 'estimated': n_samples = grid.shape[0] - X_eval = np.tile(X.mean(0), [n_samples, 1]) + X_eval = np.tile(np.median(X, 0), [n_samples, 1]) for i, variable in enumerate(target_variables): X_eval[:, variable] = grid[:, i] pdp = _predict(est, X_eval, output=output) @@ -318,7 +318,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, def plot_partial_dependence(est, X, features, feature_names=None, label=None, n_cols=3, grid_resolution=100, - method='auto', percentiles=(0.05, 0.95), n_jobs=1, + percentiles=(0.05, 0.95), method='auto', n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): """Partial dependence plots for ``features``. @@ -351,11 +351,11 @@ def plot_partial_dependence(est, X, features, feature_names=None, Only if est is a multi-class model. Must be in ``est.classes_``. n_cols : int The number of columns in the grid plot (default: 3). + grid_resolution : int, default=100 + The number of equally spaced points on the axes. percentiles : (low, high), default=(0.05, 0.95) The lower and upper percentile used to create the extreme values for the PDP axes. - grid_resolution : int, default=100 - The number of equally spaced points on the axes. method : {'recursion', 'exact', 'estimated', 'auto'}, default='auto' The method to use to calculate the partial dependence function: @@ -368,7 +368,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, method, you can use a subset of ``X`` or a more coarse grid. - If 'estimated', the function will be calculated by calling the ``predict_proba`` method of ``est`` for classification or ``predict`` - for regression on the mean of ``X``. + for regression on the median of ``X``. - If 'auto', then 'recursion' will be used if ``est`` is BaseGradientBoosting or ForestRegressor, and 'exact' used for other estimators. 
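Before the test refactor that follows, it helps to pin down how the two non-recursion methods assemble their evaluation data. A simplified sketch for a single target feature, mirroring the constructions inside ``partial_dependence`` above (function names are illustrative only):

    import numpy as np

    def exact_eval_sets(X, feature, grid):
        # 'exact': for each grid value, clamp the target column in a
        # full copy of X; the caller averages predictions over all
        # n_samples rows per grid point.
        for value in grid:
            X_eval = X.copy()
            X_eval[:, feature] = value
            yield X_eval

    def estimated_eval_set(X, feature, grid):
        # 'estimated': one synthetic row per grid point, built from the
        # per-feature median of X with only the target column varied.
        X_eval = np.tile(np.median(X, axis=0), [len(grid), 1])
        X_eval[:, feature] = grid
        return X_eval
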
From dc93b694aeb41f220203c552dfd5dbd07f049937 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Fri, 1 Sep 2017 21:27:05 +1000 Subject: [PATCH 18/23] refactor tests for easier debugging --- sklearn/partial_dependence.py | 2 +- sklearn/tests/test_partial_dependence.py | 97 ++++++++++++++++-------- 2 files changed, 67 insertions(+), 32 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 91d7ac234bc7c..7a51c35793bfc 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -385,7 +385,7 @@ def plot_partial_dependence(est, X, features, feature_names=None, contour_kw : dict Dict with keywords passed to the ``matplotlib.pyplot.plot`` call. For two-way partial dependence plots. - fig_kw : dict + **fig_kw : dict Dict with keywords passed to the figure() call. Note that all keywords not recognized above will be automatically included here. diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index ce3d8a99a05eb..4d7855cae64aa 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -29,51 +29,86 @@ breast_cancer = datasets.load_breast_cancer() +def test_output_shape_recursion(): + # Test recursion partial_dependence has same output shape for everything + for name, Estimator in all_estimators(): + est = Estimator() + if not (isinstance(est, BaseGradientBoosting) or + isinstance(est, ForestRegressor)): + continue + if est._estimator_type == 'classifier': + est.fit(breast_cancer.data, breast_cancer.target) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=breast_cancer.data, + method='recursion', + grid_resolution=10) + elif est._estimator_type == 'regressor': + est.fit(boston.data, boston.target) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=boston.data, + method='recursion', + grid_resolution=10) + else: + continue + assert(pdp.shape == (1, 10)) + + @ignore_warnings() -def test_output_shape_classifier(): - # Test that partial_dependence has same output shape for all classifiers +def test_output_shape_exact(): + # Test exact partial_dependence has same output shape for everything for name, Estimator in all_estimators(): - clf = Estimator() - if (not hasattr(clf, '_estimator_type') or - 'MultiTask' in name or - clf._estimator_type != 'classifier' or - not hasattr(clf, 'predict_proba')): + est = Estimator() + if not hasattr(est, '_estimator_type') or 'MultiTask' in name: continue - clf.fit(breast_cancer.data, breast_cancer.target) - for method in ['recursion', 'exact', 'estimated']: - if (method == 'recursion' and not - (isinstance(clf, BaseGradientBoosting) or - isinstance(clf, ForestRegressor))): + if est._estimator_type == 'classifier': + if not hasattr(est, 'predict_proba'): continue - pdp, axes = partial_dependence(clf, + est.fit(breast_cancer.data, breast_cancer.target) + pdp, axes = partial_dependence(est, target_variables=[1], X=breast_cancer.data, - method=method, - grid_resolution=20) - assert(pdp.shape == (1, 20)) + method='exact', + grid_resolution=10) + elif est._estimator_type == 'regressor': + est.fit(boston.data, boston.target) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=boston.data, + method='exact', + grid_resolution=10) + else: + continue + assert(pdp.shape == (1, 10)) @ignore_warnings() -def test_output_shape_regressor(): - # Test that partial_dependence has same output shape for all regressors +def test_output_shape_estimated(): + # Test exact partial_dependence has 
same output shape for everything for name, Estimator in all_estimators(): - clf = Estimator() - if (not hasattr(clf, '_estimator_type') or - 'MultiTask' in name or - clf._estimator_type != 'regressor'): + est = Estimator() + if not hasattr(est, '_estimator_type') or 'MultiTask' in name: continue - clf.fit(boston.data, boston.target) - for method in ['recursion', 'exact', 'estimated']: - if (method == 'recursion' and not - (isinstance(clf, BaseGradientBoosting) or - isinstance(clf, ForestRegressor))): + if est._estimator_type == 'classifier': + if not hasattr(est, 'predict_proba'): continue - pdp, axes = partial_dependence(clf, + est.fit(breast_cancer.data, breast_cancer.target) + pdp, axes = partial_dependence(est, + target_variables=[1], + X=breast_cancer.data, + method='estimated', + grid_resolution=10) + elif est._estimator_type == 'regressor': + est.fit(boston.data, boston.target) + pdp, axes = partial_dependence(est, target_variables=[1], X=boston.data, - method=method, - grid_resolution=20) - assert(pdp.shape == (1, 20)) + method='estimated', + grid_resolution=10) + else: + continue + assert(pdp.shape == (1, 10)) def test_partial_dependence_classifier(): From cd8f8de65b3c0d17de87abf9f228a126f4039ab9 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 2 Sep 2017 11:48:54 +1000 Subject: [PATCH 19/23] speed up tests, add two-way plot test --- sklearn/tests/test_partial_dependence.py | 96 +++++++++++++++++------- 1 file changed, 68 insertions(+), 28 deletions(-) diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 4d7855cae64aa..46d12132a27ad 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -15,7 +15,8 @@ from sklearn.ensemble import GradientBoostingRegressor from sklearn.ensemble.gradient_boosting import BaseGradientBoosting from sklearn.ensemble.forest import ForestRegressor -from sklearn import datasets +from sklearn.datasets import load_boston, load_iris +from sklearn.datasets import make_classification, make_regression # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] @@ -23,36 +24,53 @@ T = [[-1, -1], [2, 2], [3, 2]] true_result = [-1, 1, 1] -# Load the boston, iris & breast cancer datasets -boston = datasets.load_boston() -iris = datasets.load_iris() -breast_cancer = datasets.load_breast_cancer() +# Make some sample data to test output shapes +X_c, y_c = make_classification(n_features=10, n_informative=5, random_state=0) +# Non-negative for MultinomialNB +X_c = X_c + np.abs(X_c.min()) +X_r, y_r = make_regression(n_features=10, n_informative=5, random_state=0) +# Load the boston & iris datasets +boston = load_boston() +iris = load_iris() + +@ignore_warnings() def test_output_shape_recursion(): # Test recursion partial_dependence has same output shape for everything for name, Estimator in all_estimators(): est = Estimator() if not (isinstance(est, BaseGradientBoosting) or - isinstance(est, ForestRegressor)): + isinstance(est, ForestRegressor)): continue if est._estimator_type == 'classifier': - est.fit(breast_cancer.data, breast_cancer.target) + est.fit(X_c, y_c) pdp, axes = partial_dependence(est, target_variables=[1], - X=breast_cancer.data, + X=X_c, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_c, method='recursion', grid_resolution=10) + assert (pdp.shape == (1, 100)) elif est._estimator_type == 'regressor': - est.fit(boston.data, boston.target) + 
est.fit(X_r, y_r) pdp, axes = partial_dependence(est, target_variables=[1], - X=boston.data, + X=X_r, method='recursion', grid_resolution=10) - else: - continue - assert(pdp.shape == (1, 10)) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='recursion', + grid_resolution=10) + assert (pdp.shape == (1, 100)) @ignore_warnings() @@ -65,27 +83,38 @@ def test_output_shape_exact(): if est._estimator_type == 'classifier': if not hasattr(est, 'predict_proba'): continue - est.fit(breast_cancer.data, breast_cancer.target) + est.fit(X_c, y_c) pdp, axes = partial_dependence(est, target_variables=[1], - X=breast_cancer.data, + X=X_c, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_c, method='exact', grid_resolution=10) + assert (pdp.shape == (1, 100)) elif est._estimator_type == 'regressor': - est.fit(boston.data, boston.target) + est.fit(X_r, y_r) pdp, axes = partial_dependence(est, target_variables=[1], - X=boston.data, + X=X_r, method='exact', grid_resolution=10) - else: - continue - assert(pdp.shape == (1, 10)) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='exact', + grid_resolution=10) + assert (pdp.shape == (1, 100)) @ignore_warnings() def test_output_shape_estimated(): - # Test exact partial_dependence has same output shape for everything + # Test estimated partial_dependence has same output shape for everything for name, Estimator in all_estimators(): est = Estimator() if not hasattr(est, '_estimator_type') or 'MultiTask' in name: @@ -93,22 +122,33 @@ def test_output_shape_estimated(): if est._estimator_type == 'classifier': if not hasattr(est, 'predict_proba'): continue - est.fit(breast_cancer.data, breast_cancer.target) + est.fit(X_c, y_c) pdp, axes = partial_dependence(est, target_variables=[1], - X=breast_cancer.data, + X=X_c, + method='estimated', + grid_resolution=10) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, method='estimated', grid_resolution=10) + assert (pdp.shape == (1, 100)) elif est._estimator_type == 'regressor': - est.fit(boston.data, boston.target) + est.fit(X_r, y_r) pdp, axes = partial_dependence(est, target_variables=[1], - X=boston.data, + X=X_r, method='estimated', grid_resolution=10) - else: - continue - assert(pdp.shape == (1, 10)) + assert (pdp.shape == (1, 10)) + pdp, axes = partial_dependence(est, + target_variables=[1, 2], + X=X_r, + method='estimated', + grid_resolution=10) + assert (pdp.shape == (1, 100)) def test_partial_dependence_classifier(): From 4eb1a8081a10438b464c8a1e259100b07faa4038 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 2 Sep 2017 12:37:24 +1000 Subject: [PATCH 20/23] move input validation on X --- sklearn/partial_dependence.py | 6 ++++-- sklearn/tests/test_partial_dependence.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 7a51c35793bfc..78bbb617af15d 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -223,6 +223,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, not hasattr(est, 'predict_proba')): raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') + if X is not None: + X = check_array(X, dtype=DTYPE, order='C') if method == 'recursion': 
check_is_fitted(est, 'estimators_', msg='Call %s.fit before ' 'partial_dependence' % @@ -243,7 +245,6 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, % (n_features - 1)) if X is not None: - X = check_array(X, dtype=DTYPE, order='C') grid, axes = _grid_from_X(X[:, target_variables], percentiles, grid_resolution) else: @@ -432,6 +433,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, not hasattr(est, 'predict_proba')): raise ValueError('est requires a predict_proba method for ' 'method="exact" or "estimated" for classification.') + if X is not None: + X = check_array(X, dtype=DTYPE, order='C') if method == 'recursion': check_is_fitted(est, 'estimators_', msg='Call %s.fit before ' 'partial_dependence' % @@ -453,7 +456,6 @@ def plot_partial_dependence(est, X, features, feature_names=None, # regression and binary classification label_idx = 0 - X = check_array(X, dtype=DTYPE, order='C') if hasattr(est, 'n_features_') and est.n_features_ != X.shape[1]: raise ValueError('X.shape[1] does not match est.n_features_') diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 46d12132a27ad..4c50efa67977e 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -41,7 +41,7 @@ def test_output_shape_recursion(): for name, Estimator in all_estimators(): est = Estimator() if not (isinstance(est, BaseGradientBoosting) or - isinstance(est, ForestRegressor)): + isinstance(est, ForestRegressor)): continue if est._estimator_type == 'classifier': est.fit(X_c, y_c) From 21544ce03cd709ff4294a4147a9f245d00857a84 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sat, 28 Oct 2017 13:56:52 +1100 Subject: [PATCH 21/23] fix output shape for multi-label classification --- sklearn/partial_dependence.py | 7 +++++-- sklearn/tests/test_partial_dependence.py | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 78bbb617af15d..7e843dd4469eb 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -1,4 +1,4 @@ -"""Partial dependence plots for regression and classification models. 
""" +"""Partial dependence plots for regression and classification models.""" # Authors: Peter Prettenhofer # Trevor Stephens @@ -297,7 +297,7 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if pdp.shape[0] == 2: # Binary classification pdp = pdp[1, :][np.newaxis] - elif len(pdp.shape) == 1: + elif pdp.ndim == 1: # Regression pdp = pdp[np.newaxis] elif method == 'estimated': @@ -309,6 +309,9 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, if est._estimator_type == 'classifier' and pdp.shape[1] == 2: # Binary classification pdp = pdp[:, 1][np.newaxis] + elif est._estimator_type == 'classifier' and pdp.shape[1] > 2: + # Multi-label classification + pdp = pdp.T if est._estimator_type == 'regressor': pdp = pdp[np.newaxis] else: diff --git a/sklearn/tests/test_partial_dependence.py b/sklearn/tests/test_partial_dependence.py index 4c50efa67977e..aaa2e539cf594 100644 --- a/sklearn/tests/test_partial_dependence.py +++ b/sklearn/tests/test_partial_dependence.py @@ -21,8 +21,6 @@ # toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] -T = [[-1, -1], [2, 2], [3, 2]] -true_result = [-1, 1, 1] # Make some sample data to test output shapes X_c, y_c = make_classification(n_features=10, n_informative=5, random_state=0) From 610b5c572970fd5ecffad783036d9ff299196828 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sun, 29 Oct 2017 08:47:10 +1100 Subject: [PATCH 22/23] update plot helper to support multi-output --- sklearn/partial_dependence.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 7e843dd4469eb..932c7d39d4a9e 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -321,7 +321,8 @@ def partial_dependence(est, target_variables, grid=None, X=None, output=None, def plot_partial_dependence(est, X, features, feature_names=None, - label=None, n_cols=3, grid_resolution=100, + label=None, output=None, + n_cols=3, grid_resolution=100, percentiles=(0.05, 0.95), method='auto', n_jobs=1, verbose=0, ax=None, line_kw=None, contour_kw=None, **fig_kw): @@ -353,6 +354,8 @@ def plot_partial_dependence(est, X, features, feature_names=None, label : object The class label for which the PDPs should be computed. Only if est is a multi-class model. Must be in ``est.classes_``. + output : int, optional (default=None) + The output index to use for multi-output estimators. n_cols : int The number of columns in the grid plot (default: 3). 
grid_resolution : int, default=100 @@ -448,13 +451,27 @@ def plot_partial_dependence(est, X, features, feature_names=None, else: n_features = X.shape[1] - # set label_idx for multi-class GBRT + # set label_idx for multi-class estimators if hasattr(est, 'classes_') and np.size(est.classes_) > 2: if label is None: raise ValueError('label is not given for multi-class PDP') - label_idx = np.searchsorted(est.classes_, label) - if est.classes_[label_idx] != label: - raise ValueError('label %s not in ``gbrt.classes_``' % str(label)) + if type(est.classes_) == list: + # multi-output classification + if output is None: + raise ValueError('output is required for multi-output ' + 'estimators') + if output > len(est.classes_): + raise ValueError('output %d exceeds number of outputs in est, ' + '%d' % (output, len(est.classes_))) + label_idx = np.searchsorted(est.classes_[output], label) + if est.classes_[output][label_idx] != label: + raise ValueError('label %s not in ``est.classes_``' % + str(label)) + else: + label_idx = np.searchsorted(est.classes_, label) + if est.classes_[label_idx] != label: + raise ValueError('label %s not in ``est.classes_``' % + str(label)) else: # regression and binary classification label_idx = 0 From dcbd0c6c9c86f754c3689e83a137c2252bc86646 Mon Sep 17 00:00:00 2001 From: trevorstephens Date: Sun, 29 Oct 2017 09:45:45 +1100 Subject: [PATCH 23/23] update plot helper to pass-through output --- sklearn/partial_dependence.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/partial_dependence.py b/sklearn/partial_dependence.py index 932c7d39d4a9e..436bc6f2b1134 100644 --- a/sklearn/partial_dependence.py +++ b/sklearn/partial_dependence.py @@ -531,7 +531,8 @@ def convert_feature(fx): # compute PD functions pd_result = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(partial_dependence)(est, fxs, X=X, method=method, + delayed(partial_dependence)(est, fxs, X=X, output=output, + method=method, grid_resolution=grid_resolution, percentiles=percentiles) for fxs in features)
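
Taken together, the series opens both entry points to arbitrary fitted estimators, with ``output`` now threaded through the plot helper as well. A usage sketch against the API as it stands at the end of the series (module path per the patches above; data are synthetic):

    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.linear_model import Ridge
    from sklearn.partial_dependence import (partial_dependence,
                                            plot_partial_dependence)

    X_demo, y_demo = make_regression(n_features=5, random_state=0)

    # Tree ensembles keep the fast path: 'auto' resolves to 'recursion'.
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    forest.fit(X_demo, y_demo)
    pdp, axes = partial_dependence(forest, [0], X=X_demo,
                                   grid_resolution=20)

    # Any other fitted estimator is handled by averaging predictions.
    ridge = Ridge().fit(X_demo, y_demo)
    pdp, axes = partial_dependence(ridge, [0], X=X_demo, method='exact',
                                   grid_resolution=20)
    assert pdp.shape == (1, 20)

    # One-way and two-way plots share the same entry point.
    fig, axs = plot_partial_dependence(ridge, X_demo, [0, 1, (0, 1)],
                                       grid_resolution=20)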