
ENH Adds Categorical Support to Histogram Gradient Boosting #16909


Closed
wants to merge 89 commits into from
Changes from all commits
89 commits
02d89d7
ENH Adds categorical support
thomasjpfan Apr 13, 2020
8472f60
DOC Improves english
thomasjpfan Apr 13, 2020
1198340
REV Less diffs
thomasjpfan Apr 13, 2020
63f56fd
DOC Adds comment
thomasjpfan Apr 13, 2020
f34087e
DOC Adds performance comment
thomasjpfan Apr 13, 2020
0b2ed9c
DOC Adds performance comment
thomasjpfan Apr 13, 2020
5eaf099
STY Fix
thomasjpfan Apr 13, 2020
43822ab
ENH Much faster bin mapping when transforming categories
thomasjpfan Apr 13, 2020
0d6012a
CLN Removes uneeded code
thomasjpfan Apr 13, 2020
b22151f
BUG Code actually is being used
thomasjpfan Apr 13, 2020
8432bac
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan Apr 29, 2020
ae9be56
CLN Address comments
thomasjpfan Apr 30, 2020
d0557a5
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan May 4, 2020
7692325
WIP Address more comments
thomasjpfan May 5, 2020
590d95f
CLN Address comments
thomasjpfan May 6, 2020
95e79f2
CLN Address comments
thomasjpfan May 6, 2020
e62479b
STY Linting
thomasjpfan May 6, 2020
9086fad
ENH Adds new method to binner
thomasjpfan May 7, 2020
3e323b2
CLN Binner refactor once again
thomasjpfan May 7, 2020
197fac0
CLN Address comments
thomasjpfan May 7, 2020
63af0d5
CLN More comments lol
thomasjpfan May 7, 2020
7ef6a8d
CLN Adds test for predict
thomasjpfan May 7, 2020
e6a03c6
ENH Adds categorical indicies support
thomasjpfan May 7, 2020
eabcfae
ENH Fix qsort
thomasjpfan May 7, 2020
2abe579
CLN Move missing_go_left code into grower
thomasjpfan May 7, 2020
9a5a3f4
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan May 8, 2020
ebb68e5
BUG Fix
thomasjpfan May 8, 2020
470c146
DOC More comments
thomasjpfan May 8, 2020
0fc4c24
DOC Update failing example
thomasjpfan May 8, 2020
cebd6c0
BUG Fixes
thomasjpfan May 8, 2020
d1478ba
DOC Update comment
thomasjpfan May 8, 2020
ba00644
BUG Fix
thomasjpfan May 8, 2020
1806c2b
BUG Fix
thomasjpfan May 8, 2020
95919e3
DOC Fix
thomasjpfan May 8, 2020
38966d5
WIP Try 32 bit
thomasjpfan May 8, 2020
c4869ba
WIP Fix bug
thomasjpfan May 8, 2020
f63ad6a
WIP Fix bug
thomasjpfan May 8, 2020
26d0796
WIP Fix bug
thomasjpfan May 8, 2020
17afb0f
WIP Fix bug
thomasjpfan May 8, 2020
5246cc1
REV Revert
thomasjpfan May 8, 2020
60523a3
DOC Fix
thomasjpfan May 8, 2020
b014d6e
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan May 25, 2020
96d0687
CLN Address comments
thomasjpfan May 28, 2020
dc0a3a4
WIP Updates binning
thomasjpfan May 28, 2020
e10b346
WIP Address more comments
thomasjpfan May 28, 2020
af58498
DOC Fix
thomasjpfan May 29, 2020
3c2f672
WIP moving to a predictor for bitset
thomasjpfan May 30, 2020
c8f31f9
ENH Fix binning tests
thomasjpfan May 30, 2020
fe16b42
ENH Removes binning in predict
thomasjpfan May 30, 2020
cf5bb6d
WIP Do not look still iterating lol
thomasjpfan May 31, 2020
3d9e449
WIP Do not look still iterating lol
thomasjpfan May 31, 2020
3615dc2
WIP Do not look still iterating lol
thomasjpfan May 31, 2020
6608715
WIP
thomasjpfan Jun 1, 2020
2c384e6
WIP Adds unknown category encoding
thomasjpfan Jun 1, 2020
8c6e985
WIP Removes binning during predict
thomasjpfan Jun 1, 2020
2357ae9
STY Update
thomasjpfan Jun 1, 2020
52048af
CLN Clean up commets
thomasjpfan Jun 1, 2020
f70416e
DOC Improve doc
thomasjpfan Jun 1, 2020
a4159cf
BUG Fix test
thomasjpfan Jun 1, 2020
a398786
ENH Only go in one direction when finding best split
thomasjpfan Jun 1, 2020
8ea46cc
ENH Do not include bitset if the split is not categorical
thomasjpfan Jun 1, 2020
3dcbd31
Fix
thomasjpfan Jun 2, 2020
c3b5eef
WIP Test seg fault
thomasjpfan Jun 2, 2020
280784a
BUG Fix
thomasjpfan Jun 2, 2020
019de8a
DOC Update doc
thomasjpfan Jun 5, 2020
24d0711
DOC Address comments
thomasjpfan Jun 5, 2020
2d0e79d
ENH Enables openmp
thomasjpfan Jun 5, 2020
966379c
BUG Fix
thomasjpfan Jun 5, 2020
1c920f7
Some comments
NicolasHug Jul 18, 2020
3966432
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ca…
NicolasHug Jul 20, 2020
6c1af62
pep8
NicolasHug Jul 20, 2020
f535c33
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan Jul 31, 2020
2afca55
CLN Apply suggestions
thomasjpfan Aug 1, 2020
9b44d82
CLN More comments
thomasjpfan Aug 8, 2020
9f3fa46
categorical => categorical_features
ogrisel Aug 10, 2020
1054754
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ca…
NicolasHug Aug 17, 2020
bbae955
Added grower test for OHE equivalent
NicolasHug Aug 17, 2020
c003b76
ENH Change sorting ordre to match lightgbm
thomasjpfan Aug 24, 2020
40c3f9b
CLN Less than equal
thomasjpfan Aug 24, 2020
bb0e899
CLN Adds splitting in both directions
thomasjpfan Aug 24, 2020
f47da15
Merge remote-tracking branch 'upstream/master' into cat_hgbt_rb
thomasjpfan Aug 24, 2020
bb5877d
CLN Fixes merge conflicts
thomasjpfan Aug 25, 2020
c3061b5
ENH Uses mask instead of pandas features in benchmark
thomasjpfan Aug 25, 2020
8762e88
DOC Remove reference to pandas in user guide
thomasjpfan Aug 25, 2020
6d7ec60
DOC Benchmark update
thomasjpfan Aug 25, 2020
69f3f9a
ENH Update benchmark parameters
thomasjpfan Aug 25, 2020
f9f837c
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ca…
NicolasHug Aug 27, 2020
b913ff1
Remove pandas support for categorical features
NicolasHug Aug 27, 2020
730d69f
Merge branch 'master' of github.com:scikit-learn/scikit-learn into ca…
NicolasHug Sep 4, 2020
88 changes: 88 additions & 0 deletions benchmarks/bench_hist_gradient_boosting_adult.py
@@ -0,0 +1,88 @@
import argparse
from time import time

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import (
    get_equivalent_estimator)


parser = argparse.ArgumentParser()
parser.add_argument('--n-leaf-nodes', type=int, default=31)
parser.add_argument('--n-trees', type=int, default=40)
parser.add_argument('--lightgbm', action="store_true", default=False)
parser.add_argument('--learning-rate', type=float, default=1.)
parser.add_argument('--max-bins', type=int, default=255)
parser.add_argument('--no-predict', action="store_true", default=False)
args = parser.parse_args()

n_leaf_nodes = args.n_leaf_nodes
n_trees = args.n_trees
lr = args.learning_rate
max_bins = args.max_bins


def fit(est, data_train, target_train, libname, **fit_params):
    print(f"Fitting a {libname} model...")
    tic = time()
    est.fit(data_train, target_train, **fit_params)
    toc = time()
    print(f"fitted in {toc - tic:.3f}s")


def predict(est, data_test, target_test):
    if args.no_predict:
        return
    tic = time()
    predicted_test = est.predict(data_test)
    predicted_proba_test = est.predict_proba(data_test)
    toc = time()
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
    print(f"predicted in {toc - tic:.3f}s, "
          f"ROC AUC: {roc_auc:.4f}, ACC: {acc:.4f}")


data, target = fetch_openml(data_id=179, as_frame=True, return_X_y=True)

# encoding a categorical target is not supported yet, so use the codes
target = target.cat.codes

n_features = data.shape[1]
is_categorical = data.dtypes == 'category'
n_categorical_features = is_categorical.sum()
n_numerical_features = (data.dtypes == 'float').sum()
print(f"Number of features: {data.shape[1]}")
print(f"Number of categorical features: {n_categorical_features}")
print(f"Number of numerical features: {n_numerical_features}")

categorical_features = np.flatnonzero(is_categorical)
for i in categorical_features:
    data.iloc[:, i] = data.iloc[:, i].cat.codes

data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=.2, random_state=0)

est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     categorical_features=categorical_features,
                                     max_leaf_nodes=n_leaf_nodes,
                                     early_stopping=False,
                                     random_state=0,
                                     verbose=1)

fit(est, data_train, target_train, 'sklearn')
predict(est, data_test, target_test)

# lightgbm infers the categories from the dtype
if args.lightgbm:
    est = get_equivalent_estimator(est, lib='lightgbm')
    fit(est, data_train, target_train, 'lightgbm',
        categorical_feature=is_categorical[is_categorical].index.tolist())
    predict(est, data_test, target_test)
39 changes: 39 additions & 0 deletions doc/modules/ensemble.rst
@@ -1051,6 +1051,43 @@ multiplying the gradients (and the hessians) by the sample weights. Note that
the binning stage (specifically the quantiles computation) does not take the
weights into account.

.. _categorical_support_gbdt:

Categorical Features Support
----------------------------

For datasets with categorical data, :class:`HistGradientBoostingClassifier`
and :class:`HistGradientBoostingRegressor` have native support for splitting
on categorical features. This is often better than one hot encoding because
it leads to faster training times and to shallower trees. The canonical way
of handling categorical splits is to consider all of the :math:`2^{K - 1} - 1`
partitions, where `K` is the number of categories. This can quickly become
prohibitive when `K` is large. Fortunately, since gradient boosting trees are
always regression trees (even for classification problems), there exists a
faster strategy that can yield equivalent splits. First, the categories of a
feature are sorted according to the ratio `sum_gradient_k / sum_hessians_k`
of each category `k`. Once the categories are sorted, one can consider
*continuous partitions*, i.e. treat the categories as if they were ordered
continuous values (see Fisher [Fisher1958]_ for a formal proof). As a result,
only `K - 1` splits need to be considered instead of :math:`2^{K - 1} - 1`.
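
As a rough illustration of this trick (an editor's sketch in plain NumPy, not
scikit-learn's actual implementation; the helper name and the regularization
constant `lam` are hypothetical), sorting the categories by their
gradient/hessian ratio reduces the search to a single scan over contiguous
prefixes::

    import numpy as np

    def best_categorical_split(sum_gradients, sum_hessians, lam=1.0):
        # sum_gradients[k], sum_hessians[k]: per-category sums for one feature
        K = len(sum_gradients)
        order = np.argsort(sum_gradients / sum_hessians)  # Fisher ordering
        g, h = sum_gradients[order], sum_hessians[order]
        g_total, h_total = g.sum(), h.sum()

        def gain(gl, hl):
            # split gain, up to constant factors
            gr, hr = g_total - gl, h_total - hl
            return (gl ** 2 / (hl + lam) + gr ** 2 / (hr + lam)
                    - g_total ** 2 / (h_total + lam))

        best_gain, best_left = -np.inf, None
        gl = hl = 0.0
        for i in range(K - 1):  # only K - 1 candidate splits, not 2**(K-1) - 1
            gl += g[i]
            hl += h[i]
            if gain(gl, hl) > best_gain:
                best_gain = gain(gl, hl)
                best_left = set(order[:i + 1].tolist())
        return best_gain, best_left  # categories sent to the left child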

If there are missing values during training, the missing values are treated
as a single category. Categories that were unknown at fit time are treated
as missing during prediction. If the cardinality of a categorical feature is
greater than `max_bins`, the `max_bins` most frequent categories are kept
and the remaining, less frequent categories are treated as missing.

To enable categorical support, a boolean mask can be passed to the
`categorical_features` parameter. In the following, the first feature will be
treated as categorical and the second feature as nummerical::
Member:

Suggested change:
-   treated as categorical and the second feature as nummerical::
+   treated as categorical and the second feature as numerical::


>>> gbdt = HistGradientBoostingClassifier(categorical_features=[True, False])
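
A slightly fuller usage sketch (an editorial addition; the toy data are
assumptions, and the API requires a scikit-learn version that includes this
PR)::

    import numpy as np
    from sklearn.experimental import enable_hist_gradient_boosting  # noqa
    from sklearn.ensemble import HistGradientBoostingClassifier

    rng = np.random.RandomState(0)
    # first column: categorical codes in {0, 1, 2}; second column: numerical
    X = np.column_stack([rng.randint(0, 3, size=100), rng.randn(100)])
    y = (X[:, 0] == 1).astype(int)  # target driven by the categorical feature

    gbdt = HistGradientBoostingClassifier(categorical_features=[True, False])
    gbdt.fit(X, y)
    print(gbdt.predict(X[:5]))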

.. topic:: Examples:

* :ref:`sphx_glr_auto_examples_ensemble_plot_gradient_boosting_categorical.py`

.. _monotonic_cst_gbdt:

Monotonic Constraints
@@ -1158,6 +1195,8 @@ Finally, many parts of the implementation of
.. [LightGBM] Ke et al. `"LightGBM: A Highly Efficient Gradient
   Boosting Decision Tree" <https://papers.nips.cc/paper/
   6907-lightgbm-a-highly-efficient-gradient-boosting-decision-tree>`_
.. [Fisher1958] Walter D. Fisher. `"On Grouping for Maximum Homogeneity"
   <http://www.csiss.org/SPACE/workshops/2004/SAC/files/fisher.pdf>`_

.. _voting_classifier:

91 changes: 91 additions & 0 deletions examples/ensemble/plot_gradient_boosting_categorical.py
@@ -0,0 +1,91 @@
"""
========================================
Categorical Support in Gradient Boosting
========================================

.. currentmodule:: sklearn

In this example, we will compare the performance of
:class:`~ensemble.HistGradientBoostingRegressor` with one hot encoding
and with native categorical support.

We will work with the Ames Iowa Housing dataset, which consists of numerical
and categorical features, where the houses' sale prices are the target.
Comment on lines +12 to +13

Contributor:

Are the categorical features useful for this classification task? It may be
worth it to add another example where the categorical features are dropped;
training should be faster but predictive performance should be worse.
Dropping categorical features is another way to deal with them (in a dummy
way).

Member Author:

For this dataset, the categories do not matter as much. So I will be on the
lookout for a nicer dataset.

"""
##############################################################################
# Load Ames Housing dataset
# -------------------------
# First, we load the Ames Housing data as a pandas dataframe. The features
# are either categorical or numerical:
print(__doc__)

from sklearn.datasets import fetch_openml

X, y = fetch_openml(data_id=41211, as_frame=True, return_X_y=True)

n_features = X.shape[1]
n_categorical_features = (X.dtypes == 'category').sum()
n_numerical_features = (X.dtypes == 'float').sum()
print(f"Number of features: {X.shape[1]}")
print(f"Number of categorical featuers: {n_categorical_features}")
print(f"Number of numerical featuers: {n_numerical_features}")

##############################################################################
# Create gradient boosting estimator with one hot encoding
# --------------------------------------------------------
# Next, we create a pipeline that will one hot encode the categorical features
# and let the rest of the numerical data pass through:

from sklearn.experimental import enable_hist_gradient_boosting # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder

preprocessor = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'),
     make_column_selector(dtype_include='category')),
    remainder='passthrough')

hist_one_hot = make_pipeline(preprocessor,
                             HistGradientBoostingRegressor(random_state=42))

##############################################################################
# Create gradient boosting estimator with native categorical support
# ------------------------------------------------------------------
# The :class:`~ensemble.HistGradientBoostingRegressor` has native support
# for categorical features using the `categorical_features` parameter:

hist_native = HistGradientBoostingRegressor(categorical_features='pandas',
                                            random_state=42)

##############################################################################
# Train the models with cross-validation
# --------------------------------------
# Finally, we train the models using cross-validation. Here we compare the
# models' performance in terms of :func:`~metrics.r2_score` and fit times. We
# show that fit times are faster with native categorical support and that the
# test scores and score times are comparable:

from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
import numpy as np

one_hot_result = cross_validate(hist_one_hot, X, y)
native_result = cross_validate(hist_native, X, y)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

plot_info = [('fit_time', 'Fit times (s)', ax1),
             ('test_score', 'Test Scores (r2 score)', ax2)]

x, width = np.arange(2), 0.9
for key, title, ax in plot_info:
    items = [native_result[key], one_hot_result[key]]
    ax.bar(x, [np.mean(item) for item in items],
           width, yerr=[np.std(item) for item in items],
           color=['b', 'r'])
    ax.set(xlabel='Split number', title=title, xticks=[0, 1],
           xticklabels=['Native', "One Hot"])
plt.show()
30 changes: 26 additions & 4 deletions sklearn/ensemble/_hist_gradient_boosting/_binning.pyx
@@ -15,15 +15,19 @@ from cython.parallel import prange
from libc.math cimport isnan

from .common cimport X_DTYPE_C, X_BINNED_DTYPE_C
from ._cat_mapper cimport CategoryMapper

np.import_array()


def _map_to_bins(const X_DTYPE_C [:, :] data,
                 list binning_thresholds,
                 const unsigned char missing_values_bin_idx,
                 CategoryMapper category_mapper,
                 const unsigned char[::1] is_categorical,
                 X_BINNED_DTYPE_C [::1, :] binned):
"""Bin numerical values to discrete integer-coded levels.
TODO docstring needs update

Parameters
----------
@@ -32,17 +36,24 @@ def _map_to_bins(const X_DTYPE_C [:, :] data,
    binning_thresholds : list of arrays
        For each feature, stores the increasing numeric values that are
        used to separate the bins.
    is_categorical : ndarray, shape (n_features,)
        Indicates categorical features.
    binned : ndarray, shape (n_samples, n_features)
        Output array, must be fortran aligned.
    """
    cdef:
        int feature_idx

    for feature_idx in range(data.shape[1]):
        if is_categorical[feature_idx]:
            _map_cat_col_to_bins(data[:, feature_idx], feature_idx,
                                 category_mapper, missing_values_bin_idx,
                                 binned[:, feature_idx])
        else:
            _map_num_col_to_bins(data[:, feature_idx],
                                 binning_thresholds[feature_idx],
                                 missing_values_bin_idx,
                                 binned[:, feature_idx])


cdef void _map_num_col_to_bins(const X_DTYPE_C [:] data,
@@ -71,3 +82,14 @@
                else:
                    left = middle + 1
            binned[i] = left


cdef void _map_cat_col_to_bins(const X_DTYPE_C [:] data,
                               int feature_idx,
                               CategoryMapper category_mapper,
                               const unsigned char missing_values_bin_idx,
Member:
this parameter seems to be unused

                               X_BINNED_DTYPE_C [:] binned):
    """Map from raw categories to bins."""
    cdef int i
    for i in prange(data.shape[0], schedule='static', nogil=True):
        binned[i] = category_mapper.map_to_bin(feature_idx, data[i])
9 changes: 9 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/_bitset.pxd
@@ -0,0 +1,9 @@
# cython: language_level=3
from .common cimport X_BINNED_DTYPE_C
from .common cimport BITSET_DTYPE_C

cdef void init_bitset(BITSET_DTYPE_C bitset) nogil

cdef void set_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) nogil

cdef unsigned char in_bitset(BITSET_DTYPE_C bitset, X_BINNED_DTYPE_C val) nogil
38 changes: 38 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/_bitset.pyx
@@ -0,0 +1,38 @@
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
# cython: language_level=3
from .common cimport BITSET_INNER_DTYPE_C

cdef inline void init_bitset(BITSET_DTYPE_C bitset) nogil:  # OUT
    cdef:
        unsigned int i

    for i in range(8):
        bitset[i] = 0

cdef inline void set_bitset(BITSET_DTYPE_C bitset,  # OUT
                            X_BINNED_DTYPE_C val) nogil:
    cdef:
        unsigned int i1 = val // 32
        unsigned int i2 = val % 32

    # It is assumed that val < 256 so that i1 < 8
    bitset[i1] |= (1 << i2)

cdef inline unsigned char in_bitset(BITSET_DTYPE_C bitset,
                                    X_BINNED_DTYPE_C val) nogil:
    cdef:
        unsigned int i1 = val // 32
        unsigned int i2 = val % 32

    return (bitset[i1] >> i2) & 1


def set_bitset_py(BITSET_INNER_DTYPE_C[:] bitset, X_BINNED_DTYPE_C val):
    cdef:
        unsigned int i1 = val // 32
        unsigned int i2 = val % 32

    # It is assumed that val < 256 so that i1 < 8
    bitset[i1] |= (1 << i2)
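
The bitset layout is easy to mimic in plain Python (an editor's sketch, not
part of the PR): eight 32-bit words cover the 256 possible bin values, with
word index val // 32 and bit offset val % 32.

    import numpy as np

    def set_bit(bitset, val):
        # bitset: eight uint32 words covering bin values 0..255
        bitset[val // 32] |= np.uint32(1) << np.uint32(val % 32)

    def in_bit(bitset, val):
        return int((bitset[val // 32] >> np.uint32(val % 32)) & np.uint32(1))

    bitset = np.zeros(8, dtype=np.uint32)
    set_bit(bitset, 200)
    assert in_bit(bitset, 200) == 1
    assert in_bit(bitset, 3) == 0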
19 changes: 19 additions & 0 deletions sklearn/ensemble/_hist_gradient_boosting/_cat_mapper.pxd
@@ -0,0 +1,19 @@
# cython: cdivision=True
# cython: boundscheck=False
# cython: wraparound=False
# cython: language_level=3
# cython: nonecheck=False
# distutils: language=c++

from libcpp.map cimport map
from libcpp.vector cimport vector
from .common cimport X_DTYPE_C
from .common cimport X_BINNED_DTYPE_C

cdef class CategoryMapper:
Member Author:
This object stores the mapping from raw category to bin

    cdef:
        map[int, map[int, X_BINNED_DTYPE_C]] raw_category_to_bin
        X_BINNED_DTYPE_C missing_values_bin_idx

    cdef X_BINNED_DTYPE_C map_to_bin(self, int feature_idx,
                                     X_DTYPE_C raw_category) nogil
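
Conceptually, the lookup behaves like the following plain-Python sketch (an
editorial illustration; the insert() helper is hypothetical and this is not
the actual Cython implementation):

    class CategoryMapperSketch:
        """Nested-map lookup from (feature_idx, raw_category) to bin."""

        def __init__(self, missing_values_bin_idx):
            self.missing_values_bin_idx = missing_values_bin_idx
            self.raw_category_to_bin = {}  # feature_idx -> {category -> bin}

        def insert(self, feature_idx, raw_category, bin_idx):
            self.raw_category_to_bin.setdefault(
                feature_idx, {})[raw_category] = bin_idx

        def map_to_bin(self, feature_idx, raw_category):
            # categories unseen at fit time fall into the missing-values bin
            return self.raw_category_to_bin.get(feature_idx, {}).get(
                raw_category, self.missing_values_bin_idx)

    mapper = CategoryMapperSketch(missing_values_bin_idx=255)
    mapper.insert(feature_idx=0, raw_category=3.0, bin_idx=1)
    assert mapper.map_to_bin(0, 3.0) == 1
    assert mapper.map_to_bin(0, 42.0) == 255  # unknown -> missing bin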