[MRG+2] discrete branch: add an example for KBinsDiscretizer #10192

Merged
merged 10 commits into from Nov 27, 2017
87 changes: 87 additions & 0 deletions examples/preprocessing/plot_discretization.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-

"""
================================================================
Using KBinsDiscretizer to discretize continuous features
================================================================

This example compares the prediction results of linear regression (a linear
model) and a decision tree (a tree-based model) with and without
discretization of real-valued features.

As shown in the result before discretization, a linear model is fast to
build and relatively straightforward to interpret, but it can only model
linear relationships, while a decision tree can build a much more complex
model of the data. One way to make a linear model more powerful on continuous
data is to use discretization (also known as binning). In this example, we
discretize the feature and one-hot encode the transformed data. Note that if
the bins are not reasonably wide, the risk of overfitting increases
substantially, so the discretizer parameters should usually be tuned under
cross validation (a minimal tuning sketch is appended at the end of this
script).

After discretization, linear regression and the decision tree make exactly
the same prediction. As the features are constant within each bin, any model
must predict the same value for all points within a bin. Compared with the
result before discretization, the linear model becomes much more flexible
while the decision tree becomes much less flexible. Note that binning
features generally has no beneficial effect for tree-based models, as these
models can learn to split up the data anywhere.
"""

# Author: Andreas Müller
# Hanmin Qin <qinhanmin2005@sina.com>
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.tree import DecisionTreeRegressor

print(__doc__)

# construct the dataset
rnd = np.random.RandomState(42)
X = rnd.uniform(-3, 3, size=100)
y = np.sin(X) + rnd.normal(size=len(X)) / 3
X = X.reshape(-1, 1)

# transform the dataset with KBinsDiscretizer
enc = KBinsDiscretizer(n_bins=10, encode='onehot')
X_binned = enc.fit_transform(X)
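
# A minimal sanity check (added sketch): with encode='onehot' the transformed
# data is expected to have one indicator column per bin, i.e. shape
# (n_samples, n_bins) = (100, 10), with a single active entry per row.
print("binned data shape:", X_binned.shape)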

# predict with original dataset
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(10, 4))
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
reg = LinearRegression().fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='green',
         label="linear regression")
reg = DecisionTreeRegressor(min_samples_split=3, random_state=0).fit(X, y)
ax1.plot(line, reg.predict(line), linewidth=2, color='red',
         label="decision tree")
ax1.plot(X[:, 0], y, 'o', c='k')
ax1.legend(loc="best")
ax1.set_ylabel("Regression output")
ax1.set_xlabel("Input feature")
ax1.set_title("Result before discretization")

# predict with transformed dataset
line_binned = enc.transform(line)
reg = LinearRegression().fit(X_binned, y)
ax2.plot(line, reg.predict(line_binned), linewidth=2, color='green',
         linestyle='-', label='linear regression')
reg = DecisionTreeRegressor(min_samples_split=3,
                            random_state=0).fit(X_binned, y)
ax2.plot(line, reg.predict(line_binned), linewidth=2, color='red',
         linestyle=':', label='decision tree')
ax2.plot(X[:, 0], y, 'o', c='k')
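# draw vertical lines at the bin boundaries, reconstructed here from the
# discretizer's offset_ and bin_width_ attributes (uniform-width bins)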
bins = enc.offset_[0] + enc.bin_width_[0] * np.arange(1, enc.n_bins_[0])
ax2.vlines(bins, *plt.gca().get_ylim(), linewidth=1, alpha=.2)
ax2.legend(loc="best")
ax2.set_xlabel("Input feature")
ax2.set_title("Result after discretization")

plt.tight_layout()
plt.show()
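
# A minimal tuning sketch: the docstring suggests tuning the discretizer
# parameters under cross validation. One possible approach, assuming a
# pipeline searched with GridSearchCV, is shown below; the n_bins grid is
# illustrative only.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(KBinsDiscretizer(encode='onehot'), LinearRegression())
search = GridSearchCV(pipeline,
                      param_grid={'kbinsdiscretizer__n_bins': [3, 5, 10, 20]},
                      cv=5)
search.fit(X, y)
print("best n_bins:", search.best_params_['kbinsdiscretizer__n_bins'])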