examples/preprocessing/plot_discretization_classification.py: 61 changes (40 additions & 21 deletions)
@@ -61,33 +61,53 @@ def get_name(estimator):


 # list of (estimator, param_grid), where param_grid is used in GridSearchCV
+# The parameter spaces in this example are limited to a narrow band to reduce
+# its runtime. In a real use case, a broader search space for the algorithms
+# should be used.
 classifiers = [
-    (LogisticRegression(random_state=0), {"C": np.logspace(-2, 7, 10)}),
-    (LinearSVC(random_state=0), {"C": np.logspace(-2, 7, 10)}),
+    (
+        make_pipeline(StandardScaler(), LogisticRegression(random_state=0)),
+        {"logisticregression__C": np.logspace(-1, 1, 3)},
+    ),
+    (
+        make_pipeline(StandardScaler(), LinearSVC(random_state=0)),
+        {"linearsvc__C": np.logspace(-1, 1, 3)},
+    ),
     (
         make_pipeline(
-            KBinsDiscretizer(encode="onehot"), LogisticRegression(random_state=0)
+            StandardScaler(),
+            KBinsDiscretizer(encode="onehot"),
+            LogisticRegression(random_state=0),
         ),
         {
-            "kbinsdiscretizer__n_bins": np.arange(2, 10),
-            "logisticregression__C": np.logspace(-2, 7, 10),
+            "kbinsdiscretizer__n_bins": np.arange(5, 8),
+            "logisticregression__C": np.logspace(-1, 1, 3),
         },
     ),
     (
-        make_pipeline(KBinsDiscretizer(encode="onehot"), LinearSVC(random_state=0)),
+        make_pipeline(
+            StandardScaler(),
+            KBinsDiscretizer(encode="onehot"),
+            LinearSVC(random_state=0),
+        ),
         {
-            "kbinsdiscretizer__n_bins": np.arange(2, 10),
-            "linearsvc__C": np.logspace(-2, 7, 10),
+            "kbinsdiscretizer__n_bins": np.arange(5, 8),
+            "linearsvc__C": np.logspace(-1, 1, 3),
         },
     ),
     (
-        GradientBoostingClassifier(n_estimators=50, random_state=0),
-        {"learning_rate": np.logspace(-4, 0, 10)},
+        make_pipeline(
+            StandardScaler(), GradientBoostingClassifier(n_estimators=5, random_state=0)
+        ),
+        {"gradientboostingclassifier__learning_rate": np.logspace(-2, 0, 5)},
     ),
-    (SVC(random_state=0), {"C": np.logspace(-2, 7, 10)}),
+    (
+        make_pipeline(StandardScaler(), SVC(random_state=0)),
+        {"svc__C": np.logspace(-1, 1, 3)},
+    ),
 ]
 
-names = [get_name(e) for e, g in classifiers]
+names = [get_name(e).replace("StandardScaler + ", "") for e, _ in classifiers]

n_samples = 100
datasets = [
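Each `(estimator, param_grid)` pair above is consumed by a grid search in an unchanged part of the file that this hunk does not show. A minimal sketch of that role, assuming the `GridSearchCV` import and the `X_train`/`y_train` split that appear later in this diff:

```python
from sklearn.model_selection import GridSearchCV

# Illustration only: each estimator is tuned over its own parameter grid.
for estimator, param_grid in classifiers:
    clf = GridSearchCV(estimator=estimator, param_grid=param_grid)
    clf.fit(X_train, y_train)  # X_train/y_train come from the split shown below
```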
@@ -107,15 +127,14 @@ def get_name(estimator):
     nrows=len(datasets), ncols=len(classifiers) + 1, figsize=(21, 9)
 )
 
-cm = plt.cm.PiYG
+cm_piyg = plt.cm.PiYG
 cm_bright = ListedColormap(["#b30065", "#178000"])
 
 # iterate over datasets
 for ds_cnt, (X, y) in enumerate(datasets):
-    print("\ndataset %d\n---------" % ds_cnt)
+    print(f"\ndataset {ds_cnt}\n---------")
 
-    # preprocess dataset, split into training and test part
-    X = StandardScaler().fit_transform(X)
+    # split into training and test part
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.5, random_state=42
     )
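The removed `StandardScaler().fit_transform(X)` scaled the whole dataset before splitting, so the test points influenced the scaler's statistics. With the scaler inside each pipeline, scaling is fitted on the training split only. A minimal sketch of the pattern, reusing this example's imports:

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# Mean and standard deviation are now learned from X_train alone, so no
# information about X_test leaks into the preprocessing step.
pipe = make_pipeline(StandardScaler(), LinearSVC(random_state=0))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
```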
@@ -148,18 +167,18 @@ def get_name(estimator):
         with ignore_warnings(category=ConvergenceWarning):
             clf.fit(X_train, y_train)
         score = clf.score(X_test, y_test)
-        print("%s: %.2f" % (name, score))
+        print(f"{name}: {score:.2f}")
 
         # plot the decision boundary. For that, we will assign a color to each
         # point in the mesh [x_min, x_max]*[y_min, y_max].
         if hasattr(clf, "decision_function"):
-            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+            Z = clf.decision_function(np.column_stack([xx.ravel(), yy.ravel()]))
         else:
-            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
+            Z = clf.predict_proba(np.column_stack([xx.ravel(), yy.ravel()]))[:, 1]
 
         # put the result into a color plot
         Z = Z.reshape(xx.shape)
-        ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8)
+        ax.contourf(xx, yy, Z, cmap=cm_piyg, alpha=0.8)
 
         # plot the training points
         ax.scatter(
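`np.column_stack` is a functional equivalent of the `np.c_` indexing trick for the 1-D raveled grids used here; a quick check of that equivalence:

```python
import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([4.0, 5.0, 6.0])
# Both produce a (3, 2) array whose columns are a and b.
assert np.array_equal(np.c_[a, b], np.column_stack([a, b]))
```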
@@ -184,7 +203,7 @@ def get_name(estimator):
         ax.text(
             0.95,
             0.06,
-            ("%.2f" % score).lstrip("0"),
+            (f"{score:.2f}").lstrip("0"),
             size=15,
             bbox=dict(boxstyle="round", alpha=0.8, facecolor="white"),
             transform=ax.transAxes,
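The `.lstrip("0")` keeps each panel's score label compact: the f-string formats the score to two decimals and the leading zero is then stripped, e.g.:

```python
score = 0.87
print(f"{score:.2f}".lstrip("0"))  # prints ".87"
```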