From cccb39038e102c27781eb8a6ee8a32bf7be9e327 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Sun, 14 Nov 2021 11:19:53 +0100 Subject: [PATCH 1/5] Speed up plot_discretization_classification.py --- .../plot_discretization_classification.py | 58 ++++++++++++------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/examples/preprocessing/plot_discretization_classification.py b/examples/preprocessing/plot_discretization_classification.py index 59cdbdb766a12..f1a0e89d1f308 100644 --- a/examples/preprocessing/plot_discretization_classification.py +++ b/examples/preprocessing/plot_discretization_classification.py @@ -62,32 +62,49 @@ def get_name(estimator): # list of (estimator, param_grid), where param_grid is used in GridSearchCV classifiers = [ - (LogisticRegression(random_state=0), {"C": np.logspace(-2, 7, 10)}), - (LinearSVC(random_state=0), {"C": np.logspace(-2, 7, 10)}), + ( + make_pipeline(StandardScaler(), LogisticRegression(random_state=0)), + {"logisticregression__C": np.logspace(-1, 1, 3)}, + ), + ( + make_pipeline(StandardScaler(), LinearSVC(random_state=0)), + {"linearsvc__C": np.logspace(-1, 1, 3)}, + ), ( make_pipeline( - KBinsDiscretizer(encode="onehot"), LogisticRegression(random_state=0) + StandardScaler(), + KBinsDiscretizer(encode="onehot"), + LogisticRegression(random_state=0), ), { - "kbinsdiscretizer__n_bins": np.arange(2, 10), - "logisticregression__C": np.logspace(-2, 7, 10), + "kbinsdiscretizer__n_bins": np.arange(5, 8), + "logisticregression__C": np.logspace(-1, 1, 3), }, ), ( - make_pipeline(KBinsDiscretizer(encode="onehot"), LinearSVC(random_state=0)), + make_pipeline( + StandardScaler(), + KBinsDiscretizer(encode="onehot"), + LinearSVC(random_state=0), + ), { - "kbinsdiscretizer__n_bins": np.arange(2, 10), - "linearsvc__C": np.logspace(-2, 7, 10), + "kbinsdiscretizer__n_bins": np.arange(5, 8), + "linearsvc__C": np.logspace(-1, 1, 3), }, ), ( - GradientBoostingClassifier(n_estimators=50, random_state=0), - {"learning_rate": np.logspace(-4, 0, 10)}, + make_pipeline( + StandardScaler(), GradientBoostingClassifier(n_estimators=5, random_state=0) + ), + {"gradientboostingclassifier__learning_rate": np.logspace(-2, 0, 5)}, + ), + ( + make_pipeline(StandardScaler(), SVC(random_state=0)), + {"svc__C": np.logspace(-1, 1, 3)}, ), - (SVC(random_state=0), {"C": np.logspace(-2, 7, 10)}), ] -names = [get_name(e) for e, g in classifiers] +names = [get_name(e).replace("StandardScaler + ", "") for e, _ in classifiers] n_samples = 100 datasets = [ @@ -107,15 +124,14 @@ def get_name(estimator): nrows=len(datasets), ncols=len(classifiers) + 1, figsize=(21, 9) ) -cm = plt.cm.PiYG +cm_piyg = plt.cm.PiYG cm_bright = ListedColormap(["#b30065", "#178000"]) # iterate over datasets for ds_cnt, (X, y) in enumerate(datasets): - print("\ndataset %d\n---------" % ds_cnt) + print(f"\ndataset {ds_cnt}\n---------") - # preprocess dataset, split into training and test part - X = StandardScaler().fit_transform(X) + # split into training and test part X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.5, random_state=42 ) @@ -148,18 +164,18 @@ def get_name(estimator): with ignore_warnings(category=ConvergenceWarning): clf.fit(X_train, y_train) score = clf.score(X_test, y_test) - print("%s: %.2f" % (name, score)) + print(f"{name}: {score:.2f}") # plot the decision boundary. For that, we will assign a color to each # point in the mesh [x_min, x_max]*[y_min, y_max]. if hasattr(clf, "decision_function"): - Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) + Z = clf.decision_function(np.column_stack([xx.ravel(), yy.ravel()])) else: - Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] + Z = clf.predict_proba(np.column_stack([xx.ravel(), yy.ravel()]))[:, 1] # put the result into a color plot Z = Z.reshape(xx.shape) - ax.contourf(xx, yy, Z, cmap=cm, alpha=0.8) + ax.contourf(xx, yy, Z, cmap=cm_piyg, alpha=0.8) # plot the training points ax.scatter( @@ -184,7 +200,7 @@ def get_name(estimator): ax.text( 0.95, 0.06, - ("%.2f" % score).lstrip("0"), + (f"{score:.2f}").lstrip("0"), size=15, bbox=dict(boxstyle="round", alpha=0.8, facecolor="white"), transform=ax.transAxes, From 8323bf7e26666c59a1d3d4d24bc97b1e34cea69e Mon Sep 17 00:00:00 2001 From: Xiao Yuan Date: Mon, 15 Nov 2021 16:16:55 +0100 Subject: [PATCH 2/5] Add comment --- examples/preprocessing/plot_discretization_classification.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/preprocessing/plot_discretization_classification.py b/examples/preprocessing/plot_discretization_classification.py index f1a0e89d1f308..1ca16cfcbe3e7 100644 --- a/examples/preprocessing/plot_discretization_classification.py +++ b/examples/preprocessing/plot_discretization_classification.py @@ -61,6 +61,8 @@ def get_name(estimator): # list of (estimator, param_grid), where param_grid is used in GridSearchCV +# The parameter spaces in this example are limited to a narrow band in order to speed up. +# In a real use case, a broader search space for the algorithms should be used. classifiers = [ ( make_pipeline(StandardScaler(), LogisticRegression(random_state=0)), From aafb038eb8a64add8c24eb4bdb4850b8f5ae86a2 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Mon, 15 Nov 2021 16:27:52 +0100 Subject: [PATCH 3/5] Reduce length of comment --- examples/preprocessing/plot_discretization_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/preprocessing/plot_discretization_classification.py b/examples/preprocessing/plot_discretization_classification.py index 1ca16cfcbe3e7..09fc9712ff601 100644 --- a/examples/preprocessing/plot_discretization_classification.py +++ b/examples/preprocessing/plot_discretization_classification.py @@ -61,7 +61,7 @@ def get_name(estimator): # list of (estimator, param_grid), where param_grid is used in GridSearchCV -# The parameter spaces in this example are limited to a narrow band in order to speed up. +# The parameter spaces in this example are limited to a narrow band to speed up. # In a real use case, a broader search space for the algorithms should be used. classifiers = [ ( From 69f5ff47e255c2d4638de2c6c9b8a64502ba3b89 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Mon, 15 Nov 2021 17:35:35 +0100 Subject: [PATCH 4/5] Improve comment --- .../preprocessing/plot_discretization_classification.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/preprocessing/plot_discretization_classification.py b/examples/preprocessing/plot_discretization_classification.py index 09fc9712ff601..8cfd6d597b930 100644 --- a/examples/preprocessing/plot_discretization_classification.py +++ b/examples/preprocessing/plot_discretization_classification.py @@ -60,9 +60,11 @@ def get_name(estimator): return name +# The parameter spaces in this example are limited to a narrow band to reduce +# its runtime. In a real use case, a broader search space for the algorithms +# should be used. + # list of (estimator, param_grid), where param_grid is used in GridSearchCV -# The parameter spaces in this example are limited to a narrow band to speed up. -# In a real use case, a broader search space for the algorithms should be used. classifiers = [ ( make_pipeline(StandardScaler(), LogisticRegression(random_state=0)), From fecca549e3f26359b74f4bdb9574ac1876a0bc50 Mon Sep 17 00:00:00 2001 From: yuanx749 Date: Mon, 15 Nov 2021 19:00:26 +0100 Subject: [PATCH 5/5] Merge main --- examples/preprocessing/plot_discretization_classification.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/preprocessing/plot_discretization_classification.py b/examples/preprocessing/plot_discretization_classification.py index 8cfd6d597b930..ff3d2973caff3 100644 --- a/examples/preprocessing/plot_discretization_classification.py +++ b/examples/preprocessing/plot_discretization_classification.py @@ -60,11 +60,10 @@ def get_name(estimator): return name +# list of (estimator, param_grid), where param_grid is used in GridSearchCV # The parameter spaces in this example are limited to a narrow band to reduce # its runtime. In a real use case, a broader search space for the algorithms # should be used. - -# list of (estimator, param_grid), where param_grid is used in GridSearchCV classifiers = [ ( make_pipeline(StandardScaler(), LogisticRegression(random_state=0)),