From 8310a2ae04473a8d36e23c12a51343fc7d698f5b Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Thu, 2 May 2024 19:58:56 +0000 Subject: [PATCH 01/13] add missing base.clusterer(), unit tests --- sklearn/base.py | 39 +++++++++++++++++++++++++++++++++ sklearn/tests/test_base.py | 45 +++++++++++++++++++++++++++++++++++++- 2 files changed, 83 insertions(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index d0f861bd2278f..07210a882ee69 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -1374,13 +1374,17 @@ def is_classifier(estimator): Examples -------- >>> from sklearn.base import is_classifier + >>> from sklearn.cluster import KMeans >>> from sklearn.svm import SVC, SVR >>> classifier = SVC() >>> regressor = SVR() + >>> kmeans = KMeans() >>> is_classifier(classifier) True >>> is_classifier(regressor) False + >>> is_classifier(kmeans) + False """ return getattr(estimator, "_estimator_type", None) == "classifier" @@ -1401,17 +1405,52 @@ def is_regressor(estimator): Examples -------- >>> from sklearn.base import is_regressor + >>> from sklearn.cluster import KMeans >>> from sklearn.svm import SVC, SVR >>> classifier = SVC() >>> regressor = SVR() + >>> kmeans = KMeans() >>> is_regressor(classifier) False >>> is_regressor(regressor) True + >>> is_regressor(kmeans) + False """ return getattr(estimator, "_estimator_type", None) == "regressor" +def is_clusterer(estimator): + """Return True if the given estimator is (probably) a clusterer. + + Parameters + ---------- + estimator : object + Estimator object to test. + + Returns + ------- + out : bool + True if estimator is a clusterer and False otherwise. 
+ + Examples + -------- + >>> from sklearn.base import is_clusterer + >>> from sklearn.cluster import KMeans + >>> from sklearn.svm import SVC, SVR + >>> classifier = SVC() + >>> regressor = SVR() + >>> kmeans = KMeans() + >>> is_clusterer(classifier) + False + >>> is_clusterer(regressor) + False + >>> is_clusterer(kmeans) + True + """ + return getattr(estimator, "_estimator_type", None) == "clusterer" + + def is_outlier_detector(estimator): """Return True if the given estimator is (probably) an outlier detector. diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 3bbc236e703df..b7fcf1d1da686 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -18,13 +18,16 @@ TransformerMixin, clone, is_classifier, + is_regressor, + is_clusterer, ) +from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.exceptions import InconsistentVersionWarning from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler -from sklearn.svm import SVC +from sklearn.svm import SVC, SVR from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor from sklearn.utils._mocking import MockDataFrame from sklearn.utils._set_output import _get_output_config @@ -260,12 +263,52 @@ def test_get_params(): def test_is_classifier(): + # classifier cases svc = SVC() assert is_classifier(svc) assert is_classifier(GridSearchCV(svc, {"C": [0.1, 1]})) assert is_classifier(Pipeline([("svc", svc)])) assert is_classifier(Pipeline([("svc_cv", GridSearchCV(svc, {"C": [0.1, 1]}))])) + # non-classifier cases + svr = SVR() + assert not is_classifier(svr) + assert not is_classifier(GridSearchCV(svr, {"C": [0.1, 1]})) + assert not is_classifier(Pipeline([("svr", svr)])) + assert not is_classifier(Pipeline([("svr_cv", GridSearchCV(svr, {"C": [0.1, 1]}))])) + + +def test_is_regressor(): + # regressor cases + svr = SVR() + assert is_regressor(svr) + assert 
is_regressor(GridSearchCV(svr, {"C": [0.1, 1]})) + assert is_regressor(Pipeline([("svr", svr)])) + assert is_regressor(Pipeline([("svr_cv", GridSearchCV(svr, {"C": [0.1, 1]}))])) + + # non-regressor cases + svc = SVC() + assert not is_regressor(svc) + assert not is_regressor(GridSearchCV(svc, {"C": [0.1, 1]})) + assert not is_regressor(Pipeline([("svc", svc)])) + assert not is_regressor(Pipeline([("svc_cv", GridSearchCV(svc, {"C": [0.1, 1]}))])) + + +def test_is_clusterer(): + # clusterer cases + kmeans = KMeans() + assert is_clusterer(kmeans) + assert is_clusterer(GridSearchCV(kmeans, {"n_clusters": [3, 8]})) + assert is_clusterer(Pipeline([("kmeans", kmeans)])) + assert is_clusterer(Pipeline([("kmeans_cv", GridSearchCV(kmeans, {"n_clusters": [3, 8]}))])) + + # non-clusterer cases + svc = SVC() + assert not is_clusterer(svc) + assert not is_clusterer(GridSearchCV(svc, {"C": [0.1, 1]})) + assert not is_clusterer(Pipeline([("svc", svc)])) + assert not is_clusterer(Pipeline([("svc_cv", GridSearchCV(svc, {"C": [0.1, 1]}))])) + def test_set_params(): # test nested estimator parameter setting From fa98b4de575f7ae324ebf1e7044bab25af986cea Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Thu, 2 May 2024 20:25:35 +0000 Subject: [PATCH 02/13] fix lint issues --- sklearn/tests/test_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index b7fcf1d1da686..af19e25e4605c 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -300,7 +300,9 @@ def test_is_clusterer(): assert is_clusterer(kmeans) assert is_clusterer(GridSearchCV(kmeans, {"n_clusters": [3, 8]})) assert is_clusterer(Pipeline([("kmeans", kmeans)])) - assert is_clusterer(Pipeline([("kmeans_cv", GridSearchCV(kmeans, {"n_clusters": [3, 8]}))])) + assert is_clusterer( + Pipeline([("kmeans_cv", GridSearchCV(kmeans, {"n_clusters": [3, 8]}))]) + ) # non-clusterer cases svc = 
SVC() From 1e081672e7bb606ca5537a3d63a018b3a88d0ea3 Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Thu, 2 May 2024 20:34:38 +0000 Subject: [PATCH 03/13] fix ruff issues --- sklearn/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index af19e25e4605c..943072ec56ac0 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -18,8 +18,8 @@ TransformerMixin, clone, is_classifier, - is_regressor, is_clusterer, + is_regressor, ) from sklearn.cluster import KMeans from sklearn.decomposition import PCA From 194803a5521483dab1be185f3cd4857162e25c48 Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Mon, 6 May 2024 21:04:09 +0200 Subject: [PATCH 04/13] Update changelog --- doc/whats_new/v1.5.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index b7d93de9ac2b0..498fe6c98db97 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -165,6 +165,13 @@ Changelog :pr:`123456` by :user:`Joe Bloggs <joeongithub>`. where 123455 is the *pull request* number, not the issue number. +:mod:`sklearn.base` +................... + +- |Enhancement| Added a function :func:`base.is_clusterer` which determines + whether a given estimator is of category clusterer. + :pr:`28936` by :user:`Christian Veenhuis <ChVeen>`. + :mod:`sklearn.calibration` .......................... 
From 5bb23d06c6855856747f27d15318af5d5f2ccbab Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Wed, 8 May 2024 12:39:36 +0000 Subject: [PATCH 05/13] moved changelog entry from 1.5 to 1.6 --- doc/whats_new/v1.5.rst | 7 ------- doc/whats_new/v1.6.rst | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/whats_new/v1.5.rst b/doc/whats_new/v1.5.rst index 6592d04eec810..e50309a330e39 100644 --- a/doc/whats_new/v1.5.rst +++ b/doc/whats_new/v1.5.rst @@ -165,13 +165,6 @@ Changelog :pr:`123456` by :user:`Joe Bloggs <joeongithub>`. where 123455 is the *pull request* number, not the issue number. -:mod:`sklearn.base` -................... - -- |Enhancement| Added a function :func:`base.is_clusterer` which determines - whether a given estimator is of category clusterer. - :pr:`28936` by :user:`Christian Veenhuis <ChVeen>`. - :mod:`sklearn.calibration` .......................... diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst index 6eda6717b3d1b..0f003a4cba534 100644 --- a/doc/whats_new/v1.6.rst +++ b/doc/whats_new/v1.6.rst @@ -54,6 +54,13 @@ Changelog :pr:`123456` by :user:`Joe Bloggs <joeongithub>`. where 123455 is the *pull request* number, not the issue number. +:mod:`sklearn.base` +................... + +- |Enhancement| Added a function :func:`base.is_clusterer` which determines + whether a given estimator is of category clusterer. + :pr:`28936` by :user:`Christian Veenhuis <ChVeen>`. 
+ Thanks to everyone who has contributed to the maintenance and improvement of the project since version 1.5, including: From 0c5cccd091a8da22a2037dab899a392b4cc9f525 Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Mon, 20 May 2024 17:27:42 +0000 Subject: [PATCH 06/13] added to classes.rst --- doc/modules/classes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 1da5b337ad7a4..6279b0d715968 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -68,6 +68,7 @@ Functions base.clone base.is_classifier base.is_regressor + base.is_clusterer .. _calibration_ref: From 13eca720c4e4ea84bb027782b313bd9e9015011b Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Mon, 20 May 2024 17:43:23 +0000 Subject: [PATCH 07/13] use parameterized unit tests --- sklearn/tests/test_base.py | 98 ++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 47 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index af7871d3b9570..c05b8d5338867 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -262,54 +262,58 @@ def test_get_params(): test.set_params(a__a=2) -def test_is_classifier(): - # classifier cases - svc = SVC() - assert is_classifier(svc) - assert is_classifier(GridSearchCV(svc, {"C": [0.1, 1]})) - assert is_classifier(Pipeline([("svc", svc)])) - assert is_classifier(Pipeline([("svc_cv", GridSearchCV(svc, {"C": [0.1, 1]}))])) - - # non-classifier cases - svr = SVR() - assert not is_classifier(svr) - assert not is_classifier(GridSearchCV(svr, {"C": [0.1, 1]})) - assert not is_classifier(Pipeline([("svr", svr)])) - assert not is_classifier(Pipeline([("svr_cv", GridSearchCV(svr, {"C": [0.1, 1]}))])) - - -def test_is_regressor(): - # regressor cases - svr = SVR() - assert is_regressor(svr) - assert is_regressor(GridSearchCV(svr, {"C": [0.1, 1]})) - 
assert is_regressor(Pipeline([("svr", svr)])) - assert is_regressor(Pipeline([("svr_cv", GridSearchCV(svr, {"C": [0.1, 1]}))])) - - # non-regressor cases - svc = SVC() - assert not is_regressor(svc) - assert not is_regressor(GridSearchCV(svc, {"C": [0.1, 1]})) - assert not is_regressor(Pipeline([("svc", svc)])) - assert not is_regressor(Pipeline([("svc_cv", GridSearchCV(svc, {"C": [0.1, 1]}))])) - - -def test_is_clusterer(): - # clusterer cases - kmeans = KMeans() - assert is_clusterer(kmeans) - assert is_clusterer(GridSearchCV(kmeans, {"n_clusters": [3, 8]})) - assert is_clusterer(Pipeline([("kmeans", kmeans)])) - assert is_clusterer( - Pipeline([("kmeans_cv", GridSearchCV(kmeans, {"n_clusters": [3, 8]}))]) - ) +@pytest.mark.parametrize( + "estimator, expected_result", + [ + (SVC(), True), + (GridSearchCV(SVC(), {"C": [0.1, 1]}), True), + (Pipeline([("svc", SVC())]), True), + (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), True), + + (SVR(), False), + (GridSearchCV(SVR(), {"C": [0.1, 1]}), False), + (Pipeline([("svr", SVR())]), False), + (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), False), + ] +) +def test_is_classifier(estimator, expected_result): + assert is_classifier(estimator) == expected_result + + +@pytest.mark.parametrize( + "estimator, expected_result", + [ + (SVR(), True), + (GridSearchCV(SVR(), {"C": [0.1, 1]}), True), + (Pipeline([("svr", SVR())]), True), + (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), True), + + (SVC(), False), + (GridSearchCV(SVC(), {"C": [0.1, 1]}), False), + (Pipeline([("svc", SVC())]), False), + (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False), + ] +) +def test_is_regressor(estimator, expected_result): + assert is_regressor(estimator) == expected_result + - # non-clusterer cases - svc = SVC() - assert not is_clusterer(svc) - assert not is_clusterer(GridSearchCV(svc, {"C": [0.1, 1]})) - assert not is_clusterer(Pipeline([("svc", svc)])) - assert not 
is_clusterer(Pipeline([("svc_cv", GridSearchCV(svc, {"C": [0.1, 1]}))])) +@pytest.mark.parametrize( + "estimator, expected_result", + [ + (KMeans(), True), + (GridSearchCV(KMeans(), {"n_clusters": [3, 8]}), True), + (Pipeline([("kmeans", KMeans())]), True), + (Pipeline([("kmeans_cv", GridSearchCV(KMeans(), {"n_clusters": [3, 8]}))]), True), + + (SVC(), False), + (GridSearchCV(SVC(), {"C": [0.1, 1]}), False), + (Pipeline([("svc", SVC())]), False), + (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False), + ] +) +def test_is_clusterer(estimator, expected_result): + assert is_clusterer(estimator) == expected_result def test_set_params(): From ffdd8611c7482307a3ccea2c101cc57d3011023c Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Mon, 20 May 2024 18:26:50 +0000 Subject: [PATCH 08/13] fix ruff stuff --- sklearn/tests/test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index c05b8d5338867..91366a3c4382b 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -303,8 +303,8 @@ def test_is_regressor(estimator, expected_result): [ (KMeans(), True), (GridSearchCV(KMeans(), {"n_clusters": [3, 8]}), True), - (Pipeline([("kmeans", KMeans())]), True), - (Pipeline([("kmeans_cv", GridSearchCV(KMeans(), {"n_clusters": [3, 8]}))]), True), + (Pipeline([("km", KMeans())]), True), + (Pipeline([("km_cv", GridSearchCV(KMeans(), {"n_clusters": [3, 8]}))]), True), (SVC(), False), (GridSearchCV(SVC(), {"C": [0.1, 1]}), False), From 6b868126cb0cf0fa12a06bd3d8bdf8ae85ce04ac Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Mon, 20 May 2024 18:31:34 +0000 Subject: [PATCH 09/13] fix black stuff --- sklearn/tests/test_base.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sklearn/tests/test_base.py b/sklearn/tests/test_base.py index 
91366a3c4382b..917da863ece3b 100644 --- a/sklearn/tests/test_base.py +++ b/sklearn/tests/test_base.py @@ -269,12 +269,11 @@ def test_get_params(): (GridSearchCV(SVC(), {"C": [0.1, 1]}), True), (Pipeline([("svc", SVC())]), True), (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), True), - (SVR(), False), (GridSearchCV(SVR(), {"C": [0.1, 1]}), False), (Pipeline([("svr", SVR())]), False), (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), False), - ] + ], ) def test_is_classifier(estimator, expected_result): assert is_classifier(estimator) == expected_result @@ -287,12 +286,11 @@ def test_is_classifier(estimator, expected_result): (GridSearchCV(SVR(), {"C": [0.1, 1]}), True), (Pipeline([("svr", SVR())]), True), (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), True), - (SVC(), False), (GridSearchCV(SVC(), {"C": [0.1, 1]}), False), (Pipeline([("svc", SVC())]), False), (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False), - ] + ], ) def test_is_regressor(estimator, expected_result): assert is_regressor(estimator) == expected_result @@ -305,12 +303,11 @@ def test_is_regressor(estimator, expected_result): (GridSearchCV(KMeans(), {"n_clusters": [3, 8]}), True), (Pipeline([("km", KMeans())]), True), (Pipeline([("km_cv", GridSearchCV(KMeans(), {"n_clusters": [3, 8]}))]), True), - (SVC(), False), (GridSearchCV(SVC(), {"C": [0.1, 1]}), False), (Pipeline([("svc", SVC())]), False), (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False), - ] + ], ) def test_is_clusterer(estimator, expected_result): assert is_clusterer(estimator) == expected_result From f82c6169bf860175cf3255a03d9ad2e212e8de4d Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Tue, 21 May 2024 09:30:13 +0000 Subject: [PATCH 10/13] remove classes.rst entry --- doc/modules/classes.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 
6279b0d715968..1da5b337ad7a4 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -68,7 +68,6 @@ Functions base.clone base.is_classifier base.is_regressor - base.is_clusterer .. _calibration_ref: From e55067dcf6605024d084cb207faf05593c4bb356 Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Tue, 21 May 2024 09:33:24 +0000 Subject: [PATCH 11/13] added base.is_clusterer to api_reference.py --- doc/api_reference.py | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/api_reference.py b/doc/api_reference.py index 1aa6455fb7e44..0f3467bc717b5 100644 --- a/doc/api_reference.py +++ b/doc/api_reference.py @@ -122,6 +122,7 @@ def _get_submodule(module_name, submodule_name): "clone", "is_classifier", "is_regressor", + "is_clusterer", ], } ], From 5f67d736acdca3569595647ff44fd40571bad2c1 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Tue, 21 May 2024 14:01:07 +0200 Subject: [PATCH 12/13] Update doc/api_reference.py --- doc/api_reference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/api_reference.py b/doc/api_reference.py index 0f3467bc717b5..583909cdcac65 100644 --- a/doc/api_reference.py +++ b/doc/api_reference.py @@ -121,8 +121,8 @@ def _get_submodule(module_name, submodule_name): "TransformerMixin", "clone", "is_classifier", - "is_regressor", "is_clusterer", + "is_regressor", ], } ], From 1a5df505c7d3563533984732b5db9204aba6edd1 Mon Sep 17 00:00:00 2001 From: Christian Veenhuis <124370897+ChVeen@users.noreply.github.com> Date: Wed, 22 May 2024 09:26:32 +0000 Subject: [PATCH 13/13] added version tag --- sklearn/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/base.py b/sklearn/base.py index ea29935540530..d4245ade4e499 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -1423,6 +1423,8 @@ def is_regressor(estimator): def is_clusterer(estimator): """Return True if the given estimator is (probably) a clusterer. + .. 
versionadded:: 1.6 + Parameters ---------- estimator : object