From f90766d39c02f9895a4973c4a8be351f7fc0aad2 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Tue, 26 Feb 2019 10:11:38 +0100 Subject: [PATCH 01/46] Basic OutlierResamplers --- sklearn/__init__.py | 5 +- sklearn/base.py | 50 +++++++++++++++++++ sklearn/resample/__init__.py | 6 +++ sklearn/resample/outlier_resample.py | 29 +++++++++++ .../resample/tests/test_outlier_resample.py | 25 ++++++++++ 5 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 sklearn/resample/__init__.py create mode 100644 sklearn/resample/outlier_resample.py create mode 100644 sklearn/resample/tests/test_outlier_resample.py diff --git a/sklearn/__init__.py b/sklearn/__init__.py index aafc8a34b2a13..2de97d0e30843 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -73,8 +73,9 @@ 'kernel_ridge', 'linear_model', 'manifold', 'metrics', 'mixture', 'model_selection', 'multiclass', 'multioutput', 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', - 'preprocessing', 'random_projection', 'semi_supervised', - 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', + 'preprocessing', 'random_projection', 'resample', + 'semi_supervised','svm', 'tree', 'discriminant_analysis', + 'impute', 'compose', # Non-modules: 'clone', 'get_config', 'set_config', 'config_context', 'show_versions'] diff --git a/sklearn/base.py b/sklearn/base.py index d31286e6a1ab4..0ba642a814e47 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -544,6 +544,56 @@ def fit_predict(self, X, y=None): return self.fit(X).predict(X) +class OutlierResamplerMixin: + """Mixin class for all outlier detection resamplers in scikit-learn. Child + classes remove outliers from the passed samples. + """ + _estimator_type = "outlier_resampler" + + def fit_resample(self, X, y, props=None): + """Performs fit on X and returns new X and y consisting of only the + inliers. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data X. + + y : ndarray, shape (n_samples,) + Input data y. + + props : dict of ndarrays, each ndarray has shape (n_samples,), optional + dict of params that are passed to fit. + + Returns + ------- + X : ndarray, shape (n_samples, n_features) + The input X with outlier samples removed. + + y : ndarray, shape (n_samples,) + The input y with outlier samples removed. + + props : dict of ndarrays, each ndarray has shape (n_samples,) + `props`, but outlier samples are removed for each each parameter. + """ + + if props is not None: + raise NotImplementedError('props is not supported for now') + if props is None: + props = {} + + # filter out unrequired args + required_props = filter(lambda x : x in inspect.signature(super().fit), + props.keys()) + filtered_props = {k : props[k] for k in required_props} + + y = self.fit_predict(X) + + props = {prop[y == 1] for prop in props} + + return X[y == 1], y[y == 1], props + + class MetaEstimatorMixin: _required_parameters = ["estimator"] """Mixin class for all meta estimators in scikit-learn.""" diff --git a/sklearn/resample/__init__.py b/sklearn/resample/__init__.py new file mode 100644 index 0000000000000..df1e9fe81ad7b --- /dev/null +++ b/sklearn/resample/__init__.py @@ -0,0 +1,6 @@ +""" +The :mod:`sklearn.resample` module includes resampling algorithms. 
+""" + +from .outlier_resample import EllipticEnvelopeResampler, OneClassSVMResampler, LocalOutlierFactorResampler, IsolationForestResampler +__all__ = ["EllipticEnvelopeResampler"] diff --git a/sklearn/resample/outlier_resample.py b/sklearn/resample/outlier_resample.py new file mode 100644 index 0000000000000..47d542cfa5028 --- /dev/null +++ b/sklearn/resample/outlier_resample.py @@ -0,0 +1,29 @@ +from ..base import OutlierResamplerMixin +from ..covariance import EllipticEnvelope +from ..svm import OneClassSVM +from ..ensemble import IsolationForest +from ..neighbors import LocalOutlierFactor + + +class EllipticEnvelopeResampler(EllipticEnvelope, OutlierResamplerMixin): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +class OneClassSVMResampler(OneClassSVM, OutlierResamplerMixin): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +class IsolationForestResampler(IsolationForest, OutlierResamplerMixin): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + +class LocalOutlierFactorResampler(LocalOutlierFactor, OutlierResamplerMixin): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) diff --git a/sklearn/resample/tests/test_outlier_resample.py b/sklearn/resample/tests/test_outlier_resample.py new file mode 100644 index 0000000000000..4a16ac101d61d --- /dev/null +++ b/sklearn/resample/tests/test_outlier_resample.py @@ -0,0 +1,25 @@ +import pytest +import numpy as np +from sklearn.resample import EllipticEnvelopeResampler, OneClassSVMResampler, LocalOutlierFactorResampler, IsolationForestResampler +from sklearn.covariance import EllipticEnvelope +from sklearn.svm import OneClassSVM +from sklearn.ensemble import IsolationForest +from sklearn.neighbors import LocalOutlierFactor +from sklearn.datasets import make_blobs + +X, y = make_blobs(random_state=0) + +@pytest.mark.parametrize("detector, superclass", + [(EllipticEnvelopeResampler(), EllipticEnvelope()), + (OneClassSVMResampler(), OneClassSVM()), + (LocalOutlierFactorResampler(), LocalOutlierFactor()), + (IsolationForestResampler(), IsolationForest())]) +def test_basic(detector, superclass): + outliers = superclass.fit_predict(X, y) == -1 + n_outliers = np.sum(outliers) + assert n_outliers > 0 # we have some outliers in the dataset + + X_new, y_new, props_new = detector.fit_resample(X, y) + + assert X_new.shape[0] == X.shape[0] - n_outliers + assert y_new.shape[0] == y.shape[0] - n_outliers From 26c4153e8c916ab646ae686c707cca657eeb1516 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Tue, 26 Feb 2019 17:49:22 +0100 Subject: [PATCH 02/46] Add fit_resample support to pipeline. Many tests from imblearn were adapted. 
--- sklearn/base.py | 14 +- sklearn/covariance/elliptic_envelope.py | 4 +- sklearn/ensemble/iforest.py | 4 +- sklearn/neighbors/lof.py | 4 +- sklearn/pipeline.py | 109 ++++- sklearn/resample/__init__.py | 6 - sklearn/resample/outlier_resample.py | 29 -- .../resample/tests/test_outlier_resample.py | 25 -- sklearn/svm/classes.py | 5 +- sklearn/tests/test_pipeline.py | 405 +++++++++++++++++- 10 files changed, 510 insertions(+), 95 deletions(-) delete mode 100644 sklearn/resample/__init__.py delete mode 100644 sklearn/resample/outlier_resample.py delete mode 100644 sklearn/resample/tests/test_outlier_resample.py diff --git a/sklearn/base.py b/sklearn/base.py index 0ba642a814e47..f1264a49dd929 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -578,20 +578,22 @@ def fit_resample(self, X, y, props=None): """ if props is not None: - raise NotImplementedError('props is not supported for now') + raise NotImplementedError('props is not supported for now') if props is None: props = {} # filter out unrequired args - required_props = filter(lambda x : x in inspect.signature(super().fit), + required_props = filter(lambda x: x in inspect.signature(super().fit), props.keys()) - filtered_props = {k : props[k] for k in required_props} + filtered_props = {k: props[k] for k in required_props} - y = self.fit_predict(X) + inliers = self.fit_predict(X) == 1 - props = {prop[y == 1] for prop in props} + props = {prop[inliers == 1] for prop in props} - return X[y == 1], y[y == 1], props + if props: + return X[inliers], y[inliers], props + return X[inliers], y[inliers] class MetaEstimatorMixin: diff --git a/sklearn/covariance/elliptic_envelope.py b/sklearn/covariance/elliptic_envelope.py index 86ef3c7dcf921..8f94e61c2f4da 100644 --- a/sklearn/covariance/elliptic_envelope.py +++ b/sklearn/covariance/elliptic_envelope.py @@ -7,10 +7,10 @@ from . import MinCovDet from ..utils.validation import check_is_fitted, check_array from ..metrics import accuracy_score -from ..base import OutlierMixin +from ..base import OutlierMixin, OutlierResamplerMixin -class EllipticEnvelope(MinCovDet, OutlierMixin): +class EllipticEnvelope(MinCovDet, OutlierMixin, OutlierResamplerMixin): """An object for detecting outliers in a Gaussian distributed dataset. Read more in the :ref:`User Guide `. 
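The same one-line mixin change is repeated for IsolationForest,
LocalOutlierFactor and OneClassSVM below. For all four detectors, fit_resample
conceptually reduces to the following sketch (it mirrors the mixin in
sklearn/base.py above, and runs even on stock scikit-learn since fit_predict
is already provided by OutlierMixin):

    import numpy as np
    from sklearn.covariance import EllipticEnvelope
    from sklearn.datasets import make_blobs

    X, y = make_blobs(random_state=0)
    detector = EllipticEnvelope(random_state=0)

    inliers = detector.fit_predict(X) == 1   # +1 for inliers, -1 for outliers
    X_res, y_res = X[inliers], y[inliers]    # what fit_resample(X, y) returns

    assert X_res.shape[0] == y_res.shape[0] == np.sum(inliers)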
diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py index 84649fa97b7fa..28f2033a0e11e 100644 --- a/sklearn/ensemble/iforest.py +++ b/sklearn/ensemble/iforest.py @@ -12,7 +12,7 @@ from ..utils import check_random_state, check_array from ..utils.fixes import _joblib_parallel_args from ..utils.validation import check_is_fitted -from ..base import OutlierMixin +from ..base import OutlierMixin, OutlierResamplerMixin from .bagging import BaseBagging @@ -21,7 +21,7 @@ INTEGER_TYPES = (numbers.Integral, np.integer) -class IsolationForest(BaseBagging, OutlierMixin): +class IsolationForest(BaseBagging, OutlierMixin, OutlierResamplerMixin): """Isolation Forest Algorithm Return the anomaly score of each sample using the IsolationForest algorithm diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index 5ad2f7e9b7b1d..b0edd9a718176 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -8,7 +8,7 @@ from .base import NeighborsBase from .base import KNeighborsMixin from .base import UnsupervisedMixin -from ..base import OutlierMixin +from ..base import OutlierMixin, OutlierResamplerMixin from ..utils.validation import check_is_fitted from ..utils import check_array @@ -17,7 +17,7 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, - OutlierMixin): + OutlierMixin, OutlierResamplerMixin): """Unsupervised Outlier Detection using Local Outlier Factor (LOF) The anomaly score of each sample is called Local Outlier Factor. diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index eeba9857205af..e4a087298089b 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -161,12 +161,21 @@ def _validate_steps(self): for t in transformers: if t is None or t == 'passthrough': continue - if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not - hasattr(t, "transform")): - raise TypeError("All intermediate steps should be " - "transformers and implement fit and transform " - "or be the string 'passthrough' " - "'%s' (type %s) doesn't" % (t, type(t))) + if (not (hasattr(t, "fit") or + hasattr(t, "fit_transform") or + hasattr(t, "fit_resample")) or + not (hasattr(t, "transform") or + hasattr(t, "fit_resample"))): + raise TypeError( + "All intermediate steps of Pipeline should be " + "estimators that implement fit and transform or resample " + "(but not both) '%s' (type %s) doesn't)" % (t, type(t))) + if (hasattr(t, "fit_resample") and (hasattr(t, "fit_transform") or + hasattr(t, "transform"))): + raise TypeError( + "All intermediate steps of the chain should be estimators" + " that implement fit and transform or fit_resample." 
+ " '%s' implements both)" % (t)) # We allow last estimator to be None as an identity transformation if (estimator is not None and estimator != 'passthrough' @@ -212,6 +221,7 @@ def _fit(self, X, y=None, **fit_params): memory = check_memory(self.memory) fit_transform_one_cached = memory.cache(_fit_transform_one) + fit_resample_one_cached = memory.cache(_fit_resample_one) fit_params_steps = {name: {} for name, step in self.steps if step is not None} @@ -219,6 +229,7 @@ def _fit(self, X, y=None, **fit_params): step, param = pname.split('__', 1) fit_params_steps[step][param] = pval Xt = X + yt = y for step_idx, name, transformer in self._iter(with_final=False): if hasattr(memory, 'location'): # joblib >= 0.12 @@ -239,16 +250,21 @@ def _fit(self, X, y=None, **fit_params): else: cloned_transformer = clone(transformer) # Fit or load from cache the current transfomer - Xt, fitted_transformer = fit_transform_one_cached( - cloned_transformer, Xt, y, None, - **fit_params_steps[name]) + if (hasattr(cloned_transformer, "transform") or + hasattr(cloned_transformer, "fit_transform")): + Xt, fitted_transformer = fit_transform_one_cached( + cloned_transformer, Xt, yt, None, + **fit_params_steps[name]) + elif hasattr(cloned_transformer, "fit_resample"): + Xt, yt, fitted_transformer = fit_resample_one_cached( + cloned_transformer, Xt, yt, **fit_params_steps[name]) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) if self._final_estimator == 'passthrough': - return Xt, {} - return Xt, fit_params_steps[self.steps[-1][0]] + return Xt, yt, {} + return Xt, yt, fit_params_steps[self.steps[-1][0]] def fit(self, X, y=None, **fit_params): """Fit the model @@ -276,9 +292,9 @@ def fit(self, X, y=None, **fit_params): self : Pipeline This estimator """ - Xt, fit_params = self._fit(X, y, **fit_params) + Xt, yt, fit_params = self._fit(X, y, **fit_params) if self._final_estimator != 'passthrough': - self._final_estimator.fit(Xt, y, **fit_params) + self._final_estimator.fit(Xt, yt, **fit_params) return self def fit_transform(self, X, y=None, **fit_params): @@ -309,13 +325,51 @@ def fit_transform(self, X, y=None, **fit_params): Transformed samples """ last_step = self._final_estimator - Xt, fit_params = self._fit(X, y, **fit_params) + Xt, yt, fit_params = self._fit(X, y, **fit_params) if hasattr(last_step, 'fit_transform'): - return last_step.fit_transform(Xt, y, **fit_params) + return last_step.fit_transform(Xt, yt, **fit_params) elif last_step == 'passthrough': return Xt else: - return last_step.fit(Xt, y, **fit_params).transform(Xt) + return last_step.fit(Xt, yt, **fit_params).transform(Xt) + + @if_delegate_has_method(delegate='_final_estimator') + def fit_resample(self, X, y=None, **fit_params): + """Fit the model and sample with the final estimator + + Fits all the transformers/samplers one after the other and + transform/sample the data, then uses fit_resample on transformed + data with the final estimator. + + Parameters + ---------- + X : iterable + Training data. Must fulfill input requirements of first step of the + pipeline. + + y : iterable, default=None + Training targets. Must fulfill label requirements for all steps of + the pipeline. + + **fit_params : dict of string -> object + Parameters passed to the ``fit`` method of each step, where + each parameter name is prefixed such that parameter ``p`` for step + ``s`` has key ``s__p``. 
+ + Returns + ------- + Xt : array-like, shape = [n_samples, n_transformed_features] + Transformed samples + + yt : array-like, shape = [n_samples, n_transformed_features] + Transformed target + + """ + last_step = self._final_estimator + Xt, yt, fit_params = self._fit(X, y, **fit_params) + if last_step == 'passthrough': + return Xt + return last_step.fit_resample(Xt, yt, **fit_params) @if_delegate_has_method(delegate='_final_estimator') def predict(self, X, **predict_params): @@ -341,6 +395,8 @@ def predict(self, X, **predict_params): """ Xt = X for _, name, transform in self._iter(with_final=False): + if hasattr(transform, "fit_resample"): + continue Xt = transform.transform(Xt) return self.steps[-1][-1].predict(Xt, **predict_params) @@ -371,8 +427,8 @@ def fit_predict(self, X, y=None, **fit_params): ------- y_pred : array-like """ - Xt, fit_params = self._fit(X, y, **fit_params) - return self.steps[-1][-1].fit_predict(Xt, y, **fit_params) + Xt, yt, fit_params = self._fit(X, y, **fit_params) + return self.steps[-1][-1].fit_predict(Xt, yt, **fit_params) @if_delegate_has_method(delegate='_final_estimator') def predict_proba(self, X): @@ -390,6 +446,8 @@ def predict_proba(self, X): """ Xt = X for _, name, transform in self._iter(with_final=False): + if hasattr(transform, "fit_resample"): + continue Xt = transform.transform(Xt) return self.steps[-1][-1].predict_proba(Xt) @@ -409,6 +467,8 @@ def decision_function(self, X): """ Xt = X for _, name, transform in self._iter(with_final=False): + if hasattr(transform, "fit_resample"): + continue Xt = transform.transform(Xt) return self.steps[-1][-1].decision_function(Xt) @@ -428,6 +488,8 @@ def predict_log_proba(self, X): """ Xt = X for _, name, transform in self._iter(with_final=False): + if hasattr(transform, "fit_resample"): + continue Xt = transform.transform(Xt) return self.steps[-1][-1].predict_log_proba(Xt) @@ -457,6 +519,8 @@ def transform(self): def _transform(self, X): Xt = X for _, _, transform in self._iter(): + if hasattr(transform, "fit_resample"): + continue Xt = transform.transform(Xt) return Xt @@ -481,6 +545,8 @@ def inverse_transform(self): # raise AttributeError if necessary for hasattr behaviour # XXX: Handling the None case means we can't use if_delegate_has_method for _, _, transform in self._iter(): + if hasattr(transform, "fit_resample"): + continue transform.inverse_transform return self._inverse_transform @@ -515,6 +581,8 @@ def score(self, X, y=None, sample_weight=None): """ Xt = X for _, name, transform in self._iter(with_final=False): + if hasattr(transform, "fit_resample"): + continue Xt = transform.transform(Xt) score_params = {} if sample_weight is not None: @@ -630,6 +698,11 @@ def _fit_transform_one(transformer, X, y, weight, **fit_params): return res, transformer return res * weight, transformer +def _fit_resample_one(sampler, X, y, **fit_params): + X_res, y_res = sampler.fit_resample(X, y, **fit_params) + + return X_res, y_res, sampler + class FeatureUnion(_BaseComposition, TransformerMixin): """Concatenates results of multiple transformer objects. diff --git a/sklearn/resample/__init__.py b/sklearn/resample/__init__.py deleted file mode 100644 index df1e9fe81ad7b..0000000000000 --- a/sklearn/resample/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -""" -The :mod:`sklearn.resample` module includes resampling algorithms. 
-""" - -from .outlier_resample import EllipticEnvelopeResampler, OneClassSVMResampler, LocalOutlierFactorResampler, IsolationForestResampler -__all__ = ["EllipticEnvelopeResampler"] diff --git a/sklearn/resample/outlier_resample.py b/sklearn/resample/outlier_resample.py deleted file mode 100644 index 47d542cfa5028..0000000000000 --- a/sklearn/resample/outlier_resample.py +++ /dev/null @@ -1,29 +0,0 @@ -from ..base import OutlierResamplerMixin -from ..covariance import EllipticEnvelope -from ..svm import OneClassSVM -from ..ensemble import IsolationForest -from ..neighbors import LocalOutlierFactor - - -class EllipticEnvelopeResampler(EllipticEnvelope, OutlierResamplerMixin): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -class OneClassSVMResampler(OneClassSVM, OutlierResamplerMixin): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -class IsolationForestResampler(IsolationForest, OutlierResamplerMixin): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -class LocalOutlierFactorResampler(LocalOutlierFactor, OutlierResamplerMixin): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) diff --git a/sklearn/resample/tests/test_outlier_resample.py b/sklearn/resample/tests/test_outlier_resample.py deleted file mode 100644 index 4a16ac101d61d..0000000000000 --- a/sklearn/resample/tests/test_outlier_resample.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest -import numpy as np -from sklearn.resample import EllipticEnvelopeResampler, OneClassSVMResampler, LocalOutlierFactorResampler, IsolationForestResampler -from sklearn.covariance import EllipticEnvelope -from sklearn.svm import OneClassSVM -from sklearn.ensemble import IsolationForest -from sklearn.neighbors import LocalOutlierFactor -from sklearn.datasets import make_blobs - -X, y = make_blobs(random_state=0) - -@pytest.mark.parametrize("detector, superclass", - [(EllipticEnvelopeResampler(), EllipticEnvelope()), - (OneClassSVMResampler(), OneClassSVM()), - (LocalOutlierFactorResampler(), LocalOutlierFactor()), - (IsolationForestResampler(), IsolationForest())]) -def test_basic(detector, superclass): - outliers = superclass.fit_predict(X, y) == -1 - n_outliers = np.sum(outliers) - assert n_outliers > 0 # we have some outliers in the dataset - - X_new, y_new, props_new = detector.fit_resample(X, y) - - assert X_new.shape[0] == X.shape[0] - n_outliers - assert y_new.shape[0] == y.shape[0] - n_outliers diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 6ef9215e909f2..1024da5a4d7dd 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -2,7 +2,8 @@ import numpy as np from .base import _fit_liblinear, BaseSVC, BaseLibSVM -from ..base import BaseEstimator, RegressorMixin, OutlierMixin +from ..base import (BaseEstimator, RegressorMixin, OutlierMixin, + OutlierResamplerMixin) from ..linear_model.base import LinearClassifierMixin, SparseCoefMixin, \ LinearModel from ..utils import check_X_y @@ -1039,7 +1040,7 @@ def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, verbose=verbose, max_iter=max_iter, random_state=None) -class OneClassSVM(BaseLibSVM, OutlierMixin): +class OneClassSVM(BaseLibSVM, OutlierMixin, OutlierResamplerMixin): """Unsupervised Outlier Detection. Estimate the support of a high-dimensional distribution. 
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 259876acd1a42..4222e3f577229 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1,12 +1,16 @@ """ Test the pipeline module. """ + +# lots of resample tests were taken from imblearn + from distutils.version import LooseVersion from tempfile import mkdtemp import shutil import time import pytest +from pytest import raises import numpy as np from scipy import sparse @@ -15,12 +19,14 @@ from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_equal +from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_dict_equal from sklearn.utils.testing import assert_no_warnings from sklearn.base import clone, BaseEstimator -from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union +from sklearn.pipeline import Pipeline, make_pipeline +from sklearn.pipeline import FeatureUnion, make_union from sklearn.svm import SVC from sklearn.linear_model import LogisticRegression, Lasso from sklearn.linear_model import LinearRegression @@ -28,12 +34,16 @@ from sklearn.feature_selection import SelectKBest, f_classif from sklearn.dummy import DummyRegressor from sklearn.decomposition import PCA, TruncatedSVD -from sklearn.datasets import load_iris +from sklearn.datasets import load_iris, make_classification from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction.text import CountVectorizer from sklearn.utils._joblib import Memory from sklearn.utils._joblib import __version__ as joblib_version +from sklearn.covariance import EllipticEnvelope +from sklearn.ensemble import IsolationForest + +R_TOL = 1e-4 JUNK_FOOD_DOCS = ( "the pizza pizza beer copyright", @@ -154,6 +164,34 @@ def predict(self, X, got_attribute=False): return self +class DummyResampler(NoTrans): + """Resampler which returns the same samples""" + + def fit_resample(self, X, y): + self.means_ = np.mean(X, axis=0) + # store timestamp to figure out whether the result of 'fit' has been + # cached or not + self.timestamp_ = time.time() + return X, y + + +class FitTransformResample(NoTrans): + """Estimator implementing both transform and sample + """ + + def fit(self, X, y, should_succeed=False): + pass + + def fit_resample(self, X, y=None): + return X, y + + def fit_transform(self, X, y=None): + return self.fit(X, y).transform(X) + + def transform(self, X, y=None): + return X + + def test_pipeline_init(): # Test the various init parameters of the pipeline. 
assert_raises(TypeError, Pipeline) @@ -186,7 +224,7 @@ def test_pipeline_init(): # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform assert_raises_regex(TypeError, - 'All intermediate steps should be transformers' + 'All intermediate steps of Pipeline ' '.*\\bNoTrans\\b.*', Pipeline, [('t', NoTrans()), ('svc', clf)]) @@ -1035,6 +1073,367 @@ def test_pipeline_memory(): shutil.rmtree(cachedir) +def test_pipeline_memory_resampler(): + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + cachedir = mkdtemp() + try: + memory = Memory(cachedir, verbose=10) + # Test with Transformer + SVC + clf = SVC(gamma='scale', probability=True, random_state=0) + transf = DummyResampler() + pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) + cached_pipe = Pipeline( + [('transf', transf), ('svc', clf)], memory=memory) + + # Memoize the transformer at the first fit + cached_pipe.fit(X, y) + pipe.fit(X, y) + # Get the time stamp of the tranformer in the cached pipeline + expected_ts = cached_pipe.named_steps['transf'].timestamp_ + # Check that cached_pipe and pipe yield identical results + assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) + assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) + assert_array_equal( + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) + assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) + assert_array_equal(pipe.named_steps['transf'].means_, + cached_pipe.named_steps['transf'].means_) + assert not hasattr(transf, 'means_') + # Check that we are reading the cache while fitting + # a second time + cached_pipe.fit(X, y) + # Check that cached_pipe and pipe yield identical results + assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) + assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) + assert_array_equal( + pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) + assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) + assert_array_equal(pipe.named_steps['transf'].means_, + cached_pipe.named_steps['transf'].means_) + assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts + # Create a new pipeline with cloned estimators + # Check that even changing the name step does not affect the cache hit + clf_2 = SVC(gamma='scale', probability=True, random_state=0) + transf_2 = DummyResampler() + cached_pipe_2 = Pipeline( + [('transf_2', transf_2), ('svc', clf_2)], memory=memory) + cached_pipe_2.fit(X, y) + + # Check that cached_pipe and pipe yield identical results + assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) + assert_array_equal( + pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) + assert_array_equal( + pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) + assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) + assert_array_equal(pipe.named_steps['transf'].means_, + cached_pipe_2.named_steps['transf_2'].means_) + assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts + finally: + shutil.rmtree(cachedir) + + +def test_pipeline_methods_pca_outlier_svm(): + # Test the various methods of the pipeline (pca + svm). 
+ X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + + # Test with PCA + SVC + clf = SVC(gamma='scale', probability=True, random_state=0) + pca = PCA() + outlier = EllipticEnvelope(random_state=0) + pipe = Pipeline([('pca', pca), ('outlier', outlier), ('svc', clf)]) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.predict_log_proba(X) + pipe.score(X, y) + + +def test_pipeline_methods_outlier_pca_svm(): + # Test the various methods of the pipeline (pca + svm). + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + + # Test with PCA + SVC + clf = SVC(gamma='scale', probability=True, random_state=0) + pca = PCA() + outlier = EllipticEnvelope(random_state=0) + pipe = Pipeline([('outlier', outlier), ('pca', pca), ('svc', clf)]) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.predict_log_proba(X) + pipe.score(X, y) + + +def test_pipeline_resample(): + # Test whether pipeline works with a resampler at the end. + # Also test pipeline.fit_resample + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + + resampler = EllipticEnvelope(random_state=0) + pipeline = Pipeline([('resampler', resampler)]) + + # test transform and fit_transform: + X_trans, y_trans = pipeline.fit_resample(X, y) + X_trans2, y_trans2 = resampler.fit_resample(X, y) + assert_allclose(X_trans, X_trans2, rtol=R_TOL) + assert_allclose(y_trans, y_trans2, rtol=R_TOL) + + pca = PCA() + pipeline = Pipeline([('pca', PCA()), ('resampler', resampler)]) + + X_trans, y_trans = pipeline.fit_resample(X, y) + X_pca = pca.fit_transform(X) + X_trans2, y_trans2 = resampler.fit_resample(X_pca, y) + assert_allclose(X_trans, X_trans2, rtol=R_TOL) + assert_allclose(y_trans, y_trans2, rtol=R_TOL) + + +def test_pipeline_none_classifier(): + # Test pipeline using None as preprocessing step and a classifier + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + clf = LogisticRegression(solver='lbfgs', random_state=0) + pipe = make_pipeline(None, clf) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.decision_function(X) + pipe.score(X, y) + + +def test_pipeline_none_resampler_classifier(): + # Test pipeline using None, an OutlierResampler and a classifier + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + clf = LogisticRegression(solver='lbfgs', random_state=0) + outlier = EllipticEnvelope(random_state=0) + pipe = make_pipeline(None, outlier, clf) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.decision_function(X) + pipe.score(X, y) + + +def test_pipeline_resampler_none_classifier(): + # Test pipeline using an OutlierResampler, None and a classifier + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + 
n_samples=500, + random_state=0) + clf = LogisticRegression(solver='lbfgs', random_state=0) + outlier = EllipticEnvelope(random_state=0) + pipe = make_pipeline(outlier, None, clf) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.decision_function(X) + pipe.score(X, y) + + +def test_pipeline_none_resampler_resample(): + # Test pipeline using None step and a resampler + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + + outlier = EllipticEnvelope(random_state=0) + pipe = make_pipeline(None, outlier) + pipe.fit_resample(X, y) + + +def test_pipeline_none_transformer(): + # Test pipeline using None and a transformer that implements transform and + # inverse_transform + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + + pca = PCA(whiten=True) + pipe = make_pipeline(None, pca) + pipe.fit(X, y) + X_trans = pipe.transform(X) + X_inversed = pipe.inverse_transform(X_trans) + assert_array_almost_equal(X, X_inversed) + + +def test_pipeline_methods_anova_rus(): + # Test the various methods of the pipeline (anova). + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + # Test with RandomUnderSampling + Anova + LogisticRegression + clf = LogisticRegression(solver='lbfgs') + outlier = EllipticEnvelope(random_state=0) + filter1 = SelectKBest(f_classif, k=2) + pipe = Pipeline([('outlier', outlier), + ('anova', filter1), + ('logistic', clf)]) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.predict_log_proba(X) + pipe.score(X, y) + + +def test_pipeline_with_step_that_implements_both_sample_and_transform(): + # Test the various methods of the pipeline (anova). 
+ X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + + clf = LogisticRegression(solver='lbfgs') + with raises(TypeError): + Pipeline([('step', FitTransformResample()), ('logistic', clf)]) + + +def test_pipeline_fit_then_sample_with_resampler_last_estimator(): + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + + outlier1 = EllipticEnvelope(random_state=0) + outlier2 = IsolationForest(random_state=0) + pipeline = make_pipeline(outlier1, outlier2) + X_fit_resample_resampled, y_fit_resample_resampled = \ + pipeline.fit_resample(X, y) + pipeline = make_pipeline(outlier1, outlier2) + pipeline.fit(X, y) + X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) + assert_array_equal(X_fit_resample_resampled, X_fit_then_sample_res) + assert_array_equal(y_fit_resample_resampled, y_fit_then_sample_res) + + +def test_pipeline_fit_then_sample_3_resamplers_with_resampler_last_estimator(): + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0) + + outlier1 = EllipticEnvelope(random_state=0) + outlier2 = IsolationForest(random_state=0) + pipeline = make_pipeline(outlier2, outlier1, outlier2) + X_fit_resample, y_fit_resample = pipeline.fit_resample(X, y) + pipeline.fit(X, y) + X_fit_then_resample, y_fit_then_resample = pipeline.fit_resample(X, y) + + assert_array_equal(X_fit_resample, X_fit_then_resample) + assert_array_equal(y_fit_resample, y_fit_then_resample) + + def test_make_pipeline_memory(): cachedir = mkdtemp() if LooseVersion(joblib_version) < LooseVersion('0.12'): From 60708f4b7a92832317e1d54e4a046342446c83ce Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Tue, 26 Feb 2019 18:44:04 +0100 Subject: [PATCH 03/46] Remove resample module --- sklearn/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sklearn/__init__.py b/sklearn/__init__.py index 2de97d0e30843..aafc8a34b2a13 100644 --- a/sklearn/__init__.py +++ b/sklearn/__init__.py @@ -73,9 +73,8 @@ 'kernel_ridge', 'linear_model', 'manifold', 'metrics', 'mixture', 'model_selection', 'multiclass', 'multioutput', 'naive_bayes', 'neighbors', 'neural_network', 'pipeline', - 'preprocessing', 'random_projection', 'resample', - 'semi_supervised','svm', 'tree', 'discriminant_analysis', - 'impute', 'compose', + 'preprocessing', 'random_projection', 'semi_supervised', + 'svm', 'tree', 'discriminant_analysis', 'impute', 'compose', # Non-modules: 'clone', 'get_config', 'set_config', 'config_context', 'show_versions'] From 630ba2b676574a3e8891948537cfbd33af41f6a5 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 27 Feb 2019 16:16:04 +0100 Subject: [PATCH 04/46] More tests, more general _iter --- sklearn/pipeline.py | 69 +++++++++++++---------------- sklearn/tests/test_pipeline.py | 81 +++++++++++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 39 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index e4a087298089b..36e6e1f92b3fc 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -8,9 +8,9 @@ # Alexandre Gramfort # Lars Buitinck # License: BSD - from collections import defaultdict from itertools import 
islice +from functools import partial import numpy as np from scipy import sparse @@ -157,10 +157,13 @@ def _validate_steps(self): # validate estimators transformers = estimators[:-1] estimator = estimators[-1] + self._resamplers_exist = False for t in transformers: if t is None or t == 'passthrough': continue + if hasattr(t, "fit_resample"): + self._resamplers_exist = True if (not (hasattr(t, "fit") or hasattr(t, "fit_transform") or hasattr(t, "fit_resample")) or @@ -185,7 +188,8 @@ def _validate_steps(self): "or be the string 'passthrough'. " "'%s' (type %s) doesn't" % (estimator, type(estimator))) - def _iter(self, with_final=True): + + def _iter(self, with_final=True, with_resamplers=True): """ Generate (name, trans) tuples excluding 'passthrough' transformers """ @@ -194,9 +198,13 @@ def _iter(self, with_final=True): stop -= 1 for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)): - if trans is not None and trans != 'passthrough': + if trans is not None and ((trans != 'passthrough' and not + hasattr(trans, 'fit_resample')) or + (with_resamplers and + hasattr(trans, 'fit_resample'))): yield idx, name, trans + @property def _estimator_type(self): return self.steps[-1][1]._estimator_type @@ -333,7 +341,6 @@ def fit_transform(self, X, y=None, **fit_params): else: return last_step.fit(Xt, yt, **fit_params).transform(Xt) - @if_delegate_has_method(delegate='_final_estimator') def fit_resample(self, X, y=None, **fit_params): """Fit the model and sample with the final estimator @@ -367,9 +374,12 @@ def fit_resample(self, X, y=None, **fit_params): """ last_step = self._final_estimator Xt, yt, fit_params = self._fit(X, y, **fit_params) - if last_step == 'passthrough': - return Xt - return last_step.fit_resample(Xt, yt, **fit_params) + if hasattr(last_step, 'fit_resample'): + return last_step.fit_resample(Xt, yt, **fit_params) + elif last_step == 'passthrough': + return Xt, yt + else: + return last_step.fit_resample(Xt, yt, **fit_params) @if_delegate_has_method(delegate='_final_estimator') def predict(self, X, **predict_params): @@ -393,11 +403,7 @@ def predict(self, X, **predict_params): ------- y_pred : array-like """ - Xt = X - for _, name, transform in self._iter(with_final=False): - if hasattr(transform, "fit_resample"): - continue - Xt = transform.transform(Xt) + Xt = self._transform(X, with_final=False, with_resamplers=False) return self.steps[-1][-1].predict(Xt, **predict_params) @if_delegate_has_method(delegate='_final_estimator') @@ -427,6 +433,10 @@ def fit_predict(self, X, y=None, **fit_params): ------- y_pred : array-like """ + if self._resamplers_exist: + raise NotImplementedError("Pipelines containing resamplers that" + " have an estimator implementing fit_predict as their last stage " + "are currently not supported.") Xt, yt, fit_params = self._fit(X, y, **fit_params) return self.steps[-1][-1].fit_predict(Xt, yt, **fit_params) @@ -444,11 +454,7 @@ def predict_proba(self, X): ------- y_proba : array-like, shape = [n_samples, n_classes] """ - Xt = X - for _, name, transform in self._iter(with_final=False): - if hasattr(transform, "fit_resample"): - continue - Xt = transform.transform(Xt) + Xt = self._transform(X, with_resamplers=False, with_final=False) return self.steps[-1][-1].predict_proba(Xt) @if_delegate_has_method(delegate='_final_estimator') @@ -465,11 +471,7 @@ def decision_function(self, X): ------- y_score : array-like, shape = [n_samples, n_classes] """ - Xt = X - for _, name, transform in self._iter(with_final=False): - if hasattr(transform, "fit_resample"): - 
continue - Xt = transform.transform(Xt) + Xt = self._transform(X, with_final=False, with_resamplers=False) return self.steps[-1][-1].decision_function(Xt) @if_delegate_has_method(delegate='_final_estimator') @@ -486,11 +488,7 @@ def predict_log_proba(self, X): ------- y_score : array-like, shape = [n_samples, n_classes] """ - Xt = X - for _, name, transform in self._iter(with_final=False): - if hasattr(transform, "fit_resample"): - continue - Xt = transform.transform(Xt) + Xt = self._transform(X, with_final=False, with_resamplers=False) return self.steps[-1][-1].predict_log_proba(Xt) @property @@ -514,13 +512,12 @@ def transform(self): # XXX: Handling the None case means we can't use if_delegate_has_method if self._final_estimator != 'passthrough': self._final_estimator.transform - return self._transform + return partial(self._transform, with_final=True, with_resamplers=True) - def _transform(self, X): + def _transform(self, X, with_resamplers=True, with_final=True): Xt = X - for _, _, transform in self._iter(): - if hasattr(transform, "fit_resample"): - continue + for _, _, transform in self._iter(with_final=with_final, + with_resamplers=with_resamplers): Xt = transform.transform(Xt) return Xt @@ -552,7 +549,7 @@ def inverse_transform(self): def _inverse_transform(self, X): Xt = X - reverse_iter = reversed(list(self._iter())) + reverse_iter = reversed(list(self._iter(with_resamplers=False))) for _, _, transform in reverse_iter: Xt = transform.inverse_transform(Xt) return Xt @@ -579,11 +576,7 @@ def score(self, X, y=None, sample_weight=None): ------- score : float """ - Xt = X - for _, name, transform in self._iter(with_final=False): - if hasattr(transform, "fit_resample"): - continue - Xt = transform.transform(Xt) + Xt = self._transform(X, with_final=False, with_resamplers=False) score_params = {} if sample_weight is not None: score_params['sample_weight'] = sample_weight diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 4222e3f577229..fce084c4faa38 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1339,7 +1339,7 @@ def test_pipeline_none_transformer(): assert_array_almost_equal(X, X_inversed) -def test_pipeline_methods_anova_rus(): +def test_pipeline_methods_anova_outlier(): # Test the various methods of the pipeline (anova). X, y = make_classification( n_classes=2, @@ -1366,6 +1366,7 @@ def test_pipeline_methods_anova_rus(): pipe.score(X, y) + def test_pipeline_with_step_that_implements_both_sample_and_transform(): # Test the various methods of the pipeline (anova). 
X, y = make_classification( @@ -1447,3 +1448,81 @@ def test_make_pipeline_memory(): assert pipeline.memory is None shutil.rmtree(cachedir) + +def test_shape_correct_after_resample(): + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=50, + random_state=0) + + ell = EllipticEnvelope(random_state=0) + pipe = make_pipeline (ell, None) + + outliers = ell.fit_predict(X, y) == -1 + n_outliers = np.sum(outliers) + assert n_outliers > 0 # we have some outliers in the dataset + + X_new, y_new = pipe.fit_resample(X, y) + + assert X_new.shape[0] == X.shape[0] - n_outliers + assert y_new.shape[0] == y.shape[0] - n_outliers + +def test_resamplers_not_called(): + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=50, + random_state=0) + + mul2 = Mult(2) + dre = DummyResampler() + mul3 = Mult(3) + + pipe = make_pipeline(mul2, dre, mul3) + pipe.fit(X, y) + + assert hasattr(dre, "means_") + delattr(dre, "means_") + + pipe.predict(X) + assert not hasattr(dre, "means_") + + pipe.fit_transform(X, y) + assert hasattr(dre, "means_") + delattr(dre, "means_") + + pipe.fit(X, y) + assert hasattr(dre, "means_") + +def test_clusterer_and_resampler_error(): + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=10, + random_state=0) + + dre = DummyResampler() + pipe = make_pipeline(dre, KMeans()) + msg = "have an estimator implementing fit_predict as their last stage" + with pytest.raises(NotImplementedError, + match=msg): + pipe.fit_predict(X, y) From 96df7ab43a1b5b326fbe3b7847615e65c429c88f Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 27 Feb 2019 16:34:21 +0100 Subject: [PATCH 05/46] Fix flake --- sklearn/pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 36e6e1f92b3fc..8439cef4bf3be 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -188,7 +188,6 @@ def _validate_steps(self): "or be the string 'passthrough'. 
" "'%s' (type %s) doesn't" % (estimator, type(estimator))) - def _iter(self, with_final=True, with_resamplers=True): """ Generate (name, trans) tuples excluding 'passthrough' transformers @@ -204,7 +203,6 @@ def _iter(self, with_final=True, with_resamplers=True): hasattr(trans, 'fit_resample'))): yield idx, name, trans - @property def _estimator_type(self): return self.steps[-1][1]._estimator_type @@ -435,8 +433,9 @@ def fit_predict(self, X, y=None, **fit_params): """ if self._resamplers_exist: raise NotImplementedError("Pipelines containing resamplers that" - " have an estimator implementing fit_predict as their last stage " - "are currently not supported.") + " have an estimator implementing " + "fit_predict as their last stage " + "are currently not supported.") Xt, yt, fit_params = self._fit(X, y, **fit_params) return self.steps[-1][-1].fit_predict(Xt, yt, **fit_params) @@ -691,6 +690,7 @@ def _fit_transform_one(transformer, X, y, weight, **fit_params): return res, transformer return res * weight, transformer + def _fit_resample_one(sampler, X, y, **fit_params): X_res, y_res = sampler.fit_resample(X, y, **fit_params) From 638d147cea92d6fcf05bf0ece446dd33d106ddb1 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 27 Feb 2019 17:32:10 +0100 Subject: [PATCH 06/46] Remove some warnings --- sklearn/tests/test_pipeline.py | 38 ++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index fce084c4faa38..88fe349edd9d0 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -40,8 +40,8 @@ from sklearn.utils._joblib import Memory from sklearn.utils._joblib import __version__ as joblib_version -from sklearn.covariance import EllipticEnvelope -from sklearn.ensemble import IsolationForest +from sklearn.svm import OneClassSVM +from sklearn.neighbors import LocalOutlierFactor R_TOL = 1e-4 @@ -1160,7 +1160,7 @@ def test_pipeline_methods_pca_outlier_svm(): # Test with PCA + SVC clf = SVC(gamma='scale', probability=True, random_state=0) pca = PCA() - outlier = EllipticEnvelope(random_state=0) + outlier = OneClassSVM(gamma='scale') pipe = Pipeline([('pca', pca), ('outlier', outlier), ('svc', clf)]) pipe.fit(X, y) pipe.predict(X) @@ -1186,7 +1186,7 @@ def test_pipeline_methods_outlier_pca_svm(): # Test with PCA + SVC clf = SVC(gamma='scale', probability=True, random_state=0) pca = PCA() - outlier = EllipticEnvelope(random_state=0) + outlier = OneClassSVM(gamma='scale') pipe = Pipeline([('outlier', outlier), ('pca', pca), ('svc', clf)]) pipe.fit(X, y) pipe.predict(X) @@ -1210,7 +1210,7 @@ def test_pipeline_resample(): n_samples=500, random_state=0) - resampler = EllipticEnvelope(random_state=0) + resampler = OneClassSVM(gamma='scale') pipeline = Pipeline([('resampler', resampler)]) # test transform and fit_transform: @@ -1265,7 +1265,7 @@ def test_pipeline_none_resampler_classifier(): n_samples=500, random_state=0) clf = LogisticRegression(solver='lbfgs', random_state=0) - outlier = EllipticEnvelope(random_state=0) + outlier = OneClassSVM(gamma='scale') pipe = make_pipeline(None, outlier, clf) pipe.fit(X, y) pipe.predict(X) @@ -1288,7 +1288,7 @@ def test_pipeline_resampler_none_classifier(): n_samples=500, random_state=0) clf = LogisticRegression(solver='lbfgs', random_state=0) - outlier = EllipticEnvelope(random_state=0) + outlier = OneClassSVM(gamma='scale') pipe = make_pipeline(outlier, None, clf) pipe.fit(X, y) pipe.predict(X) @@ -1311,7 +1311,7 @@ def 
test_pipeline_none_resampler_resample(): n_samples=500, random_state=0) - outlier = EllipticEnvelope(random_state=0) + outlier = OneClassSVM(gamma='scale') pipe = make_pipeline(None, outlier) pipe.fit_resample(X, y) @@ -1354,7 +1354,7 @@ def test_pipeline_methods_anova_outlier(): random_state=0) # Test with RandomUnderSampling + Anova + LogisticRegression clf = LogisticRegression(solver='lbfgs') - outlier = EllipticEnvelope(random_state=0) + outlier = OneClassSVM(gamma='scale') filter1 = SelectKBest(f_classif, k=2) pipe = Pipeline([('outlier', outlier), ('anova', filter1), @@ -1366,7 +1366,6 @@ def test_pipeline_methods_anova_outlier(): pipe.score(X, y) - def test_pipeline_with_step_that_implements_both_sample_and_transform(): # Test the various methods of the pipeline (anova). X, y = make_classification( @@ -1399,8 +1398,8 @@ def test_pipeline_fit_then_sample_with_resampler_last_estimator(): n_samples=500, random_state=0) - outlier1 = EllipticEnvelope(random_state=0) - outlier2 = IsolationForest(random_state=0) + outlier1 = OneClassSVM(gamma='scale') + outlier2 = LocalOutlierFactor(contamination=0.1) pipeline = make_pipeline(outlier1, outlier2) X_fit_resample_resampled, y_fit_resample_resampled = \ pipeline.fit_resample(X, y) @@ -1424,8 +1423,8 @@ def test_pipeline_fit_then_sample_3_resamplers_with_resampler_last_estimator(): n_samples=500, random_state=0) - outlier1 = EllipticEnvelope(random_state=0) - outlier2 = IsolationForest(random_state=0) + outlier1 = OneClassSVM(gamma='scale') + outlier2 = LocalOutlierFactor(contamination=0.1) pipeline = make_pipeline(outlier2, outlier1, outlier2) X_fit_resample, y_fit_resample = pipeline.fit_resample(X, y) pipeline.fit(X, y) @@ -1449,6 +1448,7 @@ def test_make_pipeline_memory(): shutil.rmtree(cachedir) + def test_shape_correct_after_resample(): X, y = make_classification( n_classes=2, @@ -1462,18 +1462,19 @@ def test_shape_correct_after_resample(): n_samples=50, random_state=0) - ell = EllipticEnvelope(random_state=0) - pipe = make_pipeline (ell, None) + outlier = OneClassSVM(gamma='scale') + pipe = make_pipeline(outlier, None) - outliers = ell.fit_predict(X, y) == -1 + outliers = outlier.fit_predict(X, y) == -1 n_outliers = np.sum(outliers) - assert n_outliers > 0 # we have some outliers in the dataset + assert n_outliers > 0 # we have some outliers in the dataset X_new, y_new = pipe.fit_resample(X, y) assert X_new.shape[0] == X.shape[0] - n_outliers assert y_new.shape[0] == y.shape[0] - n_outliers + def test_resamplers_not_called(): X, y = make_classification( n_classes=2, @@ -1507,6 +1508,7 @@ def test_resamplers_not_called(): pipe.fit(X, y) assert hasattr(dre, "means_") + def test_clusterer_and_resampler_error(): X, y = make_classification( n_classes=2, From ad547747a6ac411dc521dde71e4f55675244e4ec Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 27 Feb 2019 22:16:39 +0100 Subject: [PATCH 07/46] Remove props code --- sklearn/base.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index f1264a49dd929..4bfb6ad349be9 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -579,20 +579,20 @@ def fit_resample(self, X, y, props=None): if props is not None: raise NotImplementedError('props is not supported for now') - if props is None: - props = {} + # if props is None: + # props = {} # filter out unrequired args - required_props = filter(lambda x: x in inspect.signature(super().fit), - props.keys()) - filtered_props = {k: props[k] for k in required_props} + # required_props 
= filter(lambda x: x in inspect.signature(super().fit), + # props.keys()) + # filtered_props = {k: props[k] for k in required_props} - inliers = self.fit_predict(X) == 1 + # inliers = self.fit_predict(X) == 1 - props = {prop[inliers == 1] for prop in props} + # props = {prop[inliers == 1] for prop in props} - if props: - return X[inliers], y[inliers], props + # if props: + # return X[inliers], y[inliers], props return X[inliers], y[inliers] From 4254a35f2781626f3c6ada2a0cdb5e0b220081cf Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Thu, 28 Feb 2019 12:19:23 +0100 Subject: [PATCH 08/46] Add tests to tests_common.py --- sklearn/utils/estimator_checks.py | 50 +++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 9ddc8a8036062..fafb39928d876 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -216,6 +216,9 @@ def _yield_outliers_checks(name, estimator): if hasattr(estimator, 'fit_predict'): yield check_outliers_fit_predict + if hasattr(estimator, 'fit_resample'): + yield check_outlier_resamplers + # checks for estimators that can be used on a test set if hasattr(estimator, 'predict'): yield check_outliers_train @@ -226,6 +229,12 @@ def _yield_outliers_checks(name, estimator): yield check_estimators_unfitted +def _yield_resamplers_checks(name, estimator): + yield check_resampler_structure + yield check_resamplers_have_no_transform + yield check_resample_repeated + + def _yield_all_checks(name, estimator): tags = _safe_tags(estimator) if "2darray" not in tags["X_types"]: @@ -250,6 +259,9 @@ def _yield_all_checks(name, estimator): if hasattr(estimator, 'transform'): for check in _yield_transformer_checks(name, estimator): yield check + if hasattr(estimator, 'fit_resample'): + for check in _yield_resamplers_checks(name, estimator): + yield check if isinstance(estimator, ClusterMixin): for check in _yield_clustering_checks(name, estimator): yield check @@ -2470,3 +2482,41 @@ def check_fit_idempotent(name, estimator_orig): if hasattr(estimator, method): new_result = getattr(estimator, method)(X_test) assert_allclose_dense_sparse(result[method], new_result) + + +def check_outlier_resamplers(name, estimator_orig): + X, y = make_blobs(random_state=0) + outliers = estimator_orig.fit_predict(X, y) == -1 + n_outliers = np.sum(outliers) + + X_new, y_new = estimator_orig.fit_resample(X, y) + + assert X_new.shape[0] == X.shape[0] - n_outliers + assert y_new.shape[0] == y.shape[0] - n_outliers + + +def check_resampler_structure(name, estimator_orig): + X, y = make_blobs(n_samples=10) + X_new, y_new = estimator_orig.fit_resample(X, y) + + props = { + 'weight': np.arange(10), + 'other': np.arange(10), + 'more': np.arange(10) + } + X_new, y_new, props_new = estimator_orig.fit_resample(X, y, props) + assert props.key() == props_new.keys() + + +def check_resample_repeated(name, estimator_orig): + X, y = make_blobs(n_samples=10) + X_new, y_new = estimator_orig.fit_resample(X, y) + X_new2, y_new2 = estimator_orig.fit_resample(X, y) + + assert_array_equal(X_new, X_new2) + assert_array_equal(y_new, y_new2) + + +def check_resamplers_have_no_transform(name, estimator_orig): + assert not hasattr(estimator_orig, 'transform') + assert not hasattr(estimator_orig, 'fit_transform') From 561c47a63675d0e56319727765adff2f5833ee0d Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Thu, 28 Feb 2019 12:26:58 +0100 Subject: [PATCH 09/46] Changes to fit_resample --- sklearn/base.py | 19 
++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 4bfb6ad349be9..0766a5410c3dc 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -6,7 +6,6 @@ import copy import warnings from collections import defaultdict -import struct import inspect import numpy as np @@ -577,22 +576,12 @@ def fit_resample(self, X, y, props=None): `props`, but outlier samples are removed for each each parameter. """ - if props is not None: - raise NotImplementedError('props is not supported for now') - # if props is None: - # props = {} + inliers = self.fit_predict(X) == 1 - # filter out unrequired args - # required_props = filter(lambda x: x in inspect.signature(super().fit), - # props.keys()) - # filtered_props = {k: props[k] for k in required_props} + if props: + props = {prop[inliers == 1] for prop in props} + return X[inliers], y[inliers], props - # inliers = self.fit_predict(X) == 1 - - # props = {prop[inliers == 1] for prop in props} - - # if props: - # return X[inliers], y[inliers], props return X[inliers], y[inliers] From 33f1fe63c9e1d1074191ca308d203fed958dbd7f Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Thu, 28 Feb 2019 13:59:43 +0100 Subject: [PATCH 10/46] Remove nondeterminism --- sklearn/utils/estimator_checks.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index fafb39928d876..ea2adf1ccc743 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2510,7 +2510,10 @@ def check_resampler_structure(name, estimator_orig): def check_resample_repeated(name, estimator_orig): X, y = make_blobs(n_samples=10) + + set_random_state(estimator_orig, random_state=0) X_new, y_new = estimator_orig.fit_resample(X, y) + set_random_state(estimator_orig, random_state=0) X_new2, y_new2 = estimator_orig.fit_resample(X, y) assert_array_equal(X_new, X_new2) From 4c0aea8d88007cee92e0193caf5914d16f764bd2 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Thu, 28 Feb 2019 18:50:17 +0100 Subject: [PATCH 11/46] Remove `sample_props` and fix pipeline docstrings --- sklearn/base.py | 12 +---- sklearn/pipeline.py | 89 +++++++++++++++++-------------- sklearn/utils/estimator_checks.py | 8 --- 3 files changed, 49 insertions(+), 60 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index d093789561cf3..d8f863d9f7ac8 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -550,7 +550,7 @@ class OutlierResamplerMixin: """ _estimator_type = "outlier_resampler" - def fit_resample(self, X, y, props=None): + def fit_resample(self, X, y): """Performs fit on X and returns new X and y consisting of only the inliers. @@ -562,9 +562,6 @@ def fit_resample(self, X, y, props=None): y : ndarray, shape (n_samples,) Input data y. - props : dict of ndarrays, each ndarray has shape (n_samples,), optional - dict of params that are passed to fit. - Returns ------- X : ndarray, shape (n_samples, n_features) @@ -572,17 +569,10 @@ def fit_resample(self, X, y, props=None): y : ndarray, shape (n_samples,) The input y with outlier samples removed. - - props : dict of ndarrays, each ndarray has shape (n_samples,) - `props`, but outlier samples are removed for each each parameter. 
""" inliers = self.fit_predict(X) == 1 - if props: - props = {prop[inliers == 1] for prop in props} - return X[inliers], y[inliers], props - return X[inliers], y[inliers] diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 8439cef4bf3be..5b3d4260e3472 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -30,10 +30,10 @@ class Pipeline(_BaseComposition): """Pipeline of transforms with a final estimator. Sequentially apply a list of transforms and a final estimator. - Intermediate steps of the pipeline must be 'transforms', that is, they - must implement fit and transform methods. - The final estimator only needs to implement fit. - The transformers in the pipeline can be cached using ``memory`` argument. + Intermediate steps of the pipeline must be transformers or resamplers, that + is, they must implement `fit` and `transform` methods, or a `fit_resample` + method. The final estimator only needs to implement `fit`. + The transformers in the pipeline can be cached using `memory` argument. The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. @@ -41,7 +41,7 @@ class Pipeline(_BaseComposition): names and the parameter name separated by a '__', as in the example below. A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting - it to 'passthrough' or ``None``. + it to 'passthrough' or `None`. Read more in the :ref:`User Guide `. @@ -275,8 +275,9 @@ def _fit(self, X, y=None, **fit_params): def fit(self, X, y=None, **fit_params): """Fit the model - Fit all the transforms one after the other and transform the - data, then fit the transformed data using the final estimator. + Fit all the transformers/resamplers one after the other and + transform/resample the data, then fit the transformed/resampled data + using the final estimator. Parameters ---------- @@ -306,9 +307,9 @@ def fit(self, X, y=None, **fit_params): def fit_transform(self, X, y=None, **fit_params): """Fit the model and transform with the final estimator - Fits all the transforms one after the other and transforms the - data, then uses fit_transform on transformed data with the final - estimator. + Fits all the transformers/resamplers one after the other and + transforms/resamples the data, then uses fit_transform on the + transformed/resampled data with the final estimator. Parameters ---------- @@ -327,7 +328,7 @@ def fit_transform(self, X, y=None, **fit_params): Returns ------- - Xt : array-like, shape = [n_samples, n_transformed_features] + Xt : array-like, shape = [n_resampled_samples, n_transformed_features] Transformed samples """ last_step = self._final_estimator @@ -342,9 +343,9 @@ def fit_transform(self, X, y=None, **fit_params): def fit_resample(self, X, y=None, **fit_params): """Fit the model and sample with the final estimator - Fits all the transformers/samplers one after the other and - transform/sample the data, then uses fit_resample on transformed - data with the final estimator. + Fits all the transformers/resamplers one after the other and + transforms/resamples the data, then uses fit_resample on the + transformed/resampled data with the final estimator. 
Parameters
         ----------
@@ -363,10 +364,10 @@ def fit_resample(self, X, y=None, **fit_params):
 
         Returns
         -------
-        Xt : array-like, shape = [n_samples, n_transformed_features]
+        Xt : array-like, shape = [n_resampled_samples, n_transformed_features]
             Transformed samples
 
-        yt : array-like, shape = [n_samples, n_transformed_features]
+        yt : array-like, shape = [n_resampled_samples, n_transformed_features]
             Transformed target
 
         """
@@ -381,7 +382,8 @@ def fit_resample(self, X, y=None, **fit_params):
 
     @if_delegate_has_method(delegate='_final_estimator')
     def predict(self, X, **predict_params):
-        """Apply transforms to the data, and predict with the final estimator
+        """Apply transforms to the data, and predict with the final estimator.
+        Note that resamplers are not applied.
 
         Parameters
         ----------
@@ -406,11 +408,11 @@ def predict(self, X, **predict_params):
 
     @if_delegate_has_method(delegate='_final_estimator')
     def fit_predict(self, X, y=None, **fit_params):
-        """Applies fit_predict of last step in pipeline after transforms.
+        """Applies fit_transforms of a pipeline to the data, followed by the
+        fit_predict method of the final estimator in the pipeline.
 
-        Applies fit_transforms of a pipeline to the data, followed by the
-        fit_predict method of the final estimator in the pipeline. Valid
-        only if the final estimator implements fit_predict.
+        Calling this method on a pipeline containing a resampler is
+        unsupported.
 
         Parameters
         ----------
@@ -441,7 +443,9 @@ def fit_predict(self, X, y=None, **fit_params):
 
     @if_delegate_has_method(delegate='_final_estimator')
     def predict_proba(self, X):
-        """Apply transforms, and predict_proba of the final estimator
+        """Apply transforms, and predict_proba of the final estimator. Note
+        that resamplers are not applied.
+
 
         Parameters
         ----------
@@ -458,7 +462,8 @@ def predict_proba(self, X):
 
     @if_delegate_has_method(delegate='_final_estimator')
     def decision_function(self, X):
-        """Apply transforms, and decision_function of the final estimator
+        """Apply transforms, and decision_function of the final estimator. Note
+        that resamplers are not applied.
 
         Parameters
         ----------
@@ -475,7 +480,8 @@ def decision_function(self, X):
 
     @if_delegate_has_method(delegate='_final_estimator')
     def predict_log_proba(self, X):
-        """Apply transforms, and predict_log_proba of the final estimator
+        """Apply transforms, and predict_log_proba of the final estimator. Note
+        that resamplers are not applied.
 
         Parameters
         ----------
@@ -492,9 +498,9 @@ def predict_log_proba(self, X):
 
     @property
     def transform(self):
-        """Apply transforms, and transform with the final estimator
+        """Apply transforms/resamples, and transform with the final estimator.
 
-        This also works where final estimator is ``None``: all prior
+        This also works where final estimator is `None`: all prior
         transformations are applied.
 
         Parameters
         ----------
@@ -505,7 +511,7 @@ def transform(self):
 
         Returns
         -------
-        Xt : array-like, shape = [n_samples, n_transformed_features]
+        Xt : array-like, shape = [n_resampled_samples, n_transformed_features]
         """
         # _final_estimator is None or has transform, otherwise attribute error
         # XXX: Handling the None case means we can't use if_delegate_has_method
@@ -522,9 +528,11 @@ def _transform(self, X, with_resamplers=True, with_final=True):
 
     @property
     def inverse_transform(self):
-        """Apply inverse transformations in reverse order
+        """Apply inverse transformations in reverse order. Note that resamplers
+        are skipped.
 
-        All estimators in the pipeline must support ``inverse_transform``.
+ All estimators in the pipeline, except resamplers, must support + `inverse_transform`. Parameters ---------- @@ -540,9 +548,7 @@ def inverse_transform(self): """ # raise AttributeError if necessary for hasattr behaviour # XXX: Handling the None case means we can't use if_delegate_has_method - for _, _, transform in self._iter(): - if hasattr(transform, "fit_resample"): - continue + for _, _, transform in self._iter(with_resamplers=False): transform.inverse_transform return self._inverse_transform @@ -555,7 +561,8 @@ def _inverse_transform(self, X): @if_delegate_has_method(delegate='_final_estimator') def score(self, X, y=None, sample_weight=None): - """Apply transforms, and score with the final estimator + """Apply transforms, and score with the final estimator. Note that + resamplers are not applied. Parameters ---------- @@ -628,14 +635,14 @@ def make_pipeline(*steps, **kwargs): *steps : list of estimators. memory : None, str or object with the joblib.Memory interface, optional - Used to cache the fitted transformers of the pipeline. By default, - no caching is performed. If a string is given, it is the path to - the caching directory. Enabling caching triggers a clone of - the transformers before fitting. Therefore, the transformer - instance given to the pipeline cannot be inspected - directly. Use the attribute ``named_steps`` or ``steps`` to - inspect estimators within the pipeline. Caching the - transformers is advantageous when fitting is time consuming. + Used to cache the fitted transformers of the pipeline. By default, no + caching is performed. If a string is given, it is the path to the + caching directory. Enabling caching triggers a clone of the + transformers/resamplers before fitting. Therefore, the estimator + instance given to the pipeline cannot be inspected directly. Use the + attribute `named_steps` or `steps` to inspect estimators within the + pipeline. Caching the estimators is advantageous when fitting is time + consuming. 
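As a minimal sketch of the caching pattern described in the docstring above (assuming this branch's API, where ``OneClassSVM`` provides ``fit_resample`` and can sit in a pipeline as a resampling step; the cache directory name is illustrative)::

    from tempfile import mkdtemp
    from joblib import Memory
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.svm import OneClassSVM

    X, y = make_classification(random_state=0)
    cachedir = mkdtemp()  # illustrative cache location
    pipe = make_pipeline(OneClassSVM(gamma='scale'),
                         LogisticRegression(solver='lbfgs'),
                         memory=Memory(cachedir, verbose=0))
    pipe.fit(X, y)  # fitted steps are cached under cachedir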
See also
     --------
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index ea2adf1ccc743..92dbb8a0b30ef 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -2499,14 +2499,6 @@ def check_resampler_structure(name, estimator_orig):
     X, y = make_blobs(n_samples=10)
     X_new, y_new = estimator_orig.fit_resample(X, y)
 
-    props = {
-        'weight': np.arange(10),
-        'other': np.arange(10),
-        'more': np.arange(10)
-    }
-    X_new, y_new, props_new = estimator_orig.fit_resample(X, y, props)
-    assert props.keys() == props_new.keys()
-
 
 def check_resample_repeated(name, estimator_orig):
     X, y = make_blobs(n_samples=10)

From 8339128d5beb6a3472bb45852f14bb44766d88fd Mon Sep 17 00:00:00 2001
From: Oliver Rausch
Date: Fri, 1 Mar 2019 08:30:34 +0100
Subject: [PATCH 12/46] Overall fixes

---
 sklearn/pipeline.py            |   9 ++-
 sklearn/tests/test_pipeline.py | 109 +++++++++++++++++----------------
 2 files changed, 61 insertions(+), 57 deletions(-)

diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 5b3d4260e3472..13a94565b3db4 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -197,10 +197,9 @@ def _iter(self, with_final=True, with_resamplers=True):
             stop -= 1
 
         for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)):
-            if trans is not None and ((trans != 'passthrough' and not
-                                       hasattr(trans, 'fit_resample')) or
-                                      (with_resamplers and
-                                       hasattr(trans, 'fit_resample'))):
+            if trans is not None and trans != 'passthrough' \
+                and (not hasattr(trans, 'fit_resample') or \
+                    (with_resamplers and hasattr(trans, 'fit_resample'))):
                 yield idx, name, trans
 
     @property
@@ -341,7 +340,7 @@ def fit_transform(self, X, y=None, **fit_params):
             return last_step.fit(Xt, yt, **fit_params).transform(Xt)
 
     def fit_resample(self, X, y=None, **fit_params):
-        """Fit the model and sample with the final estimator
+        """Fit the model and resample with the final estimator
 
         Fits all the transformers/resamplers one after the other and
         transforms/resamples the data, then uses fit_resample on the
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py
index 88fe349edd9d0..91ba8c256069f 100644
--- a/sklearn/tests/test_pipeline.py
+++ b/sklearn/tests/test_pipeline.py
@@ -19,7 +19,6 @@
 from sklearn.utils.testing import assert_raise_message
 from sklearn.utils.testing import assert_equal
 from sklearn.utils.testing import assert_array_equal
-from sklearn.utils.testing import assert_allclose
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_dict_equal
 from sklearn.utils.testing import assert_no_warnings
@@ -701,6 +700,14 @@ def make():
     assert_array_equal([exp], pipeline.fit(X).predict(X))
     assert_array_equal(X, pipeline.inverse_transform([[exp]]))
 
+    # TODO need to investigate this
+    # exp = 2 * 3
+    # pipeline = Pipeline(
+    #     [('m2', mult2), ('m3', mult3), ('last', passthrough)])
+    # assert_array_equal([[exp]], pipeline.fit_transform(X, y))
+    # assert_array_equal([exp], pipeline.fit(X).predict(X))
+    # assert_array_equal(X, pipeline.inverse_transform([[exp]]))
+
 
 def test_pipeline_ducktyping():
     pipeline = make_pipeline(Mult(5))
@@ -1216,8 +1223,8 @@ def test_pipeline_resample():
     # test transform and fit_transform:
     X_trans, y_trans = pipeline.fit_resample(X, y)
     X_trans2, y_trans2 = resampler.fit_resample(X, y)
-    assert_allclose(X_trans, X_trans2, rtol=R_TOL)
-    assert_allclose(y_trans, y_trans2, rtol=R_TOL)
+    assert_array_equal(X_trans, X_trans2)
+    assert_array_equal(y_trans, y_trans2)
 
     pca = PCA()
     pipeline = 
Pipeline([('pca', PCA()), ('resampler', resampler)]) @@ -1225,11 +1232,12 @@ def test_pipeline_resample(): X_trans, y_trans = pipeline.fit_resample(X, y) X_pca = pca.fit_transform(X) X_trans2, y_trans2 = resampler.fit_resample(X_pca, y) - assert_allclose(X_trans, X_trans2, rtol=R_TOL) - assert_allclose(y_trans, y_trans2, rtol=R_TOL) + assert_array_equal(X_trans, X_trans2) + assert_array_equal(y_trans, y_trans2) -def test_pipeline_none_classifier(): +@pytest.mark.parametrize('passthrough', [None, 'passthrough']) +def test_pipeline_none_classifier(passthrough): # Test pipeline using None as preprocessing step and a classifier X, y = make_classification( n_classes=2, @@ -1243,7 +1251,7 @@ def test_pipeline_none_classifier(): n_samples=500, random_state=0) clf = LogisticRegression(solver='lbfgs', random_state=0) - pipe = make_pipeline(None, clf) + pipe = make_pipeline(passthrough, clf) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) @@ -1251,7 +1259,8 @@ def test_pipeline_none_classifier(): pipe.score(X, y) -def test_pipeline_none_resampler_classifier(): +@pytest.mark.parametrize('passthrough', [None, 'passthrough']) +def test_pipeline_none_resampler_classifier(passthrough): # Test pipeline using None, an OutlierResampler and a classifier X, y = make_classification( n_classes=2, @@ -1266,7 +1275,7 @@ def test_pipeline_none_resampler_classifier(): random_state=0) clf = LogisticRegression(solver='lbfgs', random_state=0) outlier = OneClassSVM(gamma='scale') - pipe = make_pipeline(None, outlier, clf) + pipe = make_pipeline(passthrough, outlier, clf) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) @@ -1274,7 +1283,8 @@ def test_pipeline_none_resampler_classifier(): pipe.score(X, y) -def test_pipeline_resampler_none_classifier(): +@pytest.mark.parametrize('passthrough', [None, 'passthrough']) +def test_pipeline_resampler_none_classifier(passthrough): # Test pipeline using an OutlierResampler, None and a classifier X, y = make_classification( n_classes=2, @@ -1289,7 +1299,7 @@ def test_pipeline_resampler_none_classifier(): random_state=0) clf = LogisticRegression(solver='lbfgs', random_state=0) outlier = OneClassSVM(gamma='scale') - pipe = make_pipeline(outlier, None, clf) + pipe = make_pipeline(outlier, passthrough, clf) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) @@ -1297,7 +1307,8 @@ def test_pipeline_resampler_none_classifier(): pipe.score(X, y) -def test_pipeline_none_resampler_resample(): +@pytest.mark.parametrize('passthrough', [None, 'passthrough']) +def test_pipeline_none_resampler_resample(passthrough): # Test pipeline using None step and a resampler X, y = make_classification( n_classes=2, @@ -1312,11 +1323,12 @@ def test_pipeline_none_resampler_resample(): random_state=0) outlier = OneClassSVM(gamma='scale') - pipe = make_pipeline(None, outlier) + pipe = make_pipeline(passthrough, outlier) pipe.fit_resample(X, y) -def test_pipeline_none_transformer(): +@pytest.mark.parametrize('passthrough', [None, 'passthrough']) +def test_pipeline_none_transformer(passthrough): # Test pipeline using None and a transformer that implements transform and # inverse_transform X, y = make_classification( @@ -1332,7 +1344,7 @@ def test_pipeline_none_transformer(): random_state=0) pca = PCA(whiten=True) - pipe = make_pipeline(None, pca) + pipe = make_pipeline(passthrough, pca) pipe.fit(X, y) X_trans = pipe.transform(X) X_inversed = pipe.inverse_transform(X_trans) @@ -1352,7 +1364,7 @@ def test_pipeline_methods_anova_outlier(): n_clusters_per_class=1, n_samples=500, random_state=0) - # Test 
with RandomUnderSampling + Anova + LogisticRegression + # Test with outlierdetection + Anova + LogisticRegression clf = LogisticRegression(solver='lbfgs') outlier = OneClassSVM(gamma='scale') filter1 = SelectKBest(f_classif, k=2) @@ -1381,7 +1393,7 @@ def test_pipeline_with_step_that_implements_both_sample_and_transform(): random_state=0) clf = LogisticRegression(solver='lbfgs') - with raises(TypeError): + with raises(TypeError, match='should be estimators that implement'): Pipeline([('step', FitTransformResample()), ('logistic', clf)]) @@ -1449,44 +1461,29 @@ def test_make_pipeline_memory(): shutil.rmtree(cachedir) -def test_shape_correct_after_resample(): - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=50, - random_state=0) +@pytest.mark.parametrize('passthrough', [None, 'passthrough']) +def test_outlier_shape_correct_after_resample(passthrough): + X, y = make_classification() outlier = OneClassSVM(gamma='scale') - pipe = make_pipeline(outlier, None) + pca = PCA() + pipe = make_pipeline(outlier, passthrough) + pipe2 = make_pipeline(outlier, pca) outliers = outlier.fit_predict(X, y) == -1 n_outliers = np.sum(outliers) assert n_outliers > 0 # we have some outliers in the dataset X_new, y_new = pipe.fit_resample(X, y) + X_new2 = pipe2.fit_transform(X, y) assert X_new.shape[0] == X.shape[0] - n_outliers assert y_new.shape[0] == y.shape[0] - n_outliers + assert X_new2.shape[0] == X.shape[0] - n_outliers def test_resamplers_not_called(): - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=50, - random_state=0) + X, y = make_classification(n_samples=10) mul2 = Mult(2) dre = DummyResampler() @@ -1499,6 +1496,7 @@ def test_resamplers_not_called(): delattr(dre, "means_") pipe.predict(X) + pipe.score(X) assert not hasattr(dre, "means_") pipe.fit_transform(X, y) @@ -1510,17 +1508,7 @@ def test_resamplers_not_called(): def test_clusterer_and_resampler_error(): - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=10, - random_state=0) + X, y = make_classification(n_samples=10) dre = DummyResampler() pipe = make_pipeline(dre, KMeans()) @@ -1528,3 +1516,20 @@ def test_clusterer_and_resampler_error(): with pytest.raises(NotImplementedError, match=msg): pipe.fit_predict(X, y) + +@pytest.mark.parametrize('passthrough', [None, 'passthrough']) +def test_pipe_exposes_resample_correctly(passthrough): + # this test will be handled by test_metaestimators later, it's just here + # now for simplicity + # TODO make this test pass (maybe something similar is also broken for + # fit_transform) + X, y = make_classification(n_samples=10) + + dre = DummyResampler() + mul3 = Mult(3) + pipe = make_pipeline(dre, mul3) + pipe2 = make_pipeline(dre, passthrough) + + #pipe2 should have fit_resample, pipe shouldn't + assert not hasattr(pipe, 'fit_resample') + assert hasattr(pipe2, 'fit_resample') From d9ba54fd25689abfe018a1803548a1d02616d6cc Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Fri, 1 Mar 2019 10:07:52 +0100 Subject: [PATCH 13/46] Rename OutlierResampler -> OutlierRejection --- sklearn/base.py | 2 +- sklearn/covariance/elliptic_envelope.py | 4 ++-- sklearn/ensemble/iforest.py | 4 ++-- sklearn/neighbors/lof.py | 4 ++-- 
sklearn/svm/classes.py | 4 ++-- sklearn/tests/test_pipeline.py | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index d8f863d9f7ac8..a588988421a41 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -544,7 +544,7 @@ def fit_predict(self, X, y=None): return self.fit(X).predict(X) -class OutlierResamplerMixin: +class OutlierRejectionMixin: """Mixin class for all outlier detection resamplers in scikit-learn. Child classes remove outliers from the passed samples. """ diff --git a/sklearn/covariance/elliptic_envelope.py b/sklearn/covariance/elliptic_envelope.py index 8f94e61c2f4da..c683ae93fc873 100644 --- a/sklearn/covariance/elliptic_envelope.py +++ b/sklearn/covariance/elliptic_envelope.py @@ -7,10 +7,10 @@ from . import MinCovDet from ..utils.validation import check_is_fitted, check_array from ..metrics import accuracy_score -from ..base import OutlierMixin, OutlierResamplerMixin +from ..base import OutlierMixin, OutlierRejectionMixin -class EllipticEnvelope(MinCovDet, OutlierMixin, OutlierResamplerMixin): +class EllipticEnvelope(MinCovDet, OutlierMixin, OutlierRejectionMixin): """An object for detecting outliers in a Gaussian distributed dataset. Read more in the :ref:`User Guide `. diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py index d6938f1372753..97e732450c426 100644 --- a/sklearn/ensemble/iforest.py +++ b/sklearn/ensemble/iforest.py @@ -12,7 +12,7 @@ from ..utils import check_random_state, check_array from ..utils.fixes import _joblib_parallel_args from ..utils.validation import check_is_fitted -from ..base import OutlierMixin, OutlierResamplerMixin +from ..base import OutlierMixin, OutlierRejectionMixin from .bagging import BaseBagging @@ -21,7 +21,7 @@ INTEGER_TYPES = (numbers.Integral, np.integer) -class IsolationForest(BaseBagging, OutlierMixin, OutlierResamplerMixin): +class IsolationForest(BaseBagging, OutlierMixin, OutlierRejectionMixin): """Isolation Forest Algorithm Return the anomaly score of each sample using the IsolationForest algorithm diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index b0edd9a718176..3c5a6b6946bb0 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -8,7 +8,7 @@ from .base import NeighborsBase from .base import KNeighborsMixin from .base import UnsupervisedMixin -from ..base import OutlierMixin, OutlierResamplerMixin +from ..base import OutlierMixin, OutlierRejectionMixin from ..utils.validation import check_is_fitted from ..utils import check_array @@ -17,7 +17,7 @@ class LocalOutlierFactor(NeighborsBase, KNeighborsMixin, UnsupervisedMixin, - OutlierMixin, OutlierResamplerMixin): + OutlierMixin, OutlierRejectionMixin): """Unsupervised Outlier Detection using Local Outlier Factor (LOF) The anomaly score of each sample is called Local Outlier Factor. 
diff --git a/sklearn/svm/classes.py b/sklearn/svm/classes.py index 1024da5a4d7dd..0be0dd3e01f71 100644 --- a/sklearn/svm/classes.py +++ b/sklearn/svm/classes.py @@ -3,7 +3,7 @@ from .base import _fit_liblinear, BaseSVC, BaseLibSVM from ..base import (BaseEstimator, RegressorMixin, OutlierMixin, - OutlierResamplerMixin) + OutlierRejectionMixin) from ..linear_model.base import LinearClassifierMixin, SparseCoefMixin, \ LinearModel from ..utils import check_X_y @@ -1040,7 +1040,7 @@ def __init__(self, nu=0.5, C=1.0, kernel='rbf', degree=3, verbose=verbose, max_iter=max_iter, random_state=None) -class OneClassSVM(BaseLibSVM, OutlierMixin, OutlierResamplerMixin): +class OneClassSVM(BaseLibSVM, OutlierMixin, OutlierRejectionMixin): """Unsupervised Outlier Detection. Estimate the support of a high-dimensional distribution. diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 91ba8c256069f..042a457ad249e 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1261,7 +1261,7 @@ def test_pipeline_none_classifier(passthrough): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_none_resampler_classifier(passthrough): - # Test pipeline using None, an OutlierResampler and a classifier + # Test pipeline using None, an outlier rejector and a classifier X, y = make_classification( n_classes=2, class_sep=2, @@ -1285,7 +1285,7 @@ def test_pipeline_none_resampler_classifier(passthrough): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_resampler_none_classifier(passthrough): - # Test pipeline using an OutlierResampler, None and a classifier + # Test pipeline using an outlier rejector, None and a classifier X, y = make_classification( n_classes=2, class_sep=2, From 75f74fb9aa5a9d5b62bcc66d22df16cf202a48e3 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Fri, 1 Mar 2019 10:36:00 +0100 Subject: [PATCH 14/46] Fix failing LOF test --- sklearn/neighbors/lof.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index 3c5a6b6946bb0..83cb5f4fcbacf 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -190,6 +190,40 @@ def fit_predict(self): return self._fit_predict + @property + def fit_resample(self): + """Performs fit on X and returns new X and y consisting of only the + inliers. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data X. + + y : ndarray, shape (n_samples,) + Input data y. + + Returns + ------- + X : ndarray, shape (n_samples, n_features) + The input X with outlier samples removed. + + y : ndarray, shape (n_samples,) + The input y with outlier samples removed. + """ + # fit_resample requires fit_predict + if self.novelty: + msg = ('fit_resample is not available when novelty=True. Use ' + 'novelty=False if you want to use outlier rejection') + raise AttributeError(msg) + + return self._fit_resample + + def _fit_resample(self, X, y=None): + # XXX this is not very clean, is there a better way? + inliers = self.fit_predict(X) == 1 + + return X[inliers], y[inliers] def _fit_predict(self, X, y=None): """"Fits the model to the training set X and returns the labels. 
From 4d4b94654c79c19006aaf714b4dc48f82f895836 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Fri, 1 Mar 2019 11:00:46 +0100 Subject: [PATCH 15/46] Address review --- sklearn/base.py | 10 +++++----- sklearn/neighbors/lof.py | 1 + sklearn/tests/test_pipeline.py | 10 +--------- sklearn/utils/estimator_checks.py | 15 +++++++++++---- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index a588988421a41..3c818d80e4f04 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -519,7 +519,7 @@ def score(self, X, y=None): class OutlierMixin: - """Mixin class for all outlier detection estimators in scikit-learn.""" + """Mixin class for all outlier rejection estimators in scikit-learn.""" _estimator_type = "outlier_detector" def fit_predict(self, X, y=None): @@ -546,12 +546,12 @@ def fit_predict(self, X, y=None): class OutlierRejectionMixin: """Mixin class for all outlier detection resamplers in scikit-learn. Child - classes remove outliers from the passed samples. + classes remove outliers from the dataset. """ _estimator_type = "outlier_resampler" def fit_resample(self, X, y): - """Performs fit on X and returns new X and y consisting of only the + """Performs fit on X and returns a new X and y consisting of only the inliers. Parameters @@ -565,10 +565,10 @@ def fit_resample(self, X, y): Returns ------- X : ndarray, shape (n_samples, n_features) - The input X with outlier samples removed. + The original X with outlier samples removed. y : ndarray, shape (n_samples,) - The input y with outlier samples removed. + The original y with outlier samples removed. """ inliers = self.fit_predict(X) == 1 diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index 83cb5f4fcbacf..cb1a92151b4b3 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -224,6 +224,7 @@ def _fit_resample(self, X, y=None): inliers = self.fit_predict(X) == 1 return X[inliers], y[inliers] + def _fit_predict(self, X, y=None): """"Fits the model to the training set X and returns the labels. 
diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 042a457ad249e..dfd9bc2688fa9 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -700,14 +700,6 @@ def make(): assert_array_equal([exp], pipeline.fit(X).predict(X)) assert_array_equal(X, pipeline.inverse_transform([[exp]])) - # TODO need to investigate this - # exp = 2 * 3 - # pipeline = Pipeline( - # [('m2', mult2), ('m3', mult3), ('last', passthrough)]) - # assert_array_equal([[exp]], pipeline.fit_transform(X, y)) - # assert_array_equal([exp], pipeline.fit(X).predict(X)) - # assert_array_equal(X, pipeline.inverse_transform([[exp]])) - def test_pipeline_ducktyping(): pipeline = make_pipeline(Mult(5)) @@ -1517,6 +1509,7 @@ def test_clusterer_and_resampler_error(): match=msg): pipe.fit_predict(X, y) + @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipe_exposes_resample_correctly(passthrough): # this test will be handled by test_metaestimators later, it's just here @@ -1530,6 +1523,5 @@ def test_pipe_exposes_resample_correctly(passthrough): pipe = make_pipeline(dre, mul3) pipe2 = make_pipeline(dre, passthrough) - #pipe2 should have fit_resample, pipe shouldn't assert not hasattr(pipe, 'fit_resample') assert hasattr(pipe2, 'fit_resample') diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 92dbb8a0b30ef..bf15e718280d2 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -56,7 +56,8 @@ from sklearn.utils import shuffle from sklearn.utils.validation import has_fit_parameter, _num_samples from sklearn.preprocessing import StandardScaler -from sklearn.datasets import load_iris, load_boston, make_blobs +from sklearn.datasets import load_iris, load_boston +from sklearn.datasets import make_blobs, make_classification BOSTON = None @@ -217,7 +218,7 @@ def _yield_outliers_checks(name, estimator): yield check_outliers_fit_predict if hasattr(estimator, 'fit_resample'): - yield check_outlier_resamplers + yield check_outlier_rejectors # checks for estimators that can be used on a test set if hasattr(estimator, 'predict'): @@ -2484,7 +2485,7 @@ def check_fit_idempotent(name, estimator_orig): assert_allclose_dense_sparse(result[method], new_result) -def check_outlier_resamplers(name, estimator_orig): +def check_outlier_rejectors(name, estimator_orig): X, y = make_blobs(random_state=0) outliers = estimator_orig.fit_predict(X, y) == -1 n_outliers = np.sum(outliers) @@ -2501,7 +2502,13 @@ def check_resampler_structure(name, estimator_orig): def check_resample_repeated(name, estimator_orig): - X, y = make_blobs(n_samples=10) + X, y = make_classification( + n_classes=2, + weights=[0.1, 0.9], + n_features=20, + n_clusters_per_class=1, + n_samples=50, + random_state=0) set_random_state(estimator_orig, random_state=0) X_new, y_new = estimator_orig.fit_resample(X, y) From f649300e516b73e99aa84922e483a6b445b40087 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Fri, 1 Mar 2019 12:09:41 +0100 Subject: [PATCH 16/46] Glossary entries --- doc/glossary.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/glossary.rst b/doc/glossary.rst index 49e756773796d..2e39e23908507 100644 --- a/doc/glossary.rst +++ b/doc/glossary.rst @@ -905,6 +905,15 @@ Class APIs and Estimator Types outliers have score below 0. :term:`score_samples` may provide an unnormalized score per sample. + outlier rejector + outlier rejectors + An :term:`outlier detector` which is a resampler. 
It will remove
+        outliers from a passed dataset when :term:`fit_resample` is called.
+
+        Outlier rejectors must implement:
+
+        * :term:`fit_resample`
+
     predictor
     predictors
         An :term:`estimator` supporting :term:`predict` and/or
@@ -1205,6 +1214,11 @@ Methods
         (i.e. training and test data together) before further modelling, as
         this results in :term:`data leakage`.
 
+    ``fit_resample``
+        A method on :term:`resamplers` which fits the estimator on a passed
+        dataset, and returns a new dataset. In the new dataset, samples may be
+        removed or added.
+
     ``get_feature_names``
         Primarily for :term:`feature extractors`, but also used for other
         transformers to provide string names for each column in the output of

From f12664a78f8af40cbf01093f5029ae1c5c781216 Mon Sep 17 00:00:00 2001
From: Oliver Rausch
Date: Fri, 1 Mar 2019 12:16:22 +0100
Subject: [PATCH 17/46] Changes

---
 doc/glossary.rst                        |   2 +
 doc/modules/classes.rst                 |   2 +
 sklearn/covariance/elliptic_envelope.py |   3 +-
 sklearn/ensemble/iforest.py             |   3 +-
 sklearn/neighbors/lof.py                |   3 +-
 sklearn/pipeline.py                     | 112 +++++++++++++-----------
 sklearn/tests/test_pipeline.py          |  13 ++-
 7 files changed, 75 insertions(+), 63 deletions(-)

diff --git a/doc/glossary.rst b/doc/glossary.rst
index 2e39e23908507..afdd4c162fb1d 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -914,6 +914,8 @@ Class APIs and Estimator Types
 
         * :term:`fit_resample`
 
+        If the estimator implements `fit_predict
+
     predictor
     predictors
         An :term:`estimator` supporting :term:`predict` and/or
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index dc6112d589a14..0c8f91531104f 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -30,6 +30,8 @@ Base classes
    base.BiclusterMixin
    base.ClassifierMixin
    base.ClusterMixin
+   base.OutlierMixin
+   base.OutlierRejectionMixin
    base.DensityMixin
    base.RegressorMixin
    base.TransformerMixin
diff --git a/sklearn/covariance/elliptic_envelope.py b/sklearn/covariance/elliptic_envelope.py
index c683ae93fc873..a0053ddff610b 100644
--- a/sklearn/covariance/elliptic_envelope.py
+++ b/sklearn/covariance/elliptic_envelope.py
@@ -7,7 +7,8 @@ from . 
import MinCovDet from ..utils.validation import check_is_fitted, check_array from ..metrics import accuracy_score -from ..base import OutlierMixin, OutlierRejectionMixin +from ..base import OutlierMixin +from ..base import OutlierRejectionMixin class EllipticEnvelope(MinCovDet, OutlierMixin, OutlierRejectionMixin): diff --git a/sklearn/ensemble/iforest.py b/sklearn/ensemble/iforest.py index 97e732450c426..770779baae588 100644 --- a/sklearn/ensemble/iforest.py +++ b/sklearn/ensemble/iforest.py @@ -12,7 +12,8 @@ from ..utils import check_random_state, check_array from ..utils.fixes import _joblib_parallel_args from ..utils.validation import check_is_fitted -from ..base import OutlierMixin, OutlierRejectionMixin +from ..base import OutlierMixin +from ..base import OutlierRejectionMixin from .bagging import BaseBagging diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index cb1a92151b4b3..c344cc4b891d0 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -8,7 +8,8 @@ from .base import NeighborsBase from .base import KNeighborsMixin from .base import UnsupervisedMixin -from ..base import OutlierMixin, OutlierRejectionMixin +from ..base import OutlierMixin +from ..base import OutlierRejectionMixin from ..utils.validation import check_is_fitted from ..utils import check_array diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 13a94565b3db4..96d8ee69033ed 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -157,6 +157,10 @@ def _validate_steps(self): # validate estimators transformers = estimators[:-1] estimator = estimators[-1] + + # if we have a estimator with fit_predict at the end of the pipeline, + # we cannot have resamplers in the pipeline. This is used to check + # that. self._resamplers_exist = False for t in transformers: @@ -164,21 +168,21 @@ def _validate_steps(self): continue if hasattr(t, "fit_resample"): self._resamplers_exist = True - if (not (hasattr(t, "fit") or - hasattr(t, "fit_transform") or - hasattr(t, "fit_resample")) or - not (hasattr(t, "transform") or - hasattr(t, "fit_resample"))): + is_transformer = ((hasattr(t, "fit") and hasattr(t, "transform")) + or hasattr(t, "fit_transform")) + + if not is_transformer and not hasattr(t, "fit_resample"): raise TypeError( "All intermediate steps of Pipeline should be " "estimators that implement fit and transform or resample " - "(but not both) '%s' (type %s) doesn't)" % (t, type(t))) - if (hasattr(t, "fit_resample") and (hasattr(t, "fit_transform") or - hasattr(t, "transform"))): + "(but not both) '%s' (type %s) doesn't." % (t, type(t))) + + if (hasattr(t, "fit_transform") or hasattr(t, "transform")) and \ + hasattr(t, 'fit_resample'): raise TypeError( "All intermediate steps of the chain should be estimators" " that implement fit and transform or fit_resample." - " '%s' implements both)" % (t)) + " '%s' implements both." 
% (t)) # We allow last estimator to be None as an identity transformation if (estimator is not None and estimator != 'passthrough' @@ -197,9 +201,10 @@ def _iter(self, with_final=True, with_resamplers=True): stop -= 1 for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)): - if trans is not None and trans != 'passthrough' \ - and (not hasattr(trans, 'fit_resample') or \ - (with_resamplers and hasattr(trans, 'fit_resample'))): + is_passthrough = trans is None or trans == 'passthrough' + if not is_passthrough and (not hasattr(trans, 'fit_resample') or + (with_resamplers and + hasattr(trans, 'fit_resample'))): yield idx, name, trans @property @@ -339,46 +344,6 @@ def fit_transform(self, X, y=None, **fit_params): else: return last_step.fit(Xt, yt, **fit_params).transform(Xt) - def fit_resample(self, X, y=None, **fit_params): - """Fit the model and resample with the final estimator - - Fits all the transformers/resamplers one after the other and - transforms/resamples the data, then uses fit_resample on the - transformed/resampled data with the final estimator. - - Parameters - ---------- - X : iterable - Training data. Must fulfill input requirements of first step of the - pipeline. - - y : iterable, default=None - Training targets. Must fulfill label requirements for all steps of - the pipeline. - - **fit_params : dict of string -> object - Parameters passed to the ``fit`` method of each step, where - each parameter name is prefixed such that parameter ``p`` for step - ``s`` has key ``s__p``. - - Returns - ------- - Xt : array-like, shape = [n_resampled_samples, n_transformed_features] - Transformed samples - - yt : array-like, shape = [n_resampled_samples, n_transformed_features] - Transformed target - - """ - last_step = self._final_estimator - Xt, yt, fit_params = self._fit(X, y, **fit_params) - if hasattr(last_step, 'fit_resample'): - return last_step.fit_resample(Xt, yt, **fit_params) - elif last_step == 'passthrough': - return Xt, yt - else: - return last_step.fit_resample(Xt, yt, **fit_params) - @if_delegate_has_method(delegate='_final_estimator') def predict(self, X, **predict_params): """Apply transforms to the data, and predict with the final estimator. @@ -495,6 +460,49 @@ def predict_log_proba(self, X): Xt = self._transform(X, with_final=False, with_resamplers=False) return self.steps[-1][-1].predict_log_proba(Xt) + @property + def fit_resample(self, X, y, **fit_params): + """Fit the model and resample with the final estimator + + Fits all the transformers/resamplers one after the other and + transforms/resamples the data, then uses fit_resample on the + transformed/resampled data with the final estimator. + + Parameters + ---------- + X : iterable + Training data. Must fulfill input requirements of first step of the + pipeline. + + y : iterable + Training targets. Must fulfill label requirements for all steps of + the pipeline. + + **fit_params : dict of string -> object + Parameters passed to the ``fit`` method of each step, where + each parameter name is prefixed such that parameter ``p`` for step + ``s`` has key ``s__p``. 
+ + Returns + ------- + Xt : array-like, shape = [n_resampled_samples, n_transformed_features] + Transformed samples + + yt : array-like, shape = [n_resampled_samples, n_transformed_features] + Transformed target + + """ + if self._final_estimator != 'passthrough': + self._final_estimator.fit_resample + return self._fit_resample + + def _fit_resample(self, X, y, **fit_params): + last_step = self._final_estimator + Xt, yt, fit_params = self._fit(X, y, **fit_params) + if last_step == 'passthrough': + return Xt, yt + return last_step.fit_resample(Xt, yt, **fit_params) + @property def transform(self): """Apply transforms/resamples, and transform with the final estimator. diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index dfd9bc2688fa9..6baf4c0d6a7c8 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -2,15 +2,12 @@ Test the pipeline module. """ -# lots of resample tests were taken from imblearn - from distutils.version import LooseVersion from tempfile import mkdtemp import shutil import time import pytest -from pytest import raises import numpy as np from scipy import sparse @@ -164,7 +161,7 @@ def predict(self, X, got_attribute=False): class DummyResampler(NoTrans): - """Resampler which returns the same samples""" + """Resampler which returns the same samples.""" def fit_resample(self, X, y): self.means_ = np.mean(X, axis=0) @@ -175,8 +172,7 @@ def fit_resample(self, X, y): class FitTransformResample(NoTrans): - """Estimator implementing both transform and sample - """ + """Estimator implementing both transform and fit_resample.""" def fit(self, X, y, should_succeed=False): pass @@ -1073,6 +1069,7 @@ def test_pipeline_memory(): def test_pipeline_memory_resampler(): + # TODO X, y = make_classification( n_classes=2, class_sep=2, @@ -1385,7 +1382,7 @@ def test_pipeline_with_step_that_implements_both_sample_and_transform(): random_state=0) clf = LogisticRegression(solver='lbfgs') - with raises(TypeError, match='should be estimators that implement'): + with pytest.raises(TypeError, match='should be estimators that implement'): Pipeline([('step', FitTransformResample()), ('logistic', clf)]) @@ -1515,7 +1512,7 @@ def test_pipe_exposes_resample_correctly(passthrough): # this test will be handled by test_metaestimators later, it's just here # now for simplicity # TODO make this test pass (maybe something similar is also broken for - # fit_transform) + # fit_transform X, y = make_classification(n_samples=10) dre = DummyResampler() From 21769e59b7a88521cb080ba9692090c651163fa6 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Fri, 1 Mar 2019 12:17:31 +0100 Subject: [PATCH 18/46] Pack params --- sklearn/tests/test_pipeline.py | 195 +++++++++------------------------ 1 file changed, 52 insertions(+), 143 deletions(-) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 6baf4c0d6a7c8..e23af1d81f7a9 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1070,17 +1070,10 @@ def test_pipeline_memory(): def test_pipeline_memory_resampler(): # TODO - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) cachedir = mkdtemp() try: memory = 
Memory(cachedir, verbose=10) @@ -1141,17 +1134,10 @@ def test_pipeline_memory_resampler(): def test_pipeline_methods_pca_outlier_svm(): # Test the various methods of the pipeline (pca + svm). - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) # Test with PCA + SVC clf = SVC(gamma='scale', probability=True, random_state=0) @@ -1167,17 +1153,10 @@ def test_pipeline_methods_pca_outlier_svm(): def test_pipeline_methods_outlier_pca_svm(): # Test the various methods of the pipeline (pca + svm). - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) # Test with PCA + SVC clf = SVC(gamma='scale', probability=True, random_state=0) @@ -1194,17 +1173,10 @@ def test_pipeline_methods_outlier_pca_svm(): def test_pipeline_resample(): # Test whether pipeline works with a resampler at the end. # Also test pipeline.fit_resample - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) resampler = OneClassSVM(gamma='scale') pipeline = Pipeline([('resampler', resampler)]) @@ -1228,17 +1200,10 @@ def test_pipeline_resample(): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_none_classifier(passthrough): # Test pipeline using None as preprocessing step and a classifier - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) clf = LogisticRegression(solver='lbfgs', random_state=0) pipe = make_pipeline(passthrough, clf) pipe.fit(X, y) @@ -1251,17 +1216,10 @@ def test_pipeline_none_classifier(passthrough): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_none_resampler_classifier(passthrough): # Test pipeline using None, an outlier rejector and a classifier - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) clf = LogisticRegression(solver='lbfgs', random_state=0) outlier = OneClassSVM(gamma='scale') pipe = make_pipeline(passthrough, outlier, clf) 
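The parametrization above leans on ``None`` and ``'passthrough'`` being interchangeable no-op steps; a condensed sketch of the pattern these tests exercise (assuming this branch's API, where ``OneClassSVM`` acts as an outlier-rejecting resampler)::

    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.svm import OneClassSVM

    # Both spellings denote an identity step around the outlier rejector.
    for noop in (None, 'passthrough'):
        pipe = make_pipeline(noop, OneClassSVM(gamma='scale'),
                             LogisticRegression(solver='lbfgs'))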
@@ -1275,17 +1233,10 @@ def test_pipeline_none_resampler_classifier(passthrough): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_resampler_none_classifier(passthrough): # Test pipeline using an outlier rejector, None and a classifier - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) clf = LogisticRegression(solver='lbfgs', random_state=0) outlier = OneClassSVM(gamma='scale') pipe = make_pipeline(outlier, passthrough, clf) @@ -1299,17 +1250,10 @@ def test_pipeline_resampler_none_classifier(passthrough): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_none_resampler_resample(passthrough): # Test pipeline using None step and a resampler - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) outlier = OneClassSVM(gamma='scale') pipe = make_pipeline(passthrough, outlier) @@ -1320,17 +1264,10 @@ def test_pipeline_none_resampler_resample(passthrough): def test_pipeline_none_transformer(passthrough): # Test pipeline using None and a transformer that implements transform and # inverse_transform - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) pca = PCA(whiten=True) pipe = make_pipeline(passthrough, pca) @@ -1342,17 +1279,10 @@ def test_pipeline_none_transformer(passthrough): def test_pipeline_methods_anova_outlier(): # Test the various methods of the pipeline (anova). - X, y = make_classification( - n_classes=2, - class_sep=2, - weights=[0.1, 0.9], - n_informative=3, - n_redundant=1, - flip_y=0, - n_features=20, - n_clusters_per_class=1, - n_samples=500, - random_state=0) + X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) # Test with outlierdetection + Anova + LogisticRegression clf = LogisticRegression(solver='lbfgs') outlier = OneClassSVM(gamma='scale') @@ -1369,17 +1299,10 @@ def test_pipeline_methods_anova_outlier(): def test_pipeline_with_step_that_implements_both_sample_and_transform(): # Test the various methods of the pipeline (anova). 
-    X, y = make_classification(
-        n_classes=2,
-        class_sep=2,
-        weights=[0.1, 0.9],
-        n_informative=3,
-        n_redundant=1,
-        flip_y=0,
-        n_features=20,
-        n_clusters_per_class=1,
-        n_samples=500,
-        random_state=0)
+    X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9],
+                                n_informative=3, n_redundant=1, flip_y=0,
+                                n_features=20, n_clusters_per_class=1,
+                                n_samples=500, random_state=0)
 
     outlier1 = OneClassSVM(gamma='scale')
     outlier2 = LocalOutlierFactor(contamination=0.1)

From fbb2b36b4d7f97c661a92749cb1b1aaf2d04c470 Mon Sep 17 00:00:00 2001
From: Oliver Rausch
Date: Fri, 1 Mar 2019 12:35:03 +0100
Subject: [PATCH 19/46] Fix failing test

---
 doc/glossary.rst    | 8 +++++++-
 sklearn/pipeline.py | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/doc/glossary.rst b/doc/glossary.rst
index afdd4c162fb1d..5921d3211a15d 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -914,7 +914,9 @@ Class APIs and Estimator Types
 
         * :term:`fit_resample`
 
-        If the estimator implements `fit_predict
+        If the estimator implements :term:`fit_predict` according to the
+        :class:`OutlierMixin` API, :class:`OutlierRejectionMixin` should be used
+        to automatically implement correct :term:`fit_resample` behavior.
 
     predictor
     predictors
         An :term:`estimator` supporting :term:`predict` and/or
@@ -947,6 +949,10 @@ Class APIs and Estimator Types
         A purely :term:`transductive` transformer, such as
         :class:`manifold.TSNE`, may not implement ``transform``.
 
+    resampler
+    resamplers
+        An estimator supporting :term:`fit_resample`.
+
     vectorizer
     vectorizers
         See :term:`feature extractor`.
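To make the contract in the glossary entry above concrete, a minimal sketch of ``fit_resample`` on an outlier rejector (assuming this branch, where ``IsolationForest`` mixes in ``OutlierRejectionMixin``)::

    from sklearn.datasets import make_blobs
    from sklearn.ensemble import IsolationForest

    X, y = make_blobs(random_state=0)
    X_new, y_new = IsolationForest(contamination=0.1,
                                   random_state=0).fit_resample(X, y)
    # Outlier rows are dropped from X and y together, so the two stay
    # aligned while the resampled dataset shrinks.
    assert X_new.shape[0] == y_new.shape[0] <= X.shape[0]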
diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py
index 96d8ee69033ed..430345ef6a5c8 100644
--- a/sklearn/pipeline.py
+++ b/sklearn/pipeline.py
@@ -461,7 +461,7 @@ def predict_log_proba(self, X):
         return self.steps[-1][-1].predict_log_proba(Xt)
 
     @property
-    def fit_resample(self, X, y, **fit_params):
+    def fit_resample(self):
         """Fit the model and resample with the final estimator
 
         Fits all the transformers/resamplers one after the other and
         transforms/resamples the data, then uses fit_resample on the

From d04d7c7b95513702441acc8a093ba7ff87749e41 Mon Sep 17 00:00:00 2001
From: Oliver Rausch
Date: Fri, 1 Mar 2019 14:23:53 +0100
Subject: [PATCH 20/46] Docs

---
 doc/developers/contributing.rst   |  9 ++++-
 doc/modules/compose.rst           | 66 +++++++++++++++++++++++++++++--
 doc/modules/outlier_detection.rst | 10 +++++
 sklearn/base.py                   |  5 ++-
 4 files changed, 83 insertions(+), 7 deletions(-)

diff --git a/doc/developers/contributing.rst b/doc/developers/contributing.rst
index 1f37ff0929ff9..d30a5e6270fe6 100644
--- a/doc/developers/contributing.rst
+++ b/doc/developers/contributing.rst
@@ -1061,7 +1061,7 @@ multiple interfaces):
 
 :Transformer:
 
-    For filtering or modifying the data, in a supervised or unsupervised
+    For modifying the data, in a supervised or unsupervised
     way, implements::
 
         new_data = transformer.transform(data)
@@ -1071,6 +1071,13 @@ multiple interfaces):
 
         new_data = transformer.fit_transform(data)
 
+:Resampler:
+
+    For filtering or augmenting the data, in a supervised or unsupervised
+    way, implements::
+
+        new_X, new_y = resampler.fit_resample(data_X, data_y)
+
 :Model:
 
     A model that can give a `goodness of fit `_
diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst
index 0145842b88e16..a900bb19b2bd4 100644
--- a/doc/modules/compose.rst
+++ b/doc/modules/compose.rst
@@ -5,14 +5,16 @@
 Pipelines and composite estimators
 ==================================
 
-Transformers are usually combined with classifiers, regressors or other
+Transformers and resamplers are usually combined with classifiers, regressors
+or other
 estimators to build a composite estimator.  The most common tool is a
 :ref:`Pipeline `. Pipeline is often used in combination with
 :ref:`FeatureUnion ` which concatenates the output of transformers into a
 composite feature space. :ref:`TransformedTargetRegressor ` deals with
 transforming the :term:`target` (i.e. log-transform :term:`y`). In contrast,
 Pipelines only transform the
-observed data (:term:`X`).
+observed data (:term:`X`). Additionally, pipelines support :term:`resamplers`
+to resample the dataset on fit (see :ref:`pipeline_resamplers`).
 
 .. _pipeline:
 
@@ -139,6 +141,63 @@
 i.e. if the last estimator is a classifier, the :class:`Pipeline` can be used
 as a classifier. If the last estimator is a transformer, again, so is the
 pipeline.
 
+.. _pipeline_resamplers:
+
+Resamplers in pipelines
+-----------------------
+In addition to transformers, pipelines also support :term:`resamplers` as
+intermediate steps. However, unlike for transformers, pipelines do not always
+call resamplers when data flows through the pipeline. In summary:
+
+===================== ================================
+Method                Resamplers applied
+===================== ================================
+``fit``               Yes
+``fit_transform``     Yes
+``transform``         Yes
+``predict``           No
+``score``             No
+``fit_predict``       not supported (see note)
+===================== ================================
+
+To understand why, consider the example of :term:`outlier rejectors`. These
+resamplers will remove samples from the dataset if they are classified as
+outliers.
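As a standalone sketch of that behaviour (assuming this branch's ``EllipticEnvelope``, which gains ``fit_resample`` through ``OutlierRejectionMixin``)::

    from sklearn.covariance import EllipticEnvelope
    from sklearn.datasets import make_classification

    X, y = make_classification(random_state=0)
    X_in, y_in = EllipticEnvelope().fit_resample(X, y)
    # Samples predicted as outliers (-1) are dropped from both X and y.
    print(X.shape[0] - X_in.shape[0], "samples rejected")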
+
+Consider the following pipeline::
+
+    >>> from sklearn.datasets import make_classification
+    >>> from sklearn.pipeline import make_pipeline
+    >>> from sklearn.covariance import EllipticEnvelope
+    >>> from sklearn.linear_model import LogisticRegression
+    >>> X_train, y_train = make_classification(random_state=0)
+    >>> pipe = make_pipeline(EllipticEnvelope(), LogisticRegression())
+    >>> pipe.fit(X_train, y_train)  # doctest: +ELLIPSIS
+    Pipeline(...)
+
+In ``pipe``, we would remove outliers before fitting our ``LogisticRegression``
+model, so that the samples passed to fit come from the same distribution. We do
+this to improve the quality of the fit (see :ref:`outlier_detection`).
+Therefore, during ``fit``, we want our resampler to be applied.
+
+Now assume that we would like to make predictions on some new data ``X_test``::
+
+    >>> predictions = pipe.predict(X_test)
+
+If we applied our resampler, it would remove outliers from ``X_test``. This is
+nonsensical for two reasons:
+
+1. We would not get predictions for samples that were classified as outliers.
+2. Since resamplers are always fitted on the data they will predict on, the
+   notion of an outlier in the ``X_test`` is not consistent with the notion of
+   an outlier in ``X_train``. A sample could be an outlier in ``X_train``, but
+   an inlier in ``X_test``, depending on the other samples passed.
+
+Therefore, all methods in which predictions are made will skip resamplers in
+the pipeline.
+
+.. note::
+
+   If a pipeline contains resamplers, you may not call :term:`fit_predict` on
+   it. For reasons described above, it is not practical to apply resamplers
+   when predictions are being made. Therefore, we would not be able to apply
+   the resamplers in such a call.
+
 .. _pipeline_cache:
 
 Caching transformers: avoid repeated computation
@@ -311,8 +370,7 @@ is fit to the data independently. The transformers
 are applied in parallel, and the feature matrices they output are concatenated
 side-by-side into a larger matrix.
 
-When you want to apply different transformations to each field of the data,
-see the related class :class:`sklearn.compose.ColumnTransformer`
+When you want to apply different transformations to each field of the data, see the related class :class:`sklearn.compose.ColumnTransformer`
 (see :ref:`user guide `).
 
 :class:`FeatureUnion` serves the same purposes as :class:`Pipeline` -
diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst
index 69f7a275ab018..16b1de05127e8 100644
--- a/doc/modules/outlier_detection.rst
+++ b/doc/modules/outlier_detection.rst
@@ -330,6 +330,16 @@ This strategy is illustrated below.
        `_
      Proc. ACM SIGMOD
 
+.. _outlier_rejectors:
+
+Outlier Rejectors
+-----------------
+All :term:`outlier detectors` can be used as :term:`outlier rejectors`, a form
+of :term:`resampler` that returns the input dataset with its outlier samples
+removed. This is especially useful in pipelines. See
+:ref:`pipeline_resamplers` and the examples.
+
+.. topic:: Examples:
 
 .. _novelty_with_lof:
 
 Novelty detection with Local Outlier Factor
diff --git a/sklearn/base.py b/sklearn/base.py
index 3c818d80e4f04..07d74aedc8c9d 100644
--- a/sklearn/base.py
+++ b/sklearn/base.py
@@ -13,6 +13,7 @@
 from . import __version__
 
 from sklearn.utils import _IS_32BIT
+from sklearn.utils import safe_indexing
 
 _DEFAULT_TAGS = {
     'non_deterministic': False,
@@ -548,7 +549,7 @@ class OutlierRejectionMixin:
     """Mixin class for all outlier detection resamplers in scikit-learn. Child
     classes remove outliers from the dataset.
""" - _estimator_type = "outlier_resampler" + _estimator_type = "outlier_rejector" def fit_resample(self, X, y): """Performs fit on X and returns a new X and y consisting of only the @@ -571,7 +572,7 @@ def fit_resample(self, X, y): The original y with outlier samples removed. """ - inliers = self.fit_predict(X) == 1 + inliers = safe_mask(X, self.fit_predict(X) == 1) return X[inliers], y[inliers] From ec2f48ad9fcc7a2124a0a4db238f842abbda7453 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Sat, 2 Mar 2019 17:06:15 +0100 Subject: [PATCH 21/46] Add missing import --- sklearn/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index 07d74aedc8c9d..ef5e54200b49b 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -13,7 +13,7 @@ from . import __version__ from sklearn.utils import _IS_32BIT -from sklearn.utils import safe_indexing +from sklearn.utils import safe_indexing, safe_mask _DEFAULT_TAGS = { 'non_deterministic': False, From 61fce479352f82b7f3d1136aac24d5598748ec73 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Fri, 8 Mar 2019 11:17:03 +0100 Subject: [PATCH 22/46] pep8 and rejector -> detector --- sklearn/base.py | 4 ++-- sklearn/tests/test_pipeline.py | 26 +++++++++++++------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index ef5e54200b49b..e967d55b5abe2 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -13,7 +13,7 @@ from . import __version__ from sklearn.utils import _IS_32BIT -from sklearn.utils import safe_indexing, safe_mask +from sklearn.utils import safe_mask _DEFAULT_TAGS = { 'non_deterministic': False, @@ -520,7 +520,7 @@ def score(self, X, y=None): class OutlierMixin: - """Mixin class for all outlier rejection estimators in scikit-learn.""" + """Mixin class for all outlier detection estimators in scikit-learn.""" _estimator_type = "outlier_detector" def fit_predict(self, X, y=None): diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index e23af1d81f7a9..6e2b156c21d2a 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1070,7 +1070,7 @@ def test_pipeline_memory(): def test_pipeline_memory_resampler(): # TODO - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1134,7 +1134,7 @@ def test_pipeline_memory_resampler(): def test_pipeline_methods_pca_outlier_svm(): # Test the various methods of the pipeline (pca + svm). - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1153,7 +1153,7 @@ def test_pipeline_methods_pca_outlier_svm(): def test_pipeline_methods_outlier_pca_svm(): # Test the various methods of the pipeline (pca + svm). - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1173,7 +1173,7 @@ def test_pipeline_methods_outlier_pca_svm(): def test_pipeline_resample(): # Test whether pipeline works with a resampler at the end. 
# Also test pipeline.fit_resample - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1200,7 +1200,7 @@ def test_pipeline_resample(): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_none_classifier(passthrough): # Test pipeline using None as preprocessing step and a classifier - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1216,7 +1216,7 @@ def test_pipeline_none_classifier(passthrough): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_none_resampler_classifier(passthrough): # Test pipeline using None, an outlier rejector and a classifier - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1233,7 +1233,7 @@ def test_pipeline_none_resampler_classifier(passthrough): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_resampler_none_classifier(passthrough): # Test pipeline using an outlier rejector, None and a classifier - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1250,7 +1250,7 @@ def test_pipeline_resampler_none_classifier(passthrough): @pytest.mark.parametrize('passthrough', [None, 'passthrough']) def test_pipeline_none_resampler_resample(passthrough): # Test pipeline using None step and a resampler - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1264,7 +1264,7 @@ def test_pipeline_none_resampler_resample(passthrough): def test_pipeline_none_transformer(passthrough): # Test pipeline using None and a transformer that implements transform and # inverse_transform - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1279,7 +1279,7 @@ def test_pipeline_none_transformer(passthrough): def test_pipeline_methods_anova_outlier(): # Test the various methods of the pipeline (anova). - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1299,7 +1299,7 @@ def test_pipeline_methods_anova_outlier(): def test_pipeline_with_step_that_implements_both_sample_and_transform(): # Test the various methods of the pipeline (anova). 
- X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1310,7 +1310,7 @@ def test_pipeline_with_step_that_implements_both_sample_and_transform(): def test_pipeline_fit_then_sample_with_resampler_last_estimator(): - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) @@ -1328,7 +1328,7 @@ def test_pipeline_fit_then_sample_with_resampler_last_estimator(): def test_pipeline_fit_then_sample_3_resamplers_with_resampler_last_estimator(): - X, y = make_classification( n_classes=2, class_sep=2, weights=[0.1, 0.9], + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0, n_features=20, n_clusters_per_class=1, n_samples=500, random_state=0) From de252ef46e98a886b9ec28d20dc0220699fb3e88 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Tue, 25 Jun 2019 23:41:57 +1000 Subject: [PATCH 23/46] Revert pipeline to master --- sklearn/pipeline.py | 483 ++++++++++++++++-------------- sklearn/tests/test_pipeline.py | 528 ++++++++------------------------- 2 files changed, 397 insertions(+), 614 deletions(-) diff --git a/sklearn/pipeline.py b/sklearn/pipeline.py index 430345ef6a5c8..c66e37761782d 100644 --- a/sklearn/pipeline.py +++ b/sklearn/pipeline.py @@ -8,17 +8,17 @@ # Alexandre Gramfort # Lars Buitinck # License: BSD + from collections import defaultdict from itertools import islice -from functools import partial import numpy as np from scipy import sparse +from joblib import Parallel, delayed from .base import clone, TransformerMixin -from .utils._joblib import Parallel, delayed from .utils.metaestimators import if_delegate_has_method -from .utils import Bunch +from .utils import Bunch, _print_elapsed_time from .utils.validation import check_memory from .utils.metaestimators import _BaseComposition @@ -30,10 +30,10 @@ class Pipeline(_BaseComposition): """Pipeline of transforms with a final estimator. Sequentially apply a list of transforms and a final estimator. - Intermediate steps of the pipeline must be transformers or resamplers, that - is, they must implement `fit` and `transform` methods, or a `fit_resample` - method. The final estimator only needs to implement `fit`. - The transformers in the pipeline can be cached using `memory` argument. + Intermediate steps of the pipeline must be 'transforms', that is, they + must implement fit and transform methods. + The final estimator only needs to implement fit. + The transformers in the pipeline can be cached using ``memory`` argument. The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters. @@ -41,7 +41,7 @@ class Pipeline(_BaseComposition): names and the parameter name separated by a '__', as in the example below. A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting - it to 'passthrough' or `None`. + it to 'passthrough' or ``None``. Read more in the :ref:`User Guide `. @@ -62,6 +62,10 @@ class Pipeline(_BaseComposition): inspect estimators within the pipeline. 
Caching the transformers is advantageous when fitting is time consuming. + verbose : boolean, optional + If True, the time elapsed while fitting each step will be printed as it + is completed. + Attributes ---------- named_steps : bunch object, a dictionary with attribute access @@ -91,34 +95,41 @@ class Pipeline(_BaseComposition): >>> # For instance, fit using a k of 10 in the SelectKBest >>> # and a parameter 'C' of the svm >>> anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y) - ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE - Pipeline(memory=None, - steps=[('anova', SelectKBest(...)), - ('svc', SVC(...))]) + Pipeline(steps=[('anova', SelectKBest(...)), ('svc', SVC(...))]) >>> prediction = anova_svm.predict(X) - >>> anova_svm.score(X, y) # doctest: +ELLIPSIS + >>> anova_svm.score(X, y) 0.83 >>> # getting the selected features chosen by anova_filter - >>> anova_svm.named_steps['anova'].get_support() - ... # doctest: +NORMALIZE_WHITESPACE - array([False, False, True, True, False, False, True, True, False, - True, False, True, True, False, True, False, True, True, + >>> anova_svm['anova'].get_support() + array([False, False, True, True, False, False, True, True, False, + True, False, True, True, False, True, False, True, True, False, False]) >>> # Another way to get selected features chosen by anova_filter >>> anova_svm.named_steps.anova.get_support() - ... # doctest: +NORMALIZE_WHITESPACE - array([False, False, True, True, False, False, True, True, False, - True, False, True, True, False, True, False, True, True, + array([False, False, True, True, False, False, True, True, False, + True, False, True, True, False, True, False, True, True, False, False]) + >>> # Indexing can also be used to extract a sub-pipeline. + >>> sub_pipeline = anova_svm[:1] + >>> sub_pipeline + Pipeline(steps=[('anova', SelectKBest(...))]) + >>> coef = anova_svm[-1].coef_ + >>> anova_svm['svc'] is anova_svm[-1] + True + >>> coef.shape + (1, 10) + >>> sub_pipeline.inverse_transform(coef).shape + (1, 20) """ # BaseEstimator interface _required_parameters = ['steps'] - def __init__(self, steps, memory=None): + def __init__(self, steps, memory=None, verbose=False): self.steps = steps self._validate_steps() self.memory = memory + self.verbose = verbose def get_params(self, deep=True): """Get parameters for this estimator. @@ -158,31 +169,15 @@ def _validate_steps(self): transformers = estimators[:-1] estimator = estimators[-1] - # if we have a estimator with fit_predict at the end of the pipeline, - # we cannot have resamplers in the pipeline. This is used to check - # that. - self._resamplers_exist = False - for t in transformers: if t is None or t == 'passthrough': continue - if hasattr(t, "fit_resample"): - self._resamplers_exist = True - is_transformer = ((hasattr(t, "fit") and hasattr(t, "transform")) - or hasattr(t, "fit_transform")) - - if not is_transformer and not hasattr(t, "fit_resample"): - raise TypeError( - "All intermediate steps of Pipeline should be " - "estimators that implement fit and transform or resample " - "(but not both) '%s' (type %s) doesn't." % (t, type(t))) - - if (hasattr(t, "fit_transform") or hasattr(t, "transform")) and \ - hasattr(t, 'fit_resample'): - raise TypeError( - "All intermediate steps of the chain should be estimators" - " that implement fit and transform or fit_resample." - " '%s' implements both." 
% (t)) + if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not + hasattr(t, "transform")): + raise TypeError("All intermediate steps should be " + "transformers and implement fit and transform " + "or be the string 'passthrough' " + "'%s' (type %s) doesn't" % (t, type(t))) # We allow last estimator to be None as an identity transformation if (estimator is not None and estimator != 'passthrough' @@ -192,20 +187,48 @@ def _validate_steps(self): "or be the string 'passthrough'. " "'%s' (type %s) doesn't" % (estimator, type(estimator))) - def _iter(self, with_final=True, with_resamplers=True): + def _iter(self, with_final=True, filter_passthrough=True): """ - Generate (name, trans) tuples excluding 'passthrough' transformers + Generate (idx, (name, trans)) tuples from self.steps + + When filter_passthrough is True, 'passthrough' and None transformers + are filtered out. """ stop = len(self.steps) if not with_final: stop -= 1 for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)): - is_passthrough = trans is None or trans == 'passthrough' - if not is_passthrough and (not hasattr(trans, 'fit_resample') or - (with_resamplers and - hasattr(trans, 'fit_resample'))): + if not filter_passthrough: yield idx, name, trans + elif trans is not None and trans != 'passthrough': + yield idx, name, trans + + def __len__(self): + """ + Returns the length of the Pipeline + """ + return len(self.steps) + + def __getitem__(self, ind): + """Returns a sub-pipeline or a single esimtator in the pipeline + + Indexing with an integer will return an estimator; using a slice + returns another Pipeline instance which copies a slice of this + Pipeline. This copy is shallow: modifying (or fitting) estimators in + the sub-pipeline will affect the larger pipeline and vice-versa. + However, replacing a value in `step` will not affect a copy. + """ + if isinstance(ind, slice): + if ind.step not in (1, None): + raise ValueError('Pipeline slicing only supports a step of 1') + return self.__class__(self.steps[ind]) + try: + name, est = self.steps[ind] + except TypeError: + # Not an int, try get step by name + return self.named_steps[ind] + return est @property def _estimator_type(self): @@ -221,6 +244,15 @@ def _final_estimator(self): estimator = self.steps[-1][1] return 'passthrough' if estimator is None else estimator + def _log_message(self, step_idx): + if not self.verbose: + return None + name, step = self.steps[step_idx] + + return '(step %d of %d) Processing %s' % (step_idx + 1, + len(self.steps), + name) + # Estimator interface def _fit(self, X, y=None, **fit_params): @@ -231,16 +263,28 @@ def _fit(self, X, y=None, **fit_params): memory = check_memory(self.memory) fit_transform_one_cached = memory.cache(_fit_transform_one) - fit_resample_one_cached = memory.cache(_fit_resample_one) fit_params_steps = {name: {} for name, step in self.steps if step is not None} for pname, pval in fit_params.items(): + if '__' not in pname: + raise ValueError( + "Pipeline.fit does not accept the {} parameter. " + "You can pass parameters to specific steps of your " + "pipeline using the stepname__parameter format, e.g. 
" + "`Pipeline.fit(X, y, logisticregression__sample_weight" + "=sample_weight)`.".format(pname)) step, param = pname.split('__', 1) fit_params_steps[step][param] = pval - Xt = X - yt = y - for step_idx, name, transformer in self._iter(with_final=False): + for (step_idx, + name, + transformer) in self._iter(with_final=False, + filter_passthrough=False): + if (transformer is None or transformer == 'passthrough'): + with _print_elapsed_time('Pipeline', + self._log_message(step_idx)): + continue + if hasattr(memory, 'location'): # joblib >= 0.12 if memory.location is None: @@ -260,28 +304,24 @@ def _fit(self, X, y=None, **fit_params): else: cloned_transformer = clone(transformer) # Fit or load from cache the current transfomer - if (hasattr(cloned_transformer, "transform") or - hasattr(cloned_transformer, "fit_transform")): - Xt, fitted_transformer = fit_transform_one_cached( - cloned_transformer, Xt, yt, None, - **fit_params_steps[name]) - elif hasattr(cloned_transformer, "fit_resample"): - Xt, yt, fitted_transformer = fit_resample_one_cached( - cloned_transformer, Xt, yt, **fit_params_steps[name]) + X, fitted_transformer = fit_transform_one_cached( + cloned_transformer, X, y, None, + message_clsname='Pipeline', + message=self._log_message(step_idx), + **fit_params_steps[name]) # Replace the transformer of the step with the fitted # transformer. This is necessary when loading the transformer # from the cache. self.steps[step_idx] = (name, fitted_transformer) if self._final_estimator == 'passthrough': - return Xt, yt, {} - return Xt, yt, fit_params_steps[self.steps[-1][0]] + return X, {} + return X, fit_params_steps[self.steps[-1][0]] def fit(self, X, y=None, **fit_params): """Fit the model - Fit all the transformers/resamplers one after the other and - transform/resample the data, then fit the transformed/resampled data - using the final estimator. + Fit all the transforms one after the other and transform the + data, then fit the transformed data using the final estimator. Parameters ---------- @@ -303,17 +343,19 @@ def fit(self, X, y=None, **fit_params): self : Pipeline This estimator """ - Xt, yt, fit_params = self._fit(X, y, **fit_params) - if self._final_estimator != 'passthrough': - self._final_estimator.fit(Xt, yt, **fit_params) + Xt, fit_params = self._fit(X, y, **fit_params) + with _print_elapsed_time('Pipeline', + self._log_message(len(self.steps) - 1)): + if self._final_estimator != 'passthrough': + self._final_estimator.fit(Xt, y, **fit_params) return self def fit_transform(self, X, y=None, **fit_params): """Fit the model and transform with the final estimator - Fits all the transformers/resamplers one after the other and - transforms/resamples the data, then uses fit_transform on the - transformed/resampled data with the final estimator. + Fits all the transforms one after the other and transforms the + data, then uses fit_transform on transformed data with the final + estimator. 
Parameters ---------- @@ -332,22 +374,23 @@ def fit_transform(self, X, y=None, **fit_params): Returns ------- - Xt : array-like, shape = [n_resampled_samples, n_transformed_features] + Xt : array-like, shape = [n_samples, n_transformed_features] Transformed samples """ last_step = self._final_estimator - Xt, yt, fit_params = self._fit(X, y, **fit_params) - if hasattr(last_step, 'fit_transform'): - return last_step.fit_transform(Xt, yt, **fit_params) - elif last_step == 'passthrough': - return Xt - else: - return last_step.fit(Xt, yt, **fit_params).transform(Xt) + Xt, fit_params = self._fit(X, y, **fit_params) + with _print_elapsed_time('Pipeline', + self._log_message(len(self.steps) - 1)): + if last_step == 'passthrough': + return Xt + if hasattr(last_step, 'fit_transform'): + return last_step.fit_transform(Xt, y, **fit_params) + else: + return last_step.fit(Xt, y, **fit_params).transform(Xt) @if_delegate_has_method(delegate='_final_estimator') def predict(self, X, **predict_params): - """Apply transforms to the data, and predict with the final estimator. - Note that resamplers are not applied. + """Apply transforms to the data, and predict with the final estimator Parameters ---------- @@ -367,16 +410,18 @@ def predict(self, X, **predict_params): ------- y_pred : array-like """ - Xt = self._transform(X, with_final=False, with_resamplers=False) + Xt = X + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt) return self.steps[-1][-1].predict(Xt, **predict_params) @if_delegate_has_method(delegate='_final_estimator') def fit_predict(self, X, y=None, **fit_params): - """Applies fit_transforms of a pipeline to the data, followed by the - fit_predict method of the final estimator in the pipeline. + """Applies fit_predict of last step in pipeline after transforms. - Calling this method on a pipeline containing a resamplers is - unsupported. + Applies fit_transforms of a pipeline to the data, followed by the + fit_predict method of the final estimator in the pipeline. Valid + only if the final estimator implements fit_predict. Parameters ---------- @@ -397,19 +442,15 @@ def fit_predict(self, X, y=None, **fit_params): ------- y_pred : array-like """ - if self._resamplers_exist: - raise NotImplementedError("Pipelines containing resamplers that" - " have an estimator implementing " - "fit_predict as their last stage " - "are currently not supported.") - Xt, yt, fit_params = self._fit(X, y, **fit_params) - return self.steps[-1][-1].fit_predict(Xt, yt, **fit_params) + Xt, fit_params = self._fit(X, y, **fit_params) + with _print_elapsed_time('Pipeline', + self._log_message(len(self.steps) - 1)): + y_pred = self.steps[-1][-1].fit_predict(Xt, y, **fit_params) + return y_pred @if_delegate_has_method(delegate='_final_estimator') def predict_proba(self, X): - """Apply transforms, and predict_proba of the final estimator. Note - that resamplers are not applied. - + """Apply transforms, and predict_proba of the final estimator Parameters ---------- @@ -421,13 +462,14 @@ def predict_proba(self, X): ------- y_proba : array-like, shape = [n_samples, n_classes] """ - Xt = self._transform(X, with_resamplers=False, with_final=False) + Xt = X + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt) return self.steps[-1][-1].predict_proba(Xt) @if_delegate_has_method(delegate='_final_estimator') def decision_function(self, X): - """Apply transforms, and decision_function of the final estimator. Note - that resamplers are not applied. 
+ """Apply transforms, and decision_function of the final estimator Parameters ---------- @@ -439,13 +481,14 @@ def decision_function(self, X): ------- y_score : array-like, shape = [n_samples, n_classes] """ - Xt = self._transform(X, with_final=False, with_resamplers=False) + Xt = X + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt) return self.steps[-1][-1].decision_function(Xt) @if_delegate_has_method(delegate='_final_estimator') - def predict_log_proba(self, X): - """Apply transforms, and predict_log_proba of the final estimator. Note - that resamplers are not applied. + def score_samples(self, X): + """Apply transforms, and score_samples of the final estimator. Parameters ---------- @@ -455,59 +498,37 @@ def predict_log_proba(self, X): Returns ------- - y_score : array-like, shape = [n_samples, n_classes] + y_score : ndarray, shape (n_samples,) """ - Xt = self._transform(X, with_final=False, with_resamplers=False) - return self.steps[-1][-1].predict_log_proba(Xt) - - @property - def fit_resample(self): - """Fit the model and resample with the final estimator + Xt = X + for _, _, transformer in self._iter(with_final=False): + Xt = transformer.transform(Xt) + return self.steps[-1][-1].score_samples(Xt) - Fits all the transformers/resamplers one after the other and - transforms/resamples the data, then uses fit_resample on the - transformed/resampled data with the final estimator. + @if_delegate_has_method(delegate='_final_estimator') + def predict_log_proba(self, X): + """Apply transforms, and predict_log_proba of the final estimator Parameters ---------- X : iterable - Training data. Must fulfill input requirements of first step of the - pipeline. - - y : iterable - Training targets. Must fulfill label requirements for all steps of - the pipeline. - - **fit_params : dict of string -> object - Parameters passed to the ``fit`` method of each step, where - each parameter name is prefixed such that parameter ``p`` for step - ``s`` has key ``s__p``. + Data to predict on. Must fulfill input requirements of first step + of the pipeline. Returns ------- - Xt : array-like, shape = [n_resampled_samples, n_transformed_features] - Transformed samples - - yt : array-like, shape = [n_resampled_samples, n_transformed_features] - Transformed target - + y_score : array-like, shape = [n_samples, n_classes] """ - if self._final_estimator != 'passthrough': - self._final_estimator.fit_resample - return self._fit_resample - - def _fit_resample(self, X, y, **fit_params): - last_step = self._final_estimator - Xt, yt, fit_params = self._fit(X, y, **fit_params) - if last_step == 'passthrough': - return Xt, yt - return last_step.fit_resample(Xt, yt, **fit_params) + Xt = X + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt) + return self.steps[-1][-1].predict_log_proba(Xt) @property def transform(self): - """Apply transforms/resamples, and transform with the final estimator. + """Apply transforms, and transform with the final estimator - This also works where final estimator is `None`: all prior + This also works where final estimator is ``None``: all prior transformations are applied. 
Parameters @@ -518,28 +539,25 @@ def transform(self): Returns ------- - Xt : array-like, shape = [n_resampled_samples, n_transformed_features] + Xt : array-like, shape = [n_samples, n_transformed_features] """ # _final_estimator is None or has transform, otherwise attribute error # XXX: Handling the None case means we can't use if_delegate_has_method if self._final_estimator != 'passthrough': self._final_estimator.transform - return partial(self._transform, with_final=True, with_resamplers=True) + return self._transform - def _transform(self, X, with_resamplers=True, with_final=True): + def _transform(self, X): Xt = X - for _, _, transform in self._iter(with_final=with_final, - with_resamplers=with_resamplers): + for _, _, transform in self._iter(): Xt = transform.transform(Xt) return Xt @property def inverse_transform(self): - """Apply inverse transformations in reverse order. Note that resamplers - are skipped. + """Apply inverse transformations in reverse order - All estimators in the pipeline, except resamplers, must support - `inverse_transform`. + All estimators in the pipeline must support ``inverse_transform``. Parameters ---------- @@ -555,21 +573,20 @@ def inverse_transform(self): """ # raise AttributeError if necessary for hasattr behaviour # XXX: Handling the None case means we can't use if_delegate_has_method - for _, _, transform in self._iter(with_resamplers=False): + for _, _, transform in self._iter(): transform.inverse_transform return self._inverse_transform def _inverse_transform(self, X): Xt = X - reverse_iter = reversed(list(self._iter(with_resamplers=False))) + reverse_iter = reversed(list(self._iter())) for _, _, transform in reverse_iter: Xt = transform.inverse_transform(Xt) return Xt @if_delegate_has_method(delegate='_final_estimator') def score(self, X, y=None, sample_weight=None): - """Apply transforms, and score with the final estimator. Note that - resamplers are not applied. + """Apply transforms, and score with the final estimator Parameters ---------- @@ -589,7 +606,9 @@ def score(self, X, y=None, sample_weight=None): ------- score : float """ - Xt = self._transform(X, with_final=False, with_resamplers=False) + Xt = X + for _, name, transform in self._iter(with_final=False): + Xt = transform.transform(Xt) score_params = {} if sample_weight is not None: score_params['sample_weight'] = sample_weight @@ -642,14 +661,18 @@ def make_pipeline(*steps, **kwargs): *steps : list of estimators. memory : None, str or object with the joblib.Memory interface, optional - Used to cache the fitted transformers of the pipeline. By default, no - caching is performed. If a string is given, it is the path to the - caching directory. Enabling caching triggers a clone of the - transformers/resamplers before fitting. Therefore, the estimator - instance given to the pipeline cannot be inspected directly. Use the - attribute `named_steps` or `steps` to inspect estimators within the - pipeline. Caching the estimators is advantageous when fitting is time - consuming. + Used to cache the fitted transformers of the pipeline. By default, + no caching is performed. If a string is given, it is the path to + the caching directory. Enabling caching triggers a clone of + the transformers before fitting. Therefore, the transformer + instance given to the pipeline cannot be inspected + directly. Use the attribute ``named_steps`` or ``steps`` to + inspect estimators within the pipeline. Caching the + transformers is advantageous when fitting is time consuming. 
+ + verbose : boolean, optional + If True, the time elapsed while fitting each step will be printed as it + is completed. See also -------- @@ -661,29 +684,19 @@ def make_pipeline(*steps, **kwargs): >>> from sklearn.naive_bayes import GaussianNB >>> from sklearn.preprocessing import StandardScaler >>> make_pipeline(StandardScaler(), GaussianNB(priors=None)) - ... # doctest: +NORMALIZE_WHITESPACE - Pipeline(memory=None, - steps=[('standardscaler', - StandardScaler(copy=True, with_mean=True, with_std=True)), - ('gaussiannb', - GaussianNB(priors=None, var_smoothing=1e-09))]) + Pipeline(steps=[('standardscaler', StandardScaler()), + ('gaussiannb', GaussianNB())]) Returns ------- p : Pipeline """ memory = kwargs.pop('memory', None) + verbose = kwargs.pop('verbose', False) if kwargs: raise TypeError('Unknown keyword arguments: "{}"' .format(list(kwargs.keys())[0])) - return Pipeline(_name_estimators(steps), memory=memory) - - -# weight and fit_params are not used but it allows _fit_one_transformer, -# _transform_one and _fit_transform_one to have the same signature to -# factorize the code in ColumnTransformer -def _fit_one_transformer(transformer, X, y, weight=None, **fit_params): - return transformer.fit(X, y) + return Pipeline(_name_estimators(steps), memory=memory, verbose=verbose) def _transform_one(transformer, X, y, weight, **fit_params): @@ -694,21 +707,41 @@ def _transform_one(transformer, X, y, weight, **fit_params): return res * weight -def _fit_transform_one(transformer, X, y, weight, **fit_params): - if hasattr(transformer, 'fit_transform'): - res = transformer.fit_transform(X, y, **fit_params) - else: - res = transformer.fit(X, y, **fit_params).transform(X) - # if we have a weight for this transformer, multiply output +def _fit_transform_one(transformer, + X, + y, + weight, + message_clsname='', + message=None, + **fit_params): + """ + Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned + with the fitted transformer. If ``weight`` is not ``None``, the result will + be multiplied by ``weight``. + """ + with _print_elapsed_time(message_clsname, message): + if hasattr(transformer, 'fit_transform'): + res = transformer.fit_transform(X, y, **fit_params) + else: + res = transformer.fit(X, y, **fit_params).transform(X) + if weight is None: return res, transformer return res * weight, transformer -def _fit_resample_one(sampler, X, y, **fit_params): - X_res, y_res = sampler.fit_resample(X, y, **fit_params) - - return X_res, y_res, sampler +def _fit_one(transformer, + X, + y, + weight, + message_clsname='', + message=None, + **fit_params): + """ + Fits ``transformer`` to ``X`` and ``y``. + """ + with _print_elapsed_time(message_clsname, message): + return transformer.fit(X, y, **fit_params) class FeatureUnion(_BaseComposition, TransformerMixin): @@ -741,6 +774,10 @@ class FeatureUnion(_BaseComposition, TransformerMixin): Multiplicative weights for features per transformer. Keys are transformer names, values the weights. + verbose : boolean, optional(default=False) + If True, the time elapsed while fitting each transformer will be + printed as it is completed. + See also -------- sklearn.pipeline.make_union : convenience function for simplified @@ -753,17 +790,18 @@ class FeatureUnion(_BaseComposition, TransformerMixin): >>> union = FeatureUnion([("pca", PCA(n_components=1)), ... 
("svd", TruncatedSVD(n_components=2))]) >>> X = [[0., 1., 3], [2., 2., 5]] - >>> union.fit_transform(X) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + >>> union.fit_transform(X) array([[ 1.5 , 3.0..., 0.8...], [-1.5 , 5.7..., -0.4...]]) """ _required_parameters = ["transformer_list"] def __init__(self, transformer_list, n_jobs=None, - transformer_weights=None): + transformer_weights=None, verbose=False): self.transformer_list = transformer_list self.n_jobs = n_jobs self.transformer_weights = transformer_weights + self.verbose = verbose self._validate_transformers() def get_params(self, deep=True): @@ -854,11 +892,11 @@ def fit(self, X, y=None): self : FeatureUnion This estimator """ - self.transformer_list = list(self.transformer_list) - self._validate_transformers() - transformers = Parallel(n_jobs=self.n_jobs)( - delayed(_fit_one_transformer)(trans, X, y) - for _, trans, _ in self._iter()) + transformers = self._parallel_func(X, y, {}, _fit_one) + if not transformers: + # All transformers are None + return self + self._update_transformer_list(transformers) return self @@ -879,23 +917,38 @@ def fit_transform(self, X, y=None, **fit_params): hstack of results of transformers. sum_n_components is the sum of n_components (output dimension) over transformers. """ - self._validate_transformers() - result = Parallel(n_jobs=self.n_jobs)( - delayed(_fit_transform_one)(trans, X, y, weight, - **fit_params) - for name, trans, weight in self._iter()) - - if not result: + results = self._parallel_func(X, y, fit_params, _fit_transform_one) + if not results: # All transformers are None return np.zeros((X.shape[0], 0)) - Xs, transformers = zip(*result) + + Xs, transformers = zip(*results) self._update_transformer_list(transformers) + if any(sparse.issparse(f) for f in Xs): Xs = sparse.hstack(Xs).tocsr() else: Xs = np.hstack(Xs) return Xs + def _log_message(self, name, idx, total): + if not self.verbose: + return None + return '(step %d of %d) Processing %s' % (idx, total, name) + + def _parallel_func(self, X, y, fit_params, func): + """Runs func in parallel on X and y""" + self.transformer_list = list(self.transformer_list) + self._validate_transformers() + transformers = list(self._iter()) + + return Parallel(n_jobs=self.n_jobs)(delayed(func)( + transformer, X, y, weight, + message_clsname='FeatureUnion', + message=self._log_message(name, idx, len(transformers)), + **fit_params) for idx, (name, transformer, + weight) in enumerate(transformers, 1)) + def transform(self, X): """Transform X separately by each transformer, concatenate results. @@ -946,6 +999,10 @@ def make_union(*transformers, **kwargs): ``-1`` means using all processors. See :term:`Glossary ` for more details. + verbose : boolean, optional(default=False) + If True, the time elapsed while fitting each transformer will be + printed as it is completed. 
+ Returns ------- f : FeatureUnion @@ -959,22 +1016,16 @@ def make_union(*transformers, **kwargs): -------- >>> from sklearn.decomposition import PCA, TruncatedSVD >>> from sklearn.pipeline import make_union - >>> make_union(PCA(), TruncatedSVD()) # doctest: +NORMALIZE_WHITESPACE - FeatureUnion(n_jobs=None, - transformer_list=[('pca', - PCA(copy=True, iterated_power='auto', - n_components=None, random_state=None, - svd_solver='auto', tol=0.0, whiten=False)), - ('truncatedsvd', - TruncatedSVD(algorithm='randomized', - n_components=2, n_iter=5, - random_state=None, tol=0.0))], - transformer_weights=None) + >>> make_union(PCA(), TruncatedSVD()) + FeatureUnion(transformer_list=[('pca', PCA()), + ('truncatedsvd', TruncatedSVD())]) """ n_jobs = kwargs.pop('n_jobs', None) + verbose = kwargs.pop('verbose', False) if kwargs: # We do not currently support `transformer_weights` as we may want to # change its type spec in make_union raise TypeError('Unknown keyword arguments: "{}"' .format(list(kwargs.keys())[0])) - return FeatureUnion(_name_estimators(transformers), n_jobs=n_jobs) + return FeatureUnion( + _name_estimators(transformers), n_jobs=n_jobs, verbose=verbose) diff --git a/sklearn/tests/test_pipeline.py b/sklearn/tests/test_pipeline.py index 6e2b156c21d2a..b40ca7778f2fa 100644 --- a/sklearn/tests/test_pipeline.py +++ b/sklearn/tests/test_pipeline.py @@ -1,45 +1,42 @@ """ Test the pipeline module. """ - from distutils.version import LooseVersion from tempfile import mkdtemp import shutil import time +import re +import itertools import pytest import numpy as np from scipy import sparse +import joblib from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_raises_regex from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_allclose from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_dict_equal from sklearn.utils.testing import assert_no_warnings from sklearn.base import clone, BaseEstimator -from sklearn.pipeline import Pipeline, make_pipeline -from sklearn.pipeline import FeatureUnion, make_union +from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, make_union from sklearn.svm import SVC +from sklearn.neighbors import LocalOutlierFactor from sklearn.linear_model import LogisticRegression, Lasso from sklearn.linear_model import LinearRegression from sklearn.cluster import KMeans from sklearn.feature_selection import SelectKBest, f_classif from sklearn.dummy import DummyRegressor from sklearn.decomposition import PCA, TruncatedSVD -from sklearn.datasets import load_iris, make_classification +from sklearn.datasets import load_iris from sklearn.preprocessing import StandardScaler from sklearn.feature_extraction.text import CountVectorizer -from sklearn.utils._joblib import Memory -from sklearn.utils._joblib import __version__ as joblib_version - -from sklearn.svm import OneClassSVM -from sklearn.neighbors import LocalOutlierFactor -R_TOL = 1e-4 JUNK_FOOD_DOCS = ( "the pizza pizza beer copyright", @@ -160,33 +157,6 @@ def predict(self, X, got_attribute=False): return self -class DummyResampler(NoTrans): - """Resampler which returns the same samples.""" - - def fit_resample(self, X, y): - self.means_ = np.mean(X, axis=0) - # store timestamp to figure out whether the result of 'fit' has been - # cached or not - self.timestamp_ = time.time() - return X, y - - -class 
FitTransformResample(NoTrans): - """Estimator implementing both transform and fit_resample.""" - - def fit(self, X, y, should_succeed=False): - pass - - def fit_resample(self, X, y=None): - return X, y - - def fit_transform(self, X, y=None): - return self.fit(X, y).transform(X) - - def transform(self, X, y=None): - return X - - def test_pipeline_init(): # Test the various init parameters of the pipeline. assert_raises(TypeError, Pipeline) @@ -216,10 +186,14 @@ def test_pipeline_init(): filter1 = SelectKBest(f_classif) pipe = Pipeline([('anova', filter1), ('svc', clf)]) + # Check that estimators are not cloned on pipeline construction + assert pipe.named_steps['anova'] is filter1 + assert pipe.named_steps['svc'] is clf + # Check that we can't instantiate with non-transformers on the way # Note that NoTrans implements fit, but not transform assert_raises_regex(TypeError, - 'All intermediate steps of Pipeline ' + 'All intermediate steps should be transformers' '.*\\bNoTrans\\b.*', Pipeline, [('t', NoTrans()), ('svc', clf)]) @@ -266,8 +240,6 @@ def test_pipeline_init_tuple(): pipe.score(X) -@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 -@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_pipeline_methods_anova(): # Test the various methods of the pipeline (anova). iris = load_iris() @@ -353,7 +325,7 @@ def test_pipeline_methods_pca_svm(): X = iris.data y = iris.target # Test with PCA + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) + clf = SVC(probability=True, random_state=0) pca = PCA(svd_solver='full', n_components='mle', whiten=True) pipe = Pipeline([('pca', pca), ('svc', clf)]) pipe.fit(X, y) @@ -363,6 +335,36 @@ def test_pipeline_methods_pca_svm(): pipe.score(X, y) +def test_pipeline_score_samples_pca_lof(): + iris = load_iris() + X = iris.data + # Test that the score_samples method is implemented on a pipeline. + # Test that the score_samples method on pipeline yields same results as + # applying transform and score_samples steps separately. + pca = PCA(svd_solver='full', n_components='mle', whiten=True) + lof = LocalOutlierFactor(novelty=True) + pipe = Pipeline([('pca', pca), ('lof', lof)]) + pipe.fit(X) + # Check the shapes + assert pipe.score_samples(X).shape == (X.shape[0],) + # Check the values + lof.fit(pca.fit_transform(X)) + assert_allclose(pipe.score_samples(X), lof.score_samples(pca.transform(X))) + + +def test_score_samples_on_pipeline_without_score_samples(): + X = np.array([[1], [2]]) + y = np.array([1, 2]) + # Test that a pipeline does not have score_samples method when the final + # step of the pipeline does not have score_samples defined. + pipe = make_pipeline(LogisticRegression()) + pipe.fit(X, y) + with pytest.raises(AttributeError, + match="'LogisticRegression' object has no attribute " + "'score_samples'"): + pipe.score_samples(X) + + def test_pipeline_methods_preprocessing_svm(): # Test the various methods of the pipeline (preprocessing + svm). 
iris = load_iris() @@ -372,8 +374,7 @@ def test_pipeline_methods_preprocessing_svm(): n_classes = len(np.unique(y)) scaler = StandardScaler() pca = PCA(n_components=2, svd_solver='randomized', whiten=True) - clf = SVC(gamma='scale', probability=True, random_state=0, - decision_function_shape='ovr') + clf = SVC(probability=True, random_state=0, decision_function_shape='ovr') for preprocessing in [scaler, pca]: pipe = Pipeline([('preprocess', preprocessing), ('svc', clf)]) @@ -562,6 +563,29 @@ def test_pipeline_fit_transform(): assert_array_almost_equal(X_trans, X_trans2) +def test_pipeline_slice(): + pipe = Pipeline([('transf1', Transf()), + ('transf2', Transf()), + ('clf', FitParamT())]) + pipe2 = pipe[:-1] + assert isinstance(pipe2, Pipeline) + assert pipe2.steps == pipe.steps[:-1] + assert 2 == len(pipe2.named_steps) + assert_raises(ValueError, lambda: pipe[::-1]) + + +def test_pipeline_index(): + transf = Transf() + clf = FitParamT() + pipe = Pipeline([('transf', transf), ('clf', clf)]) + assert pipe[0] == transf + assert pipe['transf'] == transf + assert pipe[-1] == clf + assert pipe['clf'] == clf + assert_raises(IndexError, lambda: pipe[3]) + assert_raises(KeyError, lambda: pipe['foobar']) + + def test_set_pipeline_steps(): transf1 = Transf() transf2 = Transf() @@ -657,6 +681,7 @@ def make(): 'memory': None, 'm2__mult': 2, 'last__mult': 5, + 'verbose': False }) pipeline.set_params(m2=passthrough) @@ -841,8 +866,6 @@ def test_feature_union_feature_names(): 'get_feature_names', ft.get_feature_names) -@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 -@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_classes_property(): iris = load_iris() X = iris.data @@ -953,8 +976,6 @@ def test_step_name_validation(): [[1]], [1]) -@pytest.mark.filterwarnings('ignore: Default solver will be changed') # 0.22 -@pytest.mark.filterwarnings('ignore: Default multi_class will') # 0.22 def test_set_params_nested_pipeline(): estimator = Pipeline([ ('a', Pipeline([ @@ -1008,13 +1029,13 @@ def test_pipeline_memory(): y = iris.target cachedir = mkdtemp() try: - if LooseVersion(joblib_version) < LooseVersion('0.12'): + if LooseVersion(joblib.__version__) < LooseVersion('0.12'): # Deal with change of API in joblib - memory = Memory(cachedir=cachedir, verbose=10) + memory = joblib.Memory(cachedir=cachedir, verbose=10) else: - memory = Memory(location=cachedir, verbose=10) + memory = joblib.Memory(location=cachedir, verbose=10) # Test with Transformer + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) + clf = SVC(probability=True, random_state=0) transf = DummyTransf() pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) cached_pipe = Pipeline([('transf', transf), ('svc', clf)], @@ -1048,7 +1069,7 @@ def test_pipeline_memory(): assert_equal(ts, cached_pipe.named_steps['transf'].timestamp_) # Create a new pipeline with cloned estimators # Check that even changing the name step does not affect the cache hit - clf_2 = SVC(gamma='scale', probability=True, random_state=0) + clf_2 = SVC(probability=True, random_state=0) transf_2 = DummyTransf() cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)], memory=memory) @@ -1068,366 +1089,77 @@ def test_pipeline_memory(): shutil.rmtree(cachedir) -def test_pipeline_memory_resampler(): - # TODO - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - 
cachedir = mkdtemp() - try: - memory = Memory(cachedir, verbose=10) - # Test with Transformer + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) - transf = DummyResampler() - pipe = Pipeline([('transf', clone(transf)), ('svc', clf)]) - cached_pipe = Pipeline( - [('transf', transf), ('svc', clf)], memory=memory) - - # Memoize the transformer at the first fit - cached_pipe.fit(X, y) - pipe.fit(X, y) - # Get the time stamp of the tranformer in the cached pipeline - expected_ts = cached_pipe.named_steps['transf'].timestamp_ - # Check that cached_pipe and pipe yield identical results - assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) - assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) - assert_array_equal( - pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) - assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) - assert_array_equal(pipe.named_steps['transf'].means_, - cached_pipe.named_steps['transf'].means_) - assert not hasattr(transf, 'means_') - # Check that we are reading the cache while fitting - # a second time - cached_pipe.fit(X, y) - # Check that cached_pipe and pipe yield identical results - assert_array_equal(pipe.predict(X), cached_pipe.predict(X)) - assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X)) - assert_array_equal( - pipe.predict_log_proba(X), cached_pipe.predict_log_proba(X)) - assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y)) - assert_array_equal(pipe.named_steps['transf'].means_, - cached_pipe.named_steps['transf'].means_) - assert cached_pipe.named_steps['transf'].timestamp_ == expected_ts - # Create a new pipeline with cloned estimators - # Check that even changing the name step does not affect the cache hit - clf_2 = SVC(gamma='scale', probability=True, random_state=0) - transf_2 = DummyResampler() - cached_pipe_2 = Pipeline( - [('transf_2', transf_2), ('svc', clf_2)], memory=memory) - cached_pipe_2.fit(X, y) - - # Check that cached_pipe and pipe yield identical results - assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X)) - assert_array_equal( - pipe.predict_proba(X), cached_pipe_2.predict_proba(X)) - assert_array_equal( - pipe.predict_log_proba(X), cached_pipe_2.predict_log_proba(X)) - assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y)) - assert_array_equal(pipe.named_steps['transf'].means_, - cached_pipe_2.named_steps['transf_2'].means_) - assert cached_pipe_2.named_steps['transf_2'].timestamp_ == expected_ts - finally: - shutil.rmtree(cachedir) - - -def test_pipeline_methods_pca_outlier_svm(): - # Test the various methods of the pipeline (pca + svm). - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - - # Test with PCA + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) - pca = PCA() - outlier = OneClassSVM(gamma='scale') - pipe = Pipeline([('pca', pca), ('outlier', outlier), ('svc', clf)]) - pipe.fit(X, y) - pipe.predict(X) - pipe.predict_proba(X) - pipe.predict_log_proba(X) - pipe.score(X, y) - - -def test_pipeline_methods_outlier_pca_svm(): - # Test the various methods of the pipeline (pca + svm). 
- X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - - # Test with PCA + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) - pca = PCA() - outlier = OneClassSVM(gamma='scale') - pipe = Pipeline([('outlier', outlier), ('pca', pca), ('svc', clf)]) - pipe.fit(X, y) - pipe.predict(X) - pipe.predict_proba(X) - pipe.predict_log_proba(X) - pipe.score(X, y) - - -def test_pipeline_resample(): - # Test whether pipeline works with a resampler at the end. - # Also test pipeline.fit_resample - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - - resampler = OneClassSVM(gamma='scale') - pipeline = Pipeline([('resampler', resampler)]) - - # test transform and fit_transform: - X_trans, y_trans = pipeline.fit_resample(X, y) - X_trans2, y_trans2 = resampler.fit_resample(X, y) - assert_array_equal(X_trans, X_trans2) - assert_array_equal(y_trans, y_trans2) - - pca = PCA() - pipeline = Pipeline([('pca', PCA()), ('resampler', resampler)]) - - X_trans, y_trans = pipeline.fit_resample(X, y) - X_pca = pca.fit_transform(X) - X_trans2, y_trans2 = resampler.fit_resample(X_pca, y) - assert_array_equal(X_trans, X_trans2) - assert_array_equal(y_trans, y_trans2) - - -@pytest.mark.parametrize('passthrough', [None, 'passthrough']) -def test_pipeline_none_classifier(passthrough): - # Test pipeline using None as preprocessing step and a classifier - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - clf = LogisticRegression(solver='lbfgs', random_state=0) - pipe = make_pipeline(passthrough, clf) - pipe.fit(X, y) - pipe.predict(X) - pipe.predict_proba(X) - pipe.decision_function(X) - pipe.score(X, y) - - -@pytest.mark.parametrize('passthrough', [None, 'passthrough']) -def test_pipeline_none_resampler_classifier(passthrough): - # Test pipeline using None, an outlier rejector and a classifier - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - clf = LogisticRegression(solver='lbfgs', random_state=0) - outlier = OneClassSVM(gamma='scale') - pipe = make_pipeline(passthrough, outlier, clf) - pipe.fit(X, y) - pipe.predict(X) - pipe.predict_proba(X) - pipe.decision_function(X) - pipe.score(X, y) - - -@pytest.mark.parametrize('passthrough', [None, 'passthrough']) -def test_pipeline_resampler_none_classifier(passthrough): - # Test pipeline using an outlier rejector, None and a classifier - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - clf = LogisticRegression(solver='lbfgs', random_state=0) - outlier = OneClassSVM(gamma='scale') - pipe = make_pipeline(outlier, passthrough, clf) - pipe.fit(X, y) - pipe.predict(X) - pipe.predict_proba(X) - pipe.decision_function(X) - pipe.score(X, y) - - -@pytest.mark.parametrize('passthrough', [None, 'passthrough']) -def test_pipeline_none_resampler_resample(passthrough): - # Test pipeline using None step and a resampler - X, y = make_classification(n_classes=2, class_sep=2, 
weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - - outlier = OneClassSVM(gamma='scale') - pipe = make_pipeline(passthrough, outlier) - pipe.fit_resample(X, y) - - -@pytest.mark.parametrize('passthrough', [None, 'passthrough']) -def test_pipeline_none_transformer(passthrough): - # Test pipeline using None and a transformer that implements transform and - # inverse_transform - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - - pca = PCA(whiten=True) - pipe = make_pipeline(passthrough, pca) - pipe.fit(X, y) - X_trans = pipe.transform(X) - X_inversed = pipe.inverse_transform(X_trans) - assert_array_almost_equal(X, X_inversed) - - -def test_pipeline_methods_anova_outlier(): - # Test the various methods of the pipeline (anova). - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - # Test with outlierdetection + Anova + LogisticRegression - clf = LogisticRegression(solver='lbfgs') - outlier = OneClassSVM(gamma='scale') - filter1 = SelectKBest(f_classif, k=2) - pipe = Pipeline([('outlier', outlier), - ('anova', filter1), - ('logistic', clf)]) - pipe.fit(X, y) - pipe.predict(X) - pipe.predict_proba(X) - pipe.predict_log_proba(X) - pipe.score(X, y) - - -def test_pipeline_with_step_that_implements_both_sample_and_transform(): - # Test the various methods of the pipeline (anova). - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - - clf = LogisticRegression(solver='lbfgs') - with pytest.raises(TypeError, match='should be estimators that implement'): - Pipeline([('step', FitTransformResample()), ('logistic', clf)]) - - -def test_pipeline_fit_then_sample_with_resampler_last_estimator(): - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - - outlier1 = OneClassSVM(gamma='scale') - outlier2 = LocalOutlierFactor(contamination=0.1) - pipeline = make_pipeline(outlier1, outlier2) - X_fit_resample_resampled, y_fit_resample_resampled = \ - pipeline.fit_resample(X, y) - pipeline = make_pipeline(outlier1, outlier2) - pipeline.fit(X, y) - X_fit_then_sample_res, y_fit_then_sample_res = pipeline.fit_resample(X, y) - assert_array_equal(X_fit_resample_resampled, X_fit_then_sample_res) - assert_array_equal(y_fit_resample_resampled, y_fit_then_sample_res) - - -def test_pipeline_fit_then_sample_3_resamplers_with_resampler_last_estimator(): - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) - - outlier1 = OneClassSVM(gamma='scale') - outlier2 = LocalOutlierFactor(contamination=0.1) - pipeline = make_pipeline(outlier2, outlier1, outlier2) - X_fit_resample, y_fit_resample = pipeline.fit_resample(X, y) - pipeline.fit(X, y) - X_fit_then_resample, y_fit_then_resample = pipeline.fit_resample(X, y) - - assert_array_equal(X_fit_resample, X_fit_then_resample) - assert_array_equal(y_fit_resample, y_fit_then_resample) - - def 
test_make_pipeline_memory(): cachedir = mkdtemp() - if LooseVersion(joblib_version) < LooseVersion('0.12'): + if LooseVersion(joblib.__version__) < LooseVersion('0.12'): # Deal with change of API in joblib - memory = Memory(cachedir=cachedir, verbose=10) + memory = joblib.Memory(cachedir=cachedir, verbose=10) else: - memory = Memory(location=cachedir, verbose=10) + memory = joblib.Memory(location=cachedir, verbose=10) pipeline = make_pipeline(DummyTransf(), SVC(), memory=memory) assert pipeline.memory is memory pipeline = make_pipeline(DummyTransf(), SVC()) assert pipeline.memory is None + assert len(pipeline) == 2 shutil.rmtree(cachedir) -@pytest.mark.parametrize('passthrough', [None, 'passthrough']) -def test_outlier_shape_correct_after_resample(passthrough): - X, y = make_classification() - - outlier = OneClassSVM(gamma='scale') - pca = PCA() - pipe = make_pipeline(outlier, passthrough) - pipe2 = make_pipeline(outlier, pca) - - outliers = outlier.fit_predict(X, y) == -1 - n_outliers = np.sum(outliers) - assert n_outliers > 0 # we have some outliers in the dataset - - X_new, y_new = pipe.fit_resample(X, y) - X_new2 = pipe2.fit_transform(X, y) - - assert X_new.shape[0] == X.shape[0] - n_outliers - assert y_new.shape[0] == y.shape[0] - n_outliers - assert X_new2.shape[0] == X.shape[0] - n_outliers - - -def test_resamplers_not_called(): - X, y = make_classification(n_samples=10) - - mul2 = Mult(2) - dre = DummyResampler() - mul3 = Mult(3) - - pipe = make_pipeline(mul2, dre, mul3) - pipe.fit(X, y) - - assert hasattr(dre, "means_") - delattr(dre, "means_") - - pipe.predict(X) - pipe.score(X) - assert not hasattr(dre, "means_") - - pipe.fit_transform(X, y) - assert hasattr(dre, "means_") - delattr(dre, "means_") - - pipe.fit(X, y) - assert hasattr(dre, "means_") +def test_pipeline_param_error(): + clf = make_pipeline(LogisticRegression()) + with pytest.raises(ValueError, match="Pipeline.fit does not accept " + "the sample_weight parameter"): + clf.fit([[0], [0]], [0, 1], sample_weight=[1, 1]) + + +parameter_grid_test_verbose = ((est, pattern, method) for + (est, pattern), method in itertools.product( + [ + (Pipeline([('transf', Transf()), ('clf', FitParamT())]), + r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n' + r'\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$'), + (Pipeline([('transf', Transf()), ('noop', None), + ('clf', FitParamT())]), + r'\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n' + r'\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n' + r'\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$'), + (Pipeline([('transf', Transf()), ('noop', 'passthrough'), + ('clf', FitParamT())]), + r'\[Pipeline\].*\(step 1 of 3\) Processing transf.* total=.*\n' + r'\[Pipeline\].*\(step 2 of 3\) Processing noop.* total=.*\n' + r'\[Pipeline\].*\(step 3 of 3\) Processing clf.* total=.*\n$'), + (Pipeline([('transf', Transf()), ('clf', None)]), + r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n' + r'\[Pipeline\].*\(step 2 of 2\) Processing clf.* total=.*\n$'), + (Pipeline([('transf', None), ('mult', Mult())]), + r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n' + r'\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$'), + (Pipeline([('transf', 'passthrough'), ('mult', Mult())]), + r'\[Pipeline\].*\(step 1 of 2\) Processing transf.* total=.*\n' + r'\[Pipeline\].*\(step 2 of 2\) Processing mult.* total=.*\n$'), + (FeatureUnion([('mult1', Mult()), ('mult2', Mult())]), + r'\[FeatureUnion\].*\(step 1 of 2\) Processing 
mult1.* total=.*\n' + r'\[FeatureUnion\].*\(step 2 of 2\) Processing mult2.* total=.*\n$'), + (FeatureUnion([('mult1', None), ('mult2', Mult()), ('mult3', None)]), + r'\[FeatureUnion\].*\(step 1 of 1\) Processing mult2.* total=.*\n$') + ], ['fit', 'fit_transform', 'fit_predict']) + if hasattr(est, method) and not ( + method == 'fit_transform' and hasattr(est, 'steps') and + isinstance(est.steps[-1][1], FitParamT)) +) -def test_clusterer_and_resampler_error(): - X, y = make_classification(n_samples=10) +@pytest.mark.parametrize('est, pattern, method', parameter_grid_test_verbose) +def test_verbose(est, method, pattern, capsys): + func = getattr(est, method) - dre = DummyResampler() - pipe = make_pipeline(dre, KMeans()) - msg = "have an estimator implementing fit_predict as their last stage" - with pytest.raises(NotImplementedError, - match=msg): - pipe.fit_predict(X, y) + X = [[1, 2, 3], [4, 5, 6]] + y = [[7], [8]] + est.set_params(verbose=False) + func(X, y) + assert not capsys.readouterr().out, 'Got output for verbose=False' -@pytest.mark.parametrize('passthrough', [None, 'passthrough']) -def test_pipe_exposes_resample_correctly(passthrough): - # this test will be handled by test_metaestimators later, it's just here - # now for simplicity - # TODO make this test pass (maybe something similar is also broken for - # fit_transform - X, y = make_classification(n_samples=10) - - dre = DummyResampler() - mul3 = Mult(3) - pipe = make_pipeline(dre, mul3) - pipe2 = make_pipeline(dre, passthrough) - - assert not hasattr(pipe, 'fit_resample') - assert hasattr(pipe2, 'fit_resample') + est.set_params(verbose=True) + func(X, y) + assert re.match(pattern, capsys.readouterr().out) From 24de6e216b7a88ae33239d6573d7609b027f3e20 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 26 Jun 2019 01:11:20 +1000 Subject: [PATCH 24/46] WIP ResampledTransformer --- doc/modules/classes.rst | 1 + doc/modules/compose.rst | 92 ++++++++++++------------------- sklearn/base.py | 6 +- sklearn/compose/__init__.py | 2 + sklearn/utils/__init__.py | 2 + sklearn/utils/estimator_checks.py | 2 +- 6 files changed, 44 insertions(+), 61 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index 4750ccdeb8dad..2cc1335c22765 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -166,6 +166,7 @@ details. :template: class.rst compose.ColumnTransformer + compose.ResampledTrainer compose.TransformedTargetRegressor .. autosummary:: diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 379a3a01da5c1..98c94df5b8410 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -162,63 +162,6 @@ i.e. if the last estimator is a classifier, the :class:`Pipeline` can be used as a classifier. If the last estimator is a transformer, again, so is the pipeline. -.. _pipeline_resamplers: - -Resamplers in pipelines ------------------------ -In addition to transformers, pipelines also support :ref:`resamplers` as -intermediate steps. However, unlike for transformers, pipelines do not always -call resamplers when data flows through the pipeline. In summary: - -===================== ================================ -Method Resamplers applied -===================== ================================ -``fit`` Yes -``fit_transform`` Yes -``transform`` Yes -``predict`` No -``score`` No -``fit_predict`` not supported (see note) -===================== ================================ - Note) - -To understand why, consider the example of :ref:`outlier rejectors`. 
These -resamplers will remove samples from the dataset if they classified as outliers. -Consider the following pipeline:: - - >>> from sklearn.pipeline import make_pipeline - >>> from sklearn.covariance import EllipticEnvelope - >>> from sklearn.linear_model import LogisticRegression - >>> pipe = make_pipeline(EllipticEnvelope(), LogisticRegression()) # doctest: +NORMALIZE_WHITESPACE - >>> pipe.fit(X_train, y_train) - -In ``pipe``, we would remove outliers before fitting our `LogisticRegression` -model, so that the samples passed to fit come from the same distribution. We do -this to improve the quality of the fit (see :ref:`_outlier_detection`). -Therefore, during ``fit``, we want our resampler to be applied. - -Now assume that we would like to make predictions on some new data ``X_test``:: - - >>> predictions = pipe.predict(X_test) - -If we applied our resampler, it would remove outliers from ``X_test``. This is -nonsensical for two reasons: -1. We would not get predictions for samples that were classified as outliers. -2. Since resamplers are always fitted on the data they will predict on, the - notion of an outlier in the ``X_test`` is not consistent with the notion of - an outlier in ``X_train``. A sample could be an outlier in ``X_train``, but - an inlier in ``X_test``, depending on the other samples passed. - -Therefore, all methods in which predictions are made will skip resamplers in -the pipeline. - -.. note:: - - If a pipeline contains resamplers, you may not call :term:`fit_predict` on - it. For reasons described above, it is not practical to apply resamplers - when predictions are being made. Therefore, we would not be able to apply - the resamplers in such a call. - .. _pipeline_cache: Caching transformers: avoid repeated computation @@ -295,6 +238,41 @@ object:: * :ref:`sphx_glr_auto_examples_compose_plot_compare_reduction.py` +.. _pipeline_resamplers: + +Resampling or modifying samples in training +=========================================== + +All transformers in a Pipeline must output a dataset with samples corresponding +to their input. Sometimes you want a process to modify the set of samples +used in training, such as balanced resampling, outlier removal, or data +augmentation/perturbation. Such processes are called Resamplers, rather than +Transformers, in scikit-learn, and should be composed with a predictor using +a :class:`compose.ResampledTrainer` rather than a Pipeline. Resamplers provide +a `fit_resample` method which is called by the ``ResampledTrainer`` when +fitting, so that the resampled data is used to train the subsequent predictor. + +:ref:`outlier rejectors` provide `fit_resample` methods that remove samples +from the dataset if they are classified as outliers. Consider the following:: + + >>> from sklearn.compose import ResampledTrainer + >>> from sklearn.covariance import EllipticEnvelope + >>> from sklearn.linear_model import LogisticRegression + >>> pipe = ResampledTrainer(EllipticEnvelope(), LogisticRegression()) + >>> pipe.fit(X_train, y_train) + +In ``pipe``, we remove outliers before fitting our `LogisticRegression` +model, so that the samples passed to fit come from the same distribution. We do +this to improve the quality of the fit (see :ref:`outlier_detection`). +Therefore, during ``fit``, we want our resampler to be applied. + +Now assume that we would like to make predictions on some new data ``X_test``:: + + >>> predictions = pipe.predict(X_test) + +This does not apply resampling, but provides predictions for all samples in +``X_test``. + ..
_transformed_target_regressor: Transforming target in regression diff --git a/sklearn/base.py b/sklearn/base.py index e133c6a6555bc..0f477067b796a 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -14,7 +14,7 @@ from . import __version__ from sklearn.utils import _IS_32BIT -from sklearn.utils import safe_mask +from sklearn.utils import safe_indexing _DEFAULT_TAGS = { 'non_deterministic': False, @@ -630,9 +630,9 @@ def fit_resample(self, X, y): The original y with outlier samples removed. """ - inliers = safe_mask(X, self.fit_predict(X) == 1) + inliers = self.fit_predict(X) == 1 - return X[inliers], y[inliers] + return safe_indexing(X, inliers), safe_indexing(y, inliers) class MetaEstimatorMixin: diff --git a/sklearn/compose/__init__.py b/sklearn/compose/__init__.py index 1cfd53c50d682..80e537c741de2 100644 --- a/sklearn/compose/__init__.py +++ b/sklearn/compose/__init__.py @@ -7,10 +7,12 @@ from ._column_transformer import ColumnTransformer, make_column_transformer from ._target import TransformedTargetRegressor +from ._resampled import ResampledTrainer __all__ = [ 'ColumnTransformer', 'make_column_transformer', 'TransformedTargetRegressor', + 'ResampledTrainer', ] diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index daf7e7763235d..d54519a7318a7 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -217,6 +217,8 @@ def safe_indexing(X, indices): indices.dtype.kind == 'i'): # This is often substantially faster than X[indices] return X.take(indices, axis=0) + elif getattr(X, 'format', None) in {'dia', 'bsr', 'coo'}: + return X.tocsr()[indices] else: return X[indices] else: diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 8d00d6c8e0641..1e9e9e7a67d3d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1190,7 +1190,7 @@ def check_estimators_empty_data_messages(name, estimator_orig): # the following y should be accepted by both classifiers and regressors # and ignored by unsupervised models y = multioutput_estimator_convert_y_2d(e, np.array([1, 0, 1])) - msg = (r"0 feature\(s\) \(shape=\(3, 0\)\) while a minimum of \d* " + msg = (r"0 feature\(s\) \(shape=\(\d*, 0\)\) while a minimum of \d* " "is required.") assert_raises_regex(ValueError, msg, e.fit, X_zero_features, y) From 19023ea730db674e9fceb4257808f36fed4b6958 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 26 Jun 2019 12:36:07 +1000 Subject: [PATCH 25/46] Add missing ResampledTrainer files --- sklearn/compose/_resampled.py | 99 +++++++++++++++++++++++++ sklearn/compose/tests/test_resampled.py | 60 +++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 sklearn/compose/_resampled.py create mode 100644 sklearn/compose/tests/test_resampled.py diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py new file mode 100644 index 0000000000000..db2c2e6fb7e98 --- /dev/null +++ b/sklearn/compose/_resampled.py @@ -0,0 +1,99 @@ +from ..base import BaseEstimator, MetaEstimatorMixin, clone +from ..utils.metaestimators import if_delegate_has_method + + +class ResampledTrainer(MetaEstimatorMixin, BaseEstimator): + """Composition of a resampler and a predictor/transformer + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + resampler : Estimator supporting fit_resample + predictor : Estimator + + Attributes + ---------- + resampler_ : Estimator + Fitted clone of `resampler`. + + predictor_ : Estimator + Fitted clone of `predictor`.
+ + Examples + -------- + >>> from sklearn.base import BaseEstimator + >>> from sklearn.compose import ResampledTrainer + >>> from sklearn.datasets import load_iris + >>> from sklearn.linear_model import LogisticRegression + >>> + >>> class HalfSampler(BaseEstimator): + ... "Train with every second sample" + ... def fit_resample(self, X, y, **kw): + ... return X[::2], y[::2] + >>> + >>> est = ResampledTrainer(HalfSampler(), LogisticRegression()) + >>> est.fit(X, y) + >>> est.predict(X[:2]) + """ + + def __init__(self, resampler, predictor): + self.resampler = resampler + self.predictor = predictor + + # TODO: tags? + + def fit(self, X, y=None, **kw): + self.resampler_ = clone(self.resampler) + ret = self.resampler_.fit_resample(X, y, **kw) + if len(ret) == 2: + kw = {} + X, y = ret + else: + X, y, kw = ret + self.predictor_ = clone(self.predictor).fit(X, y, **kw) + return self + + @if_delegate_has_method(delegate='predictor_') + def predict(self, X, **predict_params): + return self.predictor_.predict(X, **predict_params) + + @if_delegate_has_method(delegate='predictor_') + def predict_proba(self, X): + return self.predictor_.predict_proba(X) + + @if_delegate_has_method(delegate='predictor_') + def predict_log_proba(self, X): + return self.predictor_.predict_log_proba(X) + + @if_delegate_has_method(delegate='predictor_') + def decision_function(self, X): + return self.predictor_.decision_function(X) + + @if_delegate_has_method(delegate='predictor_') + def score(self, X, y, **kw): + return self.predictor_.score(X, y, **kw) + + @property + def fit_transform(self): + transform = self.predictor_.transform + + def fit_transform(X, y, **kwargs): + self.fit(X, y, **kwargs) + return transform(X) + + @property + def fit_predict(self): + predict = self.predictor_.predict + + def fit_predict(X, y, **kwargs): + self.fit(X, y, **kwargs) + return predict(X) + + @property + def _estimator_type(self): + return self.predictor._estimator_type + + @property + def classes_(self): + return self.predictor_.classes_ diff --git a/sklearn/compose/tests/test_resampled.py b/sklearn/compose/tests/test_resampled.py new file mode 100644 index 0000000000000..fa488891f4ac3 --- /dev/null +++ b/sklearn/compose/tests/test_resampled.py @@ -0,0 +1,60 @@ +from sklearn.base import BaseEstimator +from sklearn.datasets import make_classification +from sklearn.svm import SVC, OneClassSVM +from sklearn.decomposition import PCA +from sklearn.pipeline import Pipeline +from sklearn.compose import ResampledTrainer +from sklearn.utils.estimator_checks import check_estimator +from sklearn.utils.validation import _num_samples + + +class HalfSampler(BaseEstimator): + "Train with every odd sample" + def fit_resample(self, X, y, **kw): + if _num_samples(X) > 1 and getattr(X, 'format', None) not in ['dia', + 'coo', + 'bsr']: + return X[::2], y[::2] + return X, y + + +def test_estimator_checks(): + check_estimator(ResampledTrainer(HalfSampler(), SVC())) + + +def test_pca_outlier_svm(): + # Test the various methods of the pipeline (pca + svm). 
+    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) + + # Test with PCA + SVC + clf = SVC(gamma='scale', probability=True, random_state=0) + pca = PCA() + outlier = OneClassSVM(gamma='scale') + pipe = Pipeline([('pca', pca), ('clf', ResampledTrainer(outlier, clf))]) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.predict_log_proba(X) + pipe.score(X, y) + + +def test_outlier_pca_svm(): + # Test the various methods of the pipeline (pca + svm). + X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], + n_informative=3, n_redundant=1, flip_y=0, + n_features=20, n_clusters_per_class=1, + n_samples=500, random_state=0) + + # Test with PCA + SVC + clf = SVC(gamma='scale', probability=True, random_state=0) + pca = PCA() + outlier = OneClassSVM(gamma='scale') + pipe = ResampledTrainer(outlier, Pipeline([('pca', pca), ('svc', clf)])) + pipe.fit(X, y) + pipe.predict(X) + pipe.predict_proba(X) + pipe.predict_log_proba(X) + pipe.score(X, y) From 082560a65148d2b2b661c663447ca73473057fbf Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 26 Jun 2019 18:30:24 +1000 Subject: [PATCH 26/46] predictor -> estimator; _required_parameters to skip common test --- sklearn/compose/_resampled.py | 44 +++++++++++++------------ sklearn/compose/tests/test_resampled.py | 9 +++-- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py index db2c2e6fb7e98..7ddadfcd86a34 100644 --- a/sklearn/compose/_resampled.py +++ b/sklearn/compose/_resampled.py @@ -3,22 +3,22 @@ class ResampledTrainer(MetaEstimatorMixin, BaseEstimator): - """Composition of a resampler and a predictor/transformer + """Composition of a resampler and an estimator Read more in the :ref:`User Guide `. Parameters ---------- resampler : Estimator supporting fit_resample - predictor : Estimator + estimator : Estimator Attributes ---------- resampler_ : Estimator Fitted clone of `resampler`. - predictor_ : Estimator - Fitted clone of `predictor`. + estimator_ : Estimator + Fitted clone of `estimator`. Examples -------- @@ -37,9 +37,11 @@ class ResampledTrainer(MetaEstimatorMixin, BaseEstimator): >>> est.predict(X[:2]) """ - def __init__(self, resampler, predictor): + def __init__(self, resampler, estimator): self.resampler = resampler - self.predictor = predictor + self.estimator = estimator + + _required_parameters = ['resampler', 'estimator'] # TODO: tags?
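For orientation between these hunks, here is a minimal sketch of how the composition introduced in this patch is meant to be used. It assumes the `ResampledTrainer(resampler, estimator)` API above and the resampler contract `fit_resample(X, y) -> (X_resampled, y_resampled)`; `EveryOtherSampler` is an illustrative name, not part of the diff::

    from sklearn.base import BaseEstimator
    from sklearn.compose import ResampledTrainer
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression

    class EveryOtherSampler(BaseEstimator):
        """Illustrative resampler: keep every second training sample."""
        def fit_resample(self, X, y, **kw):
            return X[::2], y[::2]

    X, y = make_classification(n_samples=100, random_state=0)
    est = ResampledTrainer(EveryOtherSampler(), LogisticRegression())
    est.fit(X, y)             # resamples to 50 rows, then fits the estimator clone
    preds = est.predict(X)    # predict never resamples: one prediction per input row
    assert preds.shape[0] == X.shape[0]

Because ``fit`` stores the fitted clone as ``estimator_``, repeated fits start from a fresh clone rather than mutating the constructor argument.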
@@ -51,32 +53,32 @@ def fit(self, X, y=None, **kw): X, y = ret else: X, y, kw = ret - self.predictor_ = clone(self.predictor).fit(X, y, **kw) + self.estimator_ = clone(self.estimator).fit(X, y, **kw) return self - @if_delegate_has_method(delegate='predictor_') + @if_delegate_has_method(delegate='estimator_') def predict(self, X, **predict_params): - return self.predictor_.predict(X, **predict_params) + return self.estimator_.predict(X, **predict_params) - @if_delegate_has_method(delegate='predictor_') + @if_delegate_has_method(delegate='estimator_') def predict_proba(self, X): - return self.predictor_.predict_proba(X) + return self.estimator_.predict_proba(X) - @if_delegate_has_method(delegate='predictor_') + @if_delegate_has_method(delegate='estimator_') def predict_log_proba(self, X): - return self.predictor_.predict_log_proba(X) + return self.estimator_.predict_log_proba(X) - @if_delegate_has_method(delegate='predictor_') + @if_delegate_has_method(delegate='estimator_') def decision_function(self, X): - return self.predictor_.decision_function(X) + return self.estimator_.decision_function(X) - @if_delegate_has_method(delegate='predictor_') + @if_delegate_has_method(delegate='estimator_') def score(self, X, y, **kw): - return self.predictor_.score(X, y, **kw) + return self.estimator_.score(X, y, **kw) @property def fit_transform(self): - transform = self.predictor_.transform + transform = self.estimator_.transform def fit_transform(X, y, **kwargs): self.fit(X, y, **kwargs) @@ -84,7 +86,7 @@ def fit_transform(X, y, **kwargs): @property def fit_predict(self): - predict = self.predictor_.predict + predict = self.estimator_.predict def fit_predict(X, y, **kwargs): self.fit(X, y, **kwargs) @@ -92,8 +94,8 @@ def fit_predict(X, y, **kwargs): @property def _estimator_type(self): - return self.predictor._estimator_type + return self.estimator._estimator_type @property def classes_(self): - return self.predictor_.classes_ + return self.estimator_.classes_ diff --git a/sklearn/compose/tests/test_resampled.py b/sklearn/compose/tests/test_resampled.py index fa488891f4ac3..8c93bcd1ac6ab 100644 --- a/sklearn/compose/tests/test_resampled.py +++ b/sklearn/compose/tests/test_resampled.py @@ -5,15 +5,14 @@ from sklearn.pipeline import Pipeline from sklearn.compose import ResampledTrainer from sklearn.utils.estimator_checks import check_estimator -from sklearn.utils.validation import _num_samples +from sklearn.utils.validation import _num_samples, check_X_y class HalfSampler(BaseEstimator): - "Train with every odd sample" + "Train with every second sample" def fit_resample(self, X, y, **kw): - if _num_samples(X) > 1 and getattr(X, 'format', None) not in ['dia', - 'coo', - 'bsr']: + X, y = check_X_y(X, y, accept_sparse='csr') + if _num_samples(X) > 1: return X[::2], y[::2] return X, y From 4fc58303dc7bcecd4f1d786aa55e433052045318 Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 26 Jun 2019 18:48:26 +1000 Subject: [PATCH 27/46] Fix missing import --- sklearn/utils/estimator_checks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 2072150ac33b5..4d151f534edb9 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -46,6 +46,7 @@ from sklearn.pipeline import make_pipeline from sklearn.exceptions import DataConversionWarning from sklearn.exceptions import SkipTestWarning +from sklearn.exceptions import NotFittedError from sklearn.model_selection import train_test_split from 
sklearn.model_selection import ShuffleSplit from sklearn.model_selection._validation import _safe_split From 6a701513476ee97553c568ca617f9791617f21da Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 26 Jun 2019 18:49:04 +1000 Subject: [PATCH 28/46] Load data in doctest --- sklearn/compose/_resampled.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py index 7ddadfcd86a34..83bc29f67b3c4 100644 --- a/sklearn/compose/_resampled.py +++ b/sklearn/compose/_resampled.py @@ -33,6 +33,7 @@ class ResampledTrainer(MetaEstimatorMixin, BaseEstimator): ... return X[::2], y[::2] >>> >>> est = ResampledTrainer(HalfSampler(), LogisticRegression()) + >>> X, y = load_iris(return_X_y=True) >>> est.fit(X, y) >>> est.predict(X[:2]) """ From d45004709d4016b9f22b46671625d7515881cdea Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 26 Jun 2019 18:50:24 +1000 Subject: [PATCH 29/46] Note authorship on ResampledTrainer Even if it has multiple concurrent inventors --- sklearn/compose/_resampled.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py index 83bc29f67b3c4..5f1bcffaf8eeb 100644 --- a/sklearn/compose/_resampled.py +++ b/sklearn/compose/_resampled.py @@ -1,3 +1,5 @@ +# Author: Joel Nothman + from ..base import BaseEstimator, MetaEstimatorMixin, clone from ..utils.metaestimators import if_delegate_has_method From ada6bad9d62c5d06537e722b37d5bdc126dbc43f Mon Sep 17 00:00:00 2001 From: Joel Nothman Date: Wed, 26 Jun 2019 21:05:09 +1000 Subject: [PATCH 30/46] Doctest outputs --- sklearn/compose/_resampled.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py index 5f1bcffaf8eeb..a90699d8ed229 100644 --- a/sklearn/compose/_resampled.py +++ b/sklearn/compose/_resampled.py @@ -37,7 +37,9 @@ class ResampledTrainer(MetaEstimatorMixin, BaseEstimator): >>> est = ResampledTrainer(HalfSampler(), LogisticRegression()) >>> X, y = load_iris(return_X_y=True) >>> est.fit(X, y) + ResampledTrainer(estimator=LogisticRegression(), resampler=HalfSampler()) >>> est.predict(X[:2]) + array([0, 0]) """ def __init__(self, resampler, estimator): From e5bcb3434e0d4f04344b07a5962c38090f86242f Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Thu, 27 Jun 2019 01:17:51 +0200 Subject: [PATCH 31/46] Add ResampledTrainer to metaestimator tests and add test_correct_halfsampler --- sklearn/compose/_resampled.py | 38 ++++++++---- sklearn/compose/tests/test_resampled.py | 77 ++++++++++++++++++++----- sklearn/tests/test_metaestimators.py | 12 +++- 3 files changed, 100 insertions(+), 27 deletions(-) diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py index a90699d8ed229..6373cb509b975 100644 --- a/sklearn/compose/_resampled.py +++ b/sklearn/compose/_resampled.py @@ -2,6 +2,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, clone from ..utils.metaestimators import if_delegate_has_method +from ..utils.validation import check_is_fitted class ResampledTrainer(MetaEstimatorMixin, BaseEstimator): @@ -12,6 +13,7 @@ class ResampledTrainer(MetaEstimatorMixin, BaseEstimator): Parameters ---------- resampler : Estimator supporting fit_resample + estimator : Estimator Attributes @@ -46,7 +48,7 @@ def __init__(self, resampler, estimator): self.resampler = resampler self.estimator = estimator - _required_parameters = ['resampler', 'estimator'] + _required_parameters = ["resampler", "estimator"] # TODO: tags? 
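A hedged sketch of the behaviour the `if_delegate_has_method`/`check_is_fitted` changes in this patch aim at (`PassThroughSampler` is illustrative, not part of the diff): delegating on the unfitted `estimator` attribute lets `hasattr` reflect the wrapped estimator before `fit`, while `check_is_fitted` still rejects calls on an unfitted composition::

    from sklearn.base import BaseEstimator
    from sklearn.compose import ResampledTrainer
    from sklearn.exceptions import NotFittedError
    from sklearn.linear_model import LinearRegression, LogisticRegression

    class PassThroughSampler(BaseEstimator):
        """Illustrative resampler that returns the data unchanged."""
        def fit_resample(self, X, y, **kw):
            return X, y

    clf = ResampledTrainer(PassThroughSampler(), LogisticRegression())
    reg = ResampledTrainer(PassThroughSampler(), LinearRegression())
    assert hasattr(clf, "predict_proba")      # LogisticRegression provides it
    assert not hasattr(reg, "predict_proba")  # LinearRegression does not
    try:
        clf.predict([[0.0], [1.0]])           # delegation resolves, but nothing is fitted
    except NotFittedError:
        print("not fitted yet")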
@@ -61,41 +63,57 @@ def fit(self, X, y=None, **kw): self.estimator_ = clone(self.estimator).fit(X, y, **kw) return self - @if_delegate_has_method(delegate='estimator_') + @if_delegate_has_method(delegate="estimator") def predict(self, X, **predict_params): + check_is_fitted(self, "estimator_") return self.estimator_.predict(X, **predict_params) - @if_delegate_has_method(delegate='estimator_') + @if_delegate_has_method(delegate="estimator") + def transform(self, X): + check_is_fitted(self, "estimator_") + return self.estimator_.transform(X) + + @if_delegate_has_method(delegate="estimator") def predict_proba(self, X): + check_is_fitted(self, "estimator_") return self.estimator_.predict_proba(X) - @if_delegate_has_method(delegate='estimator_') + @if_delegate_has_method(delegate="estimator") def predict_log_proba(self, X): + check_is_fitted(self, "estimator_") return self.estimator_.predict_log_proba(X) - @if_delegate_has_method(delegate='estimator_') + @if_delegate_has_method(delegate="estimator") def decision_function(self, X): + check_is_fitted(self, "estimator_") return self.estimator_.decision_function(X) - @if_delegate_has_method(delegate='estimator_') + @if_delegate_has_method(delegate="estimator") def score(self, X, y, **kw): + check_is_fitted(self, "estimator_") return self.estimator_.score(X, y, **kw) @property def fit_transform(self): - transform = self.estimator_.transform + # check if the estimator has a transform function + transform = self.estimator.transform def fit_transform(X, y, **kwargs): self.fit(X, y, **kwargs) - return transform(X) + # since estimator_ exists now, we can return transform + return self.estimator_.transform(X) + + return fit_transform @property def fit_predict(self): - predict = self.estimator_.predict + predict = self.estimator.predict def fit_predict(X, y, **kwargs): self.fit(X, y, **kwargs) - return predict(X) + return self.estimator_.predict(X) + + return fit_predict @property def _estimator_type(self): diff --git a/sklearn/compose/tests/test_resampled.py b/sklearn/compose/tests/test_resampled.py index 8c93bcd1ac6ab..157a7e3041068 100644 --- a/sklearn/compose/tests/test_resampled.py +++ b/sklearn/compose/tests/test_resampled.py @@ -1,3 +1,6 @@ +# Authors: Joel Nothman +# Oliver Rausch +import numpy as np from sklearn.base import BaseEstimator from sklearn.datasets import make_classification from sklearn.svm import SVC, OneClassSVM @@ -10,29 +13,65 @@ class HalfSampler(BaseEstimator): "Train with every second sample" + def fit_resample(self, X, y, **kw): - X, y = check_X_y(X, y, accept_sparse='csr') + X, y = check_X_y(X, y, accept_sparse="csr") if _num_samples(X) > 1: return X[::2], y[::2] return X, y +class DataSaver(BaseEstimator): + "remembers the data that it was fitted with" + + def fit(self, X, y): + self.X = X + self.y = y + return self + + def predict(self, X): + return np.zeros((X.shape[0],)) + + def transform(self, X): + return np.zeros((X.shape[0],)) + + def test_estimator_checks(): check_estimator(ResampledTrainer(HalfSampler(), SVC())) +def test_correct_halfsampler(): + # check that the estimator is fitted with the correct data + X = np.zeros((10, 2)) + y = np.arange(10) + + rt = ResampledTrainer(HalfSampler(), DataSaver()) + for method in [rt.fit, rt.fit_transform, rt.fit_predict]: + method(X, y) + + np.testing.assert_array_equal(rt.estimator_.y, np.array([0, 2, 4, 6, 8])) + + def test_pca_outlier_svm(): # Test the various methods of the pipeline (pca + svm). 
- X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0, + ) # Test with PCA + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) + clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA() - outlier = OneClassSVM(gamma='scale') - pipe = Pipeline([('pca', pca), ('clf', ResampledTrainer(outlier, clf))]) + outlier = OneClassSVM(gamma="scale") + pipe = Pipeline([("pca", pca), ("clf", ResampledTrainer(outlier, clf))]) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) @@ -42,16 +81,24 @@ def test_pca_outlier_svm(): def test_outlier_pca_svm(): # Test the various methods of the pipeline (pca + svm). - X, y = make_classification(n_classes=2, class_sep=2, weights=[0.1, 0.9], - n_informative=3, n_redundant=1, flip_y=0, - n_features=20, n_clusters_per_class=1, - n_samples=500, random_state=0) + X, y = make_classification( + n_classes=2, + class_sep=2, + weights=[0.1, 0.9], + n_informative=3, + n_redundant=1, + flip_y=0, + n_features=20, + n_clusters_per_class=1, + n_samples=500, + random_state=0, + ) # Test with PCA + SVC - clf = SVC(gamma='scale', probability=True, random_state=0) + clf = SVC(gamma="scale", probability=True, random_state=0) pca = PCA() - outlier = OneClassSVM(gamma='scale') - pipe = ResampledTrainer(outlier, Pipeline([('pca', pca), ('svc', clf)])) + outlier = OneClassSVM(gamma="scale") + pipe = ResampledTrainer(outlier, Pipeline([("pca", pca), ("svc", clf)])) pipe.fit(X, y) pipe.predict(X) pipe.predict_proba(X) diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 822dd0edb5501..9bb498e12539a 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -13,6 +13,7 @@ from sklearn.feature_selection import RFE, RFECV from sklearn.ensemble import BaggingClassifier from sklearn.exceptions import NotFittedError +from sklearn.compose import ResampledTrainer class DelegatorData: @@ -23,6 +24,10 @@ def __init__(self, name, construct, skip_methods=(), self.fit_args = fit_args self.skip_methods = skip_methods +class DummyResampler(BaseEstimator): + "does nothing" + def fit_resample(self, X, y): + return X, y DELEGATING_METAESTIMATORS = [ DelegatorData('Pipeline', lambda est: Pipeline([('est', est)])), @@ -41,7 +46,10 @@ def __init__(self, name, construct, skip_methods=(), DelegatorData('BaggingClassifier', BaggingClassifier, skip_methods=['transform', 'inverse_transform', 'score', 'predict_proba', 'predict_log_proba', - 'predict']) + 'predict']), + DelegatorData('ResampledTrainer', + lambda est: ResampledTrainer(DummyResampler(), est), + skip_methods=['inverse_transform']) ] @@ -62,7 +70,7 @@ def __init__(self, param=1, hidden_method=None): def fit(self, X, y=None, *args, **kwargs): self.coef_ = np.arange(X.shape[1]) - return True + return self def _check_fit(self): check_is_fitted(self, 'coef_') From fbdb966d26e9c9a427735ce8fbb36a8bb275929a Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Thu, 27 Jun 2019 01:36:08 +0200 Subject: [PATCH 32/46] Formatting --- doc/modules/compose.rst | 6 +- sklearn/compose/_resampled.py | 4 +- sklearn/tests/test_metaestimators.py | 2 + sklearn/utils/estimator_checks.py | 90 ++++++++++++++-------------- 4 files 
changed, 52 insertions(+), 50 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 98c94df5b8410..2ae537bb6cc1b 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -258,8 +258,10 @@ from the dataset if they classified as outliers. Consider the following:: >>> from sklearn.compose import ResampledTrainer >>> from sklearn.covariance import EllipticEnvelope >>> from sklearn.linear_model import LogisticRegression - >>> pipe = ResampledTrainer(EllipticEnvelope(), LogisticRegression()) - >>> pipe.fit(X_train, y_train) + >>> resampled = ResampledTrainer(EllipticEnvelope(), LogisticRegression()) + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> resampled.fit(X, y) In ``pipe``, we remove outliers before fitting our `LogisticRegression` model, so that the samples passed to fit come from the same distribution. We do diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py index 6373cb509b975..cd87b429766e3 100644 --- a/sklearn/compose/_resampled.py +++ b/sklearn/compose/_resampled.py @@ -96,7 +96,7 @@ def score(self, X, y, **kw): @property def fit_transform(self): # check if the estimator has a transform function - transform = self.estimator.transform + self.estimator.transform def fit_transform(X, y, **kwargs): self.fit(X, y, **kwargs) @@ -107,7 +107,7 @@ def fit_transform(X, y, **kwargs): @property def fit_predict(self): - predict = self.estimator.predict + self.estimator.predict def fit_predict(X, y, **kwargs): self.fit(X, y, **kwargs) diff --git a/sklearn/tests/test_metaestimators.py b/sklearn/tests/test_metaestimators.py index 9bb498e12539a..45fd68709a3ab 100644 --- a/sklearn/tests/test_metaestimators.py +++ b/sklearn/tests/test_metaestimators.py @@ -24,11 +24,13 @@ def __init__(self, name, construct, skip_methods=(), self.fit_args = fit_args self.skip_methods = skip_methods + class DummyResampler(BaseEstimator): "does nothing" def fit_resample(self, X, y): return X, y + DELEGATING_METAESTIMATORS = [ DelegatorData('Pipeline', lambda est: Pipeline([('est', est)])), DelegatorData('GridSearchCV', diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 4d151f534edb9..04f815f7a7342 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -12,52 +12,50 @@ from scipy.stats import rankdata import joblib -from sklearn.utils import IS_PYPY -from sklearn.utils.testing import assert_raises, _get_args -from sklearn.utils.testing import assert_raises_regex -from sklearn.utils.testing import assert_raise_message -from sklearn.utils.testing import assert_equal -from sklearn.utils.testing import assert_not_equal -from sklearn.utils.testing import assert_in -from sklearn.utils.testing import assert_array_equal -from sklearn.utils.testing import assert_array_almost_equal -from sklearn.utils.testing import assert_allclose -from sklearn.utils.testing import assert_allclose_dense_sparse -from sklearn.utils.testing import assert_warns_message -from sklearn.utils.testing import set_random_state -from sklearn.utils.testing import assert_greater -from sklearn.utils.testing import assert_greater_equal -from sklearn.utils.testing import SkipTest -from sklearn.utils.testing import ignore_warnings -from sklearn.utils.testing import assert_dict_equal -from sklearn.utils.testing import create_memmap_backed_data -from sklearn.utils import is_scalar_nan -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis -from sklearn.linear_model import 
Ridge - - -from sklearn.base import (clone, ClusterMixin, is_classifier, is_regressor, - _DEFAULT_TAGS, RegressorMixin, is_outlier_detector) - -from sklearn.metrics import accuracy_score, adjusted_rand_score, f1_score - -from sklearn.random_projection import BaseRandomProjection -from sklearn.feature_selection import SelectKBest -from sklearn.pipeline import make_pipeline -from sklearn.exceptions import DataConversionWarning -from sklearn.exceptions import SkipTestWarning -from sklearn.exceptions import NotFittedError -from sklearn.model_selection import train_test_split -from sklearn.model_selection import ShuffleSplit -from sklearn.model_selection._validation import _safe_split -from sklearn.metrics.pairwise import (rbf_kernel, linear_kernel, - pairwise_distances) - -from sklearn.utils import shuffle -from sklearn.utils.validation import has_fit_parameter, _num_samples -from sklearn.preprocessing import StandardScaler -from sklearn.datasets import load_iris, load_boston -from sklearn.datasets import make_blobs, make_classification +from . import IS_PYPY +from .testing import assert_raises, _get_args +from .testing import assert_raises_regex +from .testing import assert_raise_message +from .testing import assert_equal +from .testing import assert_not_equal +from .testing import assert_in +from .testing import assert_array_equal +from .testing import assert_array_almost_equal +from .testing import assert_allclose +from .testing import assert_allclose_dense_sparse +from .testing import assert_warns_message +from .testing import set_random_state +from .testing import assert_greater +from .testing import assert_greater_equal +from .testing import SkipTest +from .testing import ignore_warnings +from .testing import assert_dict_equal +from .testing import create_memmap_backed_data +from . 
import is_scalar_nan +from ..discriminant_analysis import LinearDiscriminantAnalysis +from ..linear_model import Ridge + + +from ..base import (clone, ClusterMixin, is_classifier, is_regressor, + _DEFAULT_TAGS, RegressorMixin, is_outlier_detector) + +from ..metrics import accuracy_score, adjusted_rand_score, f1_score + +from ..random_projection import BaseRandomProjection +from ..feature_selection import SelectKBest +from ..pipeline import make_pipeline +from ..exceptions import DataConversionWarning +from ..exceptions import NotFittedError +from ..exceptions import SkipTestWarning +from ..model_selection import train_test_split +from ..model_selection import ShuffleSplit +from ..model_selection._validation import _safe_split +from ..metrics.pairwise import (rbf_kernel, linear_kernel, pairwise_distances) + +from .import shuffle +from .validation import has_fit_parameter, _num_samples +from ..preprocessing import StandardScaler +from ..datasets import load_iris, load_boston, make_blobs, make_classification BOSTON = None From dc00dff22d22441128385dfad7cf9454e493358b Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Thu, 27 Jun 2019 01:44:04 +0200 Subject: [PATCH 33/46] pep --- sklearn/compose/tests/test_resampled.py | 4 +++- sklearn/utils/estimator_checks.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/compose/tests/test_resampled.py b/sklearn/compose/tests/test_resampled.py index 157a7e3041068..585fbc28c84ae 100644 --- a/sklearn/compose/tests/test_resampled.py +++ b/sklearn/compose/tests/test_resampled.py @@ -49,7 +49,9 @@ def test_correct_halfsampler(): for method in [rt.fit, rt.fit_transform, rt.fit_predict]: method(X, y) - np.testing.assert_array_equal(rt.estimator_.y, np.array([0, 2, 4, 6, 8])) + np.testing.assert_array_equal( + rt.estimator_.y, np.array([0, 2, 4, 6, 8]) + ) def test_pca_outlier_svm(): diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 9b56edd995bbc..a84a07924e3e1 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2528,7 +2528,8 @@ def check_fit_idempotent(name, estimator_orig): atol=max(tol, 1e-9), rtol=max(tol, 1e-7), err_msg="Idempotency check failed for method {}".format(method) ) - + + def check_outlier_rejectors(name, estimator_orig): X, y = make_blobs(random_state=0) outliers = estimator_orig.fit_predict(X, y) == -1 From c9ffe0ca78362554fb3da3e512bfa559d236e826 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Tue, 2 Jul 2019 22:46:34 +0200 Subject: [PATCH 34/46] Add NaNFilter --- doc/modules/compose.rst | 3 ++ sklearn/compose/_resampled.py | 3 +- sklearn/preprocessing/__init__.py | 2 + sklearn/preprocessing/data.py | 67 +++++++++++++++++++++++- sklearn/preprocessing/tests/test_data.py | 50 ++++++++++++++++++ 5 files changed, 123 insertions(+), 2 deletions(-) diff --git a/doc/modules/compose.rst b/doc/modules/compose.rst index 2ae537bb6cc1b..2583b21716525 100644 --- a/doc/modules/compose.rst +++ b/doc/modules/compose.rst @@ -262,6 +262,9 @@ from the dataset if they classified as outliers. Consider the following:: >>> from sklearn.datasets import load_iris >>> X, y = load_iris(return_X_y=True) >>> resampled.fit(X, y) + ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + ResampledTrainer(...) + In ``pipe``, we remove outliers before fitting our `LogisticRegression` model, so that the samples passed to fit come from the same distribution. 
We do diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py index cd87b429766e3..1c13b69a3c08f 100644 --- a/sklearn/compose/_resampled.py +++ b/sklearn/compose/_resampled.py @@ -39,7 +39,8 @@ class ResampledTrainer(MetaEstimatorMixin, BaseEstimator): >>> est = ResampledTrainer(HalfSampler(), LogisticRegression()) >>> X, y = load_iris(return_X_y=True) >>> est.fit(X, y) - ResampledTrainer(estimator=LogisticRegression(), resampler=HalfSampler()) + ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + ResampledTrainer(...) >>> est.predict(X[:2]) array([0, 0]) """ diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 2eb41a66220c7..e03f3d5c6f1e2 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -24,6 +24,7 @@ from .data import power_transform from .data import PowerTransformer from .data import PolynomialFeatures +from .data import NaNFilter from ._encoders import OneHotEncoder from ._encoders import OrdinalEncoder @@ -64,4 +65,5 @@ 'label_binarize', 'quantile_transform', 'power_transform', + 'NaNFilter', ] diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 823eedc8b7dd9..ee4b2dbf266cc 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -20,7 +20,7 @@ from scipy.special import boxcox from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array +from ..utils import check_array, check_X_y, safe_indexing from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, @@ -57,6 +57,7 @@ 'minmax_scale', 'quantile_transform', 'power_transform', + 'NaNFilter', ] @@ -3009,3 +3010,67 @@ def power_transform(X, method='warn', standardize=True, copy=True): method = 'box-cox' pt = PowerTransformer(method=method, standardize=standardize, copy=copy) return pt.fit_transform(X) + +class NaNFilter(BaseEstimator): + """ + A resampler that removes samples containing NaN in X. + + Parameters + ---------- + count : int, optional, default=1 + The number of NaN values a sample may contain before it is filtered + out; samples with `count` or more NaNs are removed. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import NaNFilter + >>> nan = float('nan') + >>> data = [[1, 2], [1, nan], [nan, nan]] + >>> y = [0, 0, 0] + >>> Xr, yr, _ = NaNFilter().fit_resample(data, y) + >>> print(Xr) + [[1. 2.]] + >>> Xr, yr, _ = NaNFilter(count=2).fit_resample(data, y) + >>> print(Xr) + [[ 1. 2.] + [ 1. nan]] + + See also + -------- + :ref:`Imputation ` : removing NaNs by replacing them with values. + """ + def __init__(self, count=1): + self.count = count + + def fit_resample(self, X, y, **kws): + """Removes samples containing NaN from X. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Input data X. + + y : ndarray, shape (n_samples,) + Input data y. + + Returns + ------- + X : ndarray, shape (n_samples, n_features) + The input X, with all samples containing more than `count` NaN + values removed. + + y : ndarray, shape (n_samples,) + The input y, with all samples containing more than `count` NaN + values removed. + + kws : dict of ndarray + dict of keyword arguments, with all samples containing more than + `count` NaN values removed.
+ """ + X, y = check_X_y(X, y, force_all_finite='allow-nan') + mask = np.sum(np.isnan(X), axis=1) < self.count + kwsr = { + kw: safe_indexing(kws[kw], np.where(mask)[0]) + for kw in kws + } + return safe_indexing(X, mask), safe_indexing(y, mask), kwsr diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index b49396c7c0253..e0156ed2252f7 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -53,6 +53,7 @@ from sklearn.preprocessing.data import PowerTransformer from sklearn.preprocessing.data import power_transform from sklearn.preprocessing.data import BOUNDS_THRESHOLD +from sklearn.preprocessing.data import NaNFilter from sklearn.exceptions import NotFittedError from sklearn.base import clone @@ -2466,3 +2467,52 @@ def test_power_transform_default_method(): X_trans_boxcox = power_transform(X, method='box-cox') assert_array_equal(X_trans_boxcox, X_trans_default) + +def test_nanfilter(): + nan = float('nan') + data = [[1, 2], [1, nan], [nan, nan]] + y = [0, 1, 2] + sample_weights = np.array([0.1, 0.4, 0.5]) + other_sample_prop = [0.3, 0.4, 0.5] + Xr, yr, kws = NaNFilter().fit_resample( + data, y) + + assert_array_equal( + Xr, + np.array([[1, 2]]) + ) + assert_array_equal( + yr, + np.array([0]) + ) + + Xr, yr, kws = NaNFilter(count=2).fit_resample( + data, y) + assert_array_equal( + Xr, + np.array([[1, 2], [1, nan]]) + ) + assert_array_equal( + yr, + np.array([0, 1]) + ) + + Xr, yr, kws = NaNFilter().fit_resample( + data, y, + sample_weights=sample_weights, + other_sample_prop=other_sample_prop + ) + + assert_array_equal( + Xr, + np.array([[1, 2]]) + ) + + assert_array_equal( + kws['sample_weights'], + np.array([0.1]) + ) + assert_array_equal( + kws['other_sample_prop'], + np.array([0.3]) + ) From 9cf2a9c2d338794599d0b0cb2a0f10c7ebc1a93a Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 00:39:23 +0200 Subject: [PATCH 35/46] Handle kwargs, go through all estimator_checks and fix failures due to missing fit. --- sklearn/base.py | 20 +- sklearn/neighbors/lof.py | 20 +- sklearn/preprocessing/data.py | 12 +- sklearn/preprocessing/tests/test_data.py | 1 + sklearn/utils/estimator_checks.py | 539 +++++++++++++---------- 5 files changed, 358 insertions(+), 234 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 339c3a6768375..01fdc277f8b2e 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -14,7 +14,7 @@ from . import __version__ from sklearn.utils import _IS_32BIT -from sklearn.utils import safe_indexing +from sklearn.utils import safe_indexing, check_X_y _DEFAULT_TAGS = { 'non_deterministic': False, @@ -610,7 +610,7 @@ class OutlierRejectionMixin: """ _estimator_type = "outlier_rejector" - def fit_resample(self, X, y): + def fit_resample(self, X, y, **kws): """Performs fit on X and returns a new X and y consisting of only the inliers. @@ -629,11 +629,23 @@ def fit_resample(self, X, y): y : ndarray, shape (n_samples,) The original y with outlier samples removed. + + kws : dict of ndarray + dict of keyword arguments, with all outlier samples removed. 
""" + check_X_y(X, y) + # NOTE this is probably not the best way to do this + kws = { + kw: check_X_y(X, kws[kw], force_all_finite='allow-nan')[1] + for kw in kws + } inliers = self.fit_predict(X) == 1 - - return safe_indexing(X, inliers), safe_indexing(y, inliers) + kwsr = { + kw: safe_indexing(kws[kw], inliers) + for kw in kws + } + return safe_indexing(X, inliers), safe_indexing(y, inliers), kwsr class MetaEstimatorMixin: diff --git a/sklearn/neighbors/lof.py b/sklearn/neighbors/lof.py index 89755973357c7..36919d182604c 100644 --- a/sklearn/neighbors/lof.py +++ b/sklearn/neighbors/lof.py @@ -12,7 +12,7 @@ from ..base import OutlierRejectionMixin from ..utils.validation import check_is_fitted -from ..utils import check_array +from ..utils import check_array, safe_indexing, check_X_y __all__ = ["LocalOutlierFactor"] @@ -214,6 +214,9 @@ def fit_resample(self): y : ndarray, shape (n_samples,) The input y with outlier samples removed. + + kws : dict of ndarray + dict of keyword arguments, with all outlier samples removed. """ # fit_resample requires fit_predict if self.novelty: @@ -223,11 +226,18 @@ def fit_resample(self): return self._fit_resample - def _fit_resample(self, X, y=None): - # XXX this is not very clean, is there a better way? + def _fit_resample(self, X, y, **kws): + check_X_y(X, y) + kws = { + kw: check_X_y(X, kws[kw], force_all_finite='allow-nan')[1] + for kw in kws + } inliers = self.fit_predict(X) == 1 - - return X[inliers], y[inliers] + kwsr = { + kw: safe_indexing(kws[kw], inliers) + for kw in kws + } + return safe_indexing(X, inliers), safe_indexing(y, inliers), kwsr def _fit_predict(self, X, y=None): """"Fits the model to the training set X and returns the labels. diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index ee4b2dbf266cc..19e0a5509e4d1 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -3011,6 +3011,7 @@ def power_transform(X, method='warn', standardize=True, copy=True): pt = PowerTransformer(method=method, standardize=standardize, copy=copy) return pt.fit_transform(X) + class NaNFilter(BaseEstimator): """ A resampler that removes samples containing NaN in X. @@ -3068,9 +3069,18 @@ def fit_resample(self, X, y, **kws): `count` NaN values removed. 
""" X, y = check_X_y(X, y, force_all_finite='allow-nan') + + # NOTE this is probably not the best way to do this + kws = { + kw: check_X_y(X, kws[kw], force_all_finite='allow-nan')[1] + for kw in kws + } mask = np.sum(np.isnan(X), axis=1) < self.count kwsr = { - kw: safe_indexing(kws[kw], np.where(mask)[0]) + kw: safe_indexing(kws[kw], mask) for kw in kws } return safe_indexing(X, mask), safe_indexing(y, mask), kwsr + + def _more_tags(self): + return {'allow_nan': True} diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index e0156ed2252f7..5a33396416696 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2468,6 +2468,7 @@ def test_power_transform_default_method(): X_trans_boxcox = power_transform(X, method='box-cox') assert_array_equal(X_trans_boxcox, X_trans_default) + def test_nanfilter(): nan = float('nan') data = [[1, 2], [1, nan], [nan, nan]] diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index a84a07924e3e1..add7d9960552b 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -80,11 +80,12 @@ def _yield_checks(name, estimator): tags = _safe_tags(estimator) yield check_estimators_dtypes yield check_fit_score_takes_y - yield check_sample_weights_pandas_series - yield check_sample_weights_list - yield check_sample_weights_invariance - yield check_estimators_fit_returns_self - yield partial(check_estimators_fit_returns_self, readonly_memmap=True) + if hasattr(estimator, 'fit'): + yield check_sample_weights_pandas_series + yield check_sample_weights_list + yield check_sample_weights_invariance + yield check_estimators_fit_returns_self + yield partial(check_estimators_fit_returns_self, readonly_memmap=True) # Check that all estimator yield informative messages when # trained on empty datasets @@ -93,7 +94,11 @@ def _yield_checks(name, estimator): yield check_dtype_object yield check_estimators_empty_data_messages - if name not in CROSS_DECOMPOSITION: + if (name not in CROSS_DECOMPOSITION + and not hasattr(estimator, 'fit_resample')): + # TODO potentially readd fit_resample tests after SLEP has + # been clarified. + # cross-decomposition's "transform" returns X and Y yield check_pipeline_consistency @@ -107,9 +112,10 @@ def _yield_checks(name, estimator): yield check_estimator_sparse_data - # Test that estimators can be pickled, and once pickled - # give the same answer as before. - yield check_estimators_pickle + if hasattr(estimator, 'fit'): + # Test that estimators can be pickled, and once pickled + # give the same answer as before. 
+ yield check_estimators_pickle def _yield_classifier_checks(name, classifier): @@ -235,6 +241,7 @@ def _yield_resamplers_checks(name, estimator): yield check_resampler_structure yield check_resamplers_have_no_transform yield check_resample_repeated + yield check_fit_resample2d def _yield_all_checks(name, estimator): @@ -270,16 +277,20 @@ def _yield_all_checks(name, estimator): if is_outlier_detector(estimator): for check in _yield_outliers_checks(name, estimator): yield check - yield check_fit2d_predict1d - yield check_methods_subset_invariance + if hasattr(estimator, 'fit'): + yield check_fit2d_predict1d + yield check_methods_subset_invariance yield check_fit2d_1sample yield check_fit2d_1feature yield check_fit1d yield check_get_params_invariance yield check_set_params - yield check_dict_unchanged + + if hasattr(estimator, 'fit'): + yield check_dict_unchanged yield check_dont_overwrite_parameters - yield check_fit_idempotent + if hasattr(estimator, 'fit'): + yield check_fit_idempotent def check_estimator(Estimator): @@ -531,7 +542,10 @@ def check_estimator_sparse_data(name, estimator_orig): # fit and predict try: with ignore_warnings(category=(DeprecationWarning, FutureWarning)): - estimator.fit(X, y) + if hasattr(estimator, "fit"): + estimator.fit(X, y) + if hasattr(estimator, "fit_resample"): + estimator.fit_resample(X, y) if hasattr(estimator, "predict"): pred = estimator.predict(X) if tags['multioutput_only']: @@ -651,50 +665,58 @@ def check_sample_weights_invariance(name, estimator_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig): # check that estimators treat dtype object as numeric if possible - rng = np.random.RandomState(0) - X = pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) - X = X.astype(object) - tags = _safe_tags(estimator_orig) - if tags['binary_only']: - y = (X[:, 0] * 2).astype(np.int) - else: - y = (X[:, 0] * 4).astype(np.int) - estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) - - estimator.fit(X, y) - if hasattr(estimator, "predict"): - estimator.predict(X) + methods = ['fit', 'fit_resample', 'fit_transform'] + methods = filter(lambda method: hasattr(estimator_orig, method), methods) + for method in methods: + rng = np.random.RandomState(0) + X = pairwise_estimator_convert_X(rng.rand(40, 10), estimator_orig) + X = X.astype(object) + tags = _safe_tags(estimator_orig) + if tags['binary_only']: + y = (X[:, 0] * 2).astype(np.int) + else: + y = (X[:, 0] * 4).astype(np.int) + estimator = clone(estimator_orig) + y = enforce_estimator_tags_y(estimator, y) - if hasattr(estimator, "transform"): - estimator.transform(X) + getattr(estimator, method)(X, y) + if hasattr(estimator, "predict"): + estimator.predict(X) - try: - estimator.fit(X, y.astype(object)) - except Exception as e: - if "Unknown label type" not in str(e): - raise + if hasattr(estimator, "transform"): + estimator.transform(X) - if 'string' not in tags['X_types']: - X[0, 0] = {'foo': 'bar'} - msg = "argument must be a string.* number" - assert_raises_regex(TypeError, msg, estimator.fit, X, y) - else: - # Estimators supporting string will not call np.asarray to convert the - # data to numeric and therefore, the error will not be raised. - # Checking for each element dtype in the input array will be costly. - # Refer to #11401 for full discussion. 
- estimator.fit(X, y) + try: + getattr(estimator, method)(X, y.astype(object)) + except Exception as e: + if "Unknown label type" not in str(e): + raise + + if 'string' not in tags['X_types']: + X[0, 0] = {'foo': 'bar'} + msg = "argument must be a string.* number" + assert_raises_regex( + TypeError, msg, getattr(estimator, method), X, y) + else: + # Estimators supporting string will not call np.asarray to convert + # the data to numeric and therefore, the error will not be raised. + # Checking for each element dtype in the input array will be + # costly. + # Refer to #11401 for full discussion. + getattr(estimator, method)(X, y) def check_complex_data(name, estimator_orig): # check that estimators raise an exception on providing complex data - X = np.random.sample(10) + 1j * np.random.sample(10) - X = X.reshape(-1, 1) - y = np.random.sample(10) + 1j * np.random.sample(10) - estimator = clone(estimator_orig) - assert_raises_regex(ValueError, "Complex data not supported", - estimator.fit, X, y) + methods = ['fit', 'fit_resample', 'fit_transform'] + methods = filter(lambda method: hasattr(estimator_orig, method), methods) + for method in methods: + X = np.random.sample(10) + 1j * np.random.sample(10) + X = X.reshape(-1, 1) + y = np.random.sample(10) + 1j * np.random.sample(10) + estimator = clone(estimator_orig) + assert_raises_regex(ValueError, "Complex data not supported", + getattr(estimator, method), X, y) @ignore_warnings @@ -737,63 +759,69 @@ def check_dict_unchanged(name, estimator_orig): 'Estimator changes __dict__ during %s' % method) + def is_public_parameter(attr): return not (attr.startswith('_') or attr.endswith('_')) @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_dont_overwrite_parameters(name, estimator_orig): - # check that fit method only changes or sets private attributes - if hasattr(estimator_orig.__init__, "deprecated_original"): - # to not check deprecated classes - return - estimator = clone(estimator_orig) - rnd = np.random.RandomState(0) - X = 3 * rnd.uniform(size=(20, 3)) - X = pairwise_estimator_convert_X(X, estimator_orig) - y = X[:, 0].astype(np.int) - if _safe_tags(estimator, 'binary_only'): - y[y == 2] = 1 - y = enforce_estimator_tags_y(estimator, y) + # check that fit methods only change or set private attributes + methods = ['fit', 'fit_resample', 'fit_transform', 'fit_predict'] + methods = filter(lambda method: hasattr(estimator_orig, method), methods) + for method in methods: + if hasattr(estimator_orig.__init__, "deprecated_original"): + # to not check deprecated classes + return + estimator = clone(estimator_orig) + rnd = np.random.RandomState(0) + X = 3 * rnd.uniform(size=(20, 3)) + X = pairwise_estimator_convert_X(X, estimator_orig) + y = X[:, 0].astype(np.int) + if _safe_tags(estimator, 'binary_only'): + y[y == 2] = 1 + y = enforce_estimator_tags_y(estimator, y) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = 1 + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + if hasattr(estimator, "n_clusters"): + estimator.n_clusters = 1 - set_random_state(estimator, 1) - dict_before_fit = estimator.__dict__.copy() - estimator.fit(X, y) + set_random_state(estimator, 1) + dict_before_fit = estimator.__dict__.copy() + + method = getattr(estimator, method) + method(X, y) - dict_after_fit = estimator.__dict__ + dict_after_fit = estimator.__dict__ - public_keys_after_fit = [key for key in dict_after_fit.keys() - if is_public_parameter(key)] 
+ public_keys_after_fit = [key for key in dict_after_fit.keys() + if is_public_parameter(key)] - attrs_added_by_fit = [key for key in public_keys_after_fit - if key not in dict_before_fit.keys()] + attrs_added_by_fit = [key for key in public_keys_after_fit + if key not in dict_before_fit.keys()] - # check that fit doesn't add any public attribute - assert not attrs_added_by_fit, ( - 'Estimator adds public attribute(s) during' - ' the fit method.' - ' Estimators are only allowed to add private attributes' - ' either started with _ or ended' - ' with _ but %s added' - % ', '.join(attrs_added_by_fit)) + # check that fit doesn't add any public attribute + assert not attrs_added_by_fit, ( + 'Estimator adds public attribute(s) during' + ' the fit method.' + ' Estimators are only allowed to add private attributes' + ' either started with _ or ended' + ' with _ but %s added' + % ', '.join(attrs_added_by_fit)) - # check that fit doesn't change any public attribute - attrs_changed_by_fit = [key for key in public_keys_after_fit - if (dict_before_fit[key] - is not dict_after_fit[key])] + # check that fit doesn't change any public attribute + attrs_changed_by_fit = [key for key in public_keys_after_fit + if (dict_before_fit[key] + is not dict_after_fit[key])] - assert not attrs_changed_by_fit, ( - 'Estimator changes public attribute(s) during' - ' the fit method. Estimators are only allowed' - ' to change attributes started' - ' or ended with _, but' - ' %s changed' - % ', '.join(attrs_changed_by_fit)) + assert not attrs_changed_by_fit, ( + 'Estimator changes public attribute(s) during' + ' the fit method. Estimators are only allowed' + ' to change attributes started' + ' or ended with _, but' + ' %s changed' + % ', '.join(attrs_changed_by_fit)) @ignore_warnings(category=(DeprecationWarning, FutureWarning)) @@ -826,6 +854,25 @@ def check_fit2d_predict1d(name, estimator_orig): assert_raise_message(ValueError, "Reshape your data", getattr(estimator, method), X[0]) +def check_fit_resample2d(name, estimator_orig): + # check by fit resampling a 2d array + rnd = np.random.RandomState(0) + X = 3 * rnd.uniform(size=(20, 3)) + X = pairwise_estimator_convert_X(X, estimator_orig) + y = X[:, 0].astype(np.int) + tags = _safe_tags(estimator_orig) + if tags['binary_only']: + y[y == 2] = 1 + estimator = clone(estimator_orig) + y = enforce_estimator_tags_y(estimator, y) + + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + if hasattr(estimator, "n_clusters"): + estimator.n_clusters = 1 + + set_random_state(estimator, 1) + X, y, kw = estimator.fit_resample(X, y) def _apply_on_subsets(func, X): # apply function on the whole set and on mini batches @@ -890,87 +937,96 @@ def check_fit2d_1sample(name, estimator_orig): # Check that fitting a 2d array with only one sample either works or # returns an informative message. The error message should either mention # the number of samples or the number of classes. 
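The method-dispatch pattern these rewritten checks share can be sketched in isolation (illustrative only; `call_fit_variant` is not part of the patch):

def call_fit_variant(estimator, X, y):
    # Mirror the filter(...) idiom used by the checks: call whichever
    # fit-like methods the estimator actually implements.
    for method in ['fit', 'fit_resample', 'fit_transform', 'fit_predict']:
        if hasattr(estimator, method):
            getattr(estimator, method)(X, y)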
- rnd = np.random.RandomState(0) - X = 3 * rnd.uniform(size=(1, 10)) - y = X[:, 0].astype(np.int) - estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + methods = ['fit', 'fit_resample', 'fit_transform', 'fit_predict'] + methods = filter(lambda method: hasattr(estimator_orig, method), methods) + for method in methods: + rnd = np.random.RandomState(0) + X = 3 * rnd.uniform(size=(1, 10)) + y = X[:, 0].astype(np.int) + estimator = clone(estimator_orig) + y = enforce_estimator_tags_y(estimator, y) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = 1 + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + if hasattr(estimator, "n_clusters"): + estimator.n_clusters = 1 - set_random_state(estimator, 1) + set_random_state(estimator, 1) - # min_cluster_size cannot be less than the data size for OPTICS. - if name == 'OPTICS': - estimator.set_params(min_samples=1) + # min_cluster_size cannot be less than the data size for OPTICS. + if name == 'OPTICS': + estimator.set_params(min_samples=1) - msgs = ["1 sample", "n_samples = 1", "n_samples=1", "one sample", - "1 class", "one class"] + msgs = ["1 sample", "n_samples = 1", "n_samples=1", "one sample", + "1 class", "one class"] - try: - estimator.fit(X, y) - except ValueError as e: - if all(msg not in repr(e) for msg in msgs): - raise e + try: + getattr(estimator, method)(X, y) + except ValueError as e: + if all(msg not in repr(e) for msg in msgs): + raise e @ignore_warnings def check_fit2d_1feature(name, estimator_orig): # check fitting a 2d array with only 1 feature either works or returns # informative message - rnd = np.random.RandomState(0) - X = 3 * rnd.uniform(size=(10, 1)) - X = pairwise_estimator_convert_X(X, estimator_orig) - y = X[:, 0].astype(np.int) - estimator = clone(estimator_orig) - y = enforce_estimator_tags_y(estimator, y) + methods = ['fit', 'fit_resample', 'fit_transform', 'fit_predict'] + methods = filter(lambda method: hasattr(estimator_orig, method), methods) + for method in methods: + rnd = np.random.RandomState(0) + X = 3 * rnd.uniform(size=(10, 1)) + X = pairwise_estimator_convert_X(X, estimator_orig) + y = X[:, 0].astype(np.int) + estimator = clone(estimator_orig) + y = enforce_estimator_tags_y(estimator, y) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = 1 - # ensure two labels in subsample for RandomizedLogisticRegression - if name == 'RandomizedLogisticRegression': - estimator.sample_fraction = 1 - # ensure non skipped trials for RANSACRegressor - if name == 'RANSACRegressor': - estimator.residual_threshold = 0.5 + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + if hasattr(estimator, "n_clusters"): + estimator.n_clusters = 1 + # ensure two labels in subsample for RandomizedLogisticRegression + if name == 'RandomizedLogisticRegression': + estimator.sample_fraction = 1 + # ensure non skipped trials for RANSACRegressor + if name == 'RANSACRegressor': + estimator.residual_threshold = 0.5 - y = enforce_estimator_tags_y(estimator, y) - set_random_state(estimator, 1) + y = enforce_estimator_tags_y(estimator, y) + set_random_state(estimator, 1) - msgs = ["1 feature(s)", "n_features = 1", "n_features=1"] + msgs = ["1 feature(s)", "n_features = 1", "n_features=1"] - try: - estimator.fit(X, y) - except ValueError as e: - if all(msg not in repr(e) for msg in msgs): - raise e + try: + getattr(estimator, 
method)(X, y) + except ValueError as e: + if all(msg not in repr(e) for msg in msgs): + raise e @ignore_warnings def check_fit1d(name, estimator_orig): # check fitting 1d X array raises a ValueError - rnd = np.random.RandomState(0) - X = 3 * rnd.uniform(size=(20)) - y = X.astype(np.int) - estimator = clone(estimator_orig) - tags = _safe_tags(estimator) - if tags["no_validation"]: - # FIXME this is a bit loose - return - y = enforce_estimator_tags_y(estimator, y) + methods = ['fit', 'fit_resample', 'fit_transform', 'fit_predict'] + methods = filter(lambda method: hasattr(estimator_orig, method), methods) + for method in methods: + rnd = np.random.RandomState(0) + X = 3 * rnd.uniform(size=(20)) + y = X.astype(np.int) + estimator = clone(estimator_orig) + tags = _safe_tags(estimator) + if tags["no_validation"]: + # FIXME this is a bit loose + return + y = enforce_estimator_tags_y(estimator, y) - if hasattr(estimator, "n_components"): - estimator.n_components = 1 - if hasattr(estimator, "n_clusters"): - estimator.n_clusters = 1 + if hasattr(estimator, "n_components"): + estimator.n_components = 1 + if hasattr(estimator, "n_clusters"): + estimator.n_clusters = 1 - set_random_state(estimator, 1) - assert_raises(ValueError, estimator.fit, X, y) + set_random_state(estimator, 1) + assert_raises(ValueError, getattr(estimator, method), X, y) @ignore_warnings(category=(DeprecationWarning, FutureWarning)) @@ -1164,34 +1220,43 @@ def check_estimators_dtypes(name, estimator_orig): for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]: estimator = clone(estimator_orig) set_random_state(estimator, 1) - estimator.fit(X_train, y) + if hasattr(estimator, "fit"): + estimator.fit(X_train, y) for method in methods: if hasattr(estimator, method): getattr(estimator, method)(X_train) + if hasattr(estimator, "fit_resample"): + getattr(estimator, "fit_resample")(X_train, y) + @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_estimators_empty_data_messages(name, estimator_orig): - e = clone(estimator_orig) - set_random_state(e, 1) - - X_zero_samples = np.empty(0).reshape(0, 3) - # The precise message can change depending on whether X or y is - # validated first. Let us test the type of exception only: - with assert_raises(ValueError, msg="The estimator {} does not" - " raise an error when an empty data is used " - "to train. Perhaps use " - "check_array in train.".format(name)): - e.fit(X_zero_samples, []) - - X_zero_features = np.empty(0).reshape(3, 0) - # the following y should be accepted by both classifiers and regressors - # and ignored by unsupervised models - y = enforce_estimator_tags_y(e, np.array([1, 0, 1])) - msg = (r"0 feature\(s\) \(shape=\(\d*, 0\)\) while a minimum of \d* " - "is required.") - assert_raises_regex(ValueError, msg, e.fit, X_zero_features, y) + methods = ['fit', 'fit_resample', 'fit_transform'] + methods = filter(lambda method: hasattr(estimator_orig, method), methods) + for method in methods: + e = clone(estimator_orig) + set_random_state(e, 1) + + X_zero_samples = np.empty(0).reshape(0, 3) + # The precise message can change depending on whether X or y is + # validated first. Let us test the type of exception only: + with assert_raises(ValueError, msg="The estimator {} does not" + " raise an error when an empty data is used " + "to train. 
Perhaps use "
+                        "check_array in train.".format(name)):
+            getattr(e, method)(X_zero_samples, [])
+
+        X_zero_features = np.empty(0).reshape(3, 0)
+        # the following y should be accepted by both classifiers and regressors
+        # and ignored by unsupervised models
+        y = enforce_estimator_tags_y(e, np.array([1, 0, 1]))
+        msg = (r"0 feature\(s\) \(shape=\(\d*, 0\)\) while a minimum of \d* "
+               "is required.")
+        assert_raises_regex(
+            ValueError, msg, getattr(e, method), X_zero_features, y
+        )
 
 
 @ignore_warnings(category=DeprecationWarning)
@@ -1199,7 +1264,7 @@ def check_estimators_nan_inf(name, estimator_orig):
     # Checks that Estimator X's do not contain NaN or inf.
     rnd = np.random.RandomState(0)
     X_train_finite = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)),
-                                                  estimator_orig)
+                                                 estimator_orig)
     X_train_nan = rnd.uniform(size=(10, 3))
     X_train_nan[0, 0] = np.nan
     X_train_inf = rnd.uniform(size=(10, 3))
@@ -1208,31 +1273,51 @@ def check_estimators_nan_inf(name, estimator_orig):
     y[:5] = 0
     y = enforce_estimator_tags_y(estimator_orig, y)
     error_string_fit = "Estimator doesn't check for NaN and inf in fit."
+    error_string_fit_resample = ("Estimator doesn't check for NaN and inf in"
+                                 " fit_resample.")
     error_string_predict = ("Estimator doesn't check for NaN and inf in"
                             " predict.")
     error_string_transform = ("Estimator doesn't check for NaN and inf in"
-                              " transform.")
+                             " transform.")
     for X_train in [X_train_nan, X_train_inf]:
         # catch deprecation warnings
         with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
             estimator = clone(estimator_orig)
             set_random_state(estimator, 1)
-            # try to fit
-            try:
-                estimator.fit(X_train, y)
-            except ValueError as e:
-                if 'inf' not in repr(e) and 'NaN' not in repr(e):
-                    print(error_string_fit, estimator, e)
+
+            if hasattr(estimator, 'fit'):
+                # try to fit
+                try:
+                    estimator.fit(X_train, y)
+                except ValueError as e:
+                    if 'inf' not in repr(e) and 'NaN' not in repr(e):
+                        print(error_string_fit, estimator, e)
+                        traceback.print_exc(file=sys.stdout)
+                        raise e
+                except Exception as exc:
+                    print(error_string_fit, estimator, exc)
                     traceback.print_exc(file=sys.stdout)
-                    raise e
-            except Exception as exc:
-                print(error_string_fit, estimator, exc)
-                traceback.print_exc(file=sys.stdout)
-                raise exc
-            else:
-                raise AssertionError(error_string_fit, estimator)
-            # actually fit
-            estimator.fit(X_train_finite, y)
+                    raise exc
+                else:
+                    raise AssertionError(error_string_fit, estimator)
+                # actually fit
+                estimator.fit(X_train_finite, y)
+
+            # fit_resample
+            if hasattr(estimator, "fit_resample"):
+                try:
+                    estimator.fit_resample(X_train, y)
+                except ValueError as e:
+                    if 'inf' not in repr(e) and 'NaN' not in repr(e):
+                        print(error_string_fit_resample, estimator, e)
+                        traceback.print_exc(file=sys.stdout)
+                        raise e
+                except Exception as exc:
+                    print(error_string_fit_resample, estimator, exc)
+                    traceback.print_exc(file=sys.stdout)
+                    raise exc
+                else:
+                    raise AssertionError(error_string_fit_resample, estimator)
 
             # predict
             if hasattr(estimator, "predict"):
@@ -2037,41 +2121,44 @@ def check_class_weight_balanced_linear_classifier(name, Classifier):
 
 @ignore_warnings(category=(DeprecationWarning, FutureWarning))
 def check_estimators_overwrite_params(name, estimator_orig):
-    if _safe_tags(estimator_orig, 'binary_only'):
-        n_centers = 2
-    else:
-        n_centers = 3
-    X, y = make_blobs(random_state=0, n_samples=9, centers=n_centers)
-    # some want non-negative input
-    X -= X.min()
-    X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel)
-    estimator = clone(estimator_orig)
-    y = 
enforce_estimator_tags_y(estimator, y) + methods = ['fit', 'fit_resample', 'fit_transform'] + methods = filter(lambda method: hasattr(estimator_orig, method), methods) + for method in methods: + if _safe_tags(estimator_orig, 'binary_only'): + n_centers = 2 + else: + n_centers = 3 + X, y = make_blobs(random_state=0, n_samples=9, centers=n_centers) + # some want non-negative input + X -= X.min() + X = pairwise_estimator_convert_X(X, estimator_orig, kernel=rbf_kernel) + estimator = clone(estimator_orig) + y = enforce_estimator_tags_y(estimator, y) - set_random_state(estimator) + set_random_state(estimator) - # Make a physical copy of the original estimator parameters before fitting. - params = estimator.get_params() - original_params = deepcopy(params) + # Make a physical copy of the original estimator parameters before fitting. + params = estimator.get_params() + original_params = deepcopy(params) - # Fit the model - estimator.fit(X, y) + # Fit the model + getattr(estimator, method)(X, y) - # Compare the state of the model parameters with the original parameters - new_params = estimator.get_params() - for param_name, original_value in original_params.items(): - new_value = new_params[param_name] + # Compare the state of the model parameters with the original parameters + new_params = estimator.get_params() + for param_name, original_value in original_params.items(): + new_value = new_params[param_name] - # We should never change or mutate the internal state of input - # parameters by default. To check this we use the joblib.hash function - # that introspects recursively any subobjects to compute a checksum. - # The only exception to this rule of immutable constructor parameters - # is possible RandomState instance but in this check we explicitly - # fixed the random_state params recursively to be integer seeds. - assert_equal(joblib.hash(new_value), joblib.hash(original_value), - "Estimator %s should not change or mutate " - " the parameter %s from %s to %s during fit." - % (name, param_name, original_value, new_value)) + # We should never change or mutate the internal state of input + # parameters by default. To check this we use the joblib.hash function + # that introspects recursively any subobjects to compute a checksum. + # The only exception to this rule of immutable constructor parameters + # is possible RandomState instance but in this check we explicitly + # fixed the random_state params recursively to be integer seeds. + assert_equal(joblib.hash(new_value), joblib.hash(original_value), + "Estimator %s should not change or mutate " + " the parameter %s from %s to %s during fit." 
+ % (name, param_name, original_value, new_value)) def check_no_attributes_set_in_init(name, estimator): @@ -2535,7 +2622,7 @@ def check_outlier_rejectors(name, estimator_orig): outliers = estimator_orig.fit_predict(X, y) == -1 n_outliers = np.sum(outliers) - X_new, y_new = estimator_orig.fit_resample(X, y) + X_new, y_new, kws = estimator_orig.fit_resample(X, y) assert X_new.shape[0] == X.shape[0] - n_outliers assert y_new.shape[0] == y.shape[0] - n_outliers @@ -2543,7 +2630,11 @@ def check_outlier_rejectors(name, estimator_orig): def check_resampler_structure(name, estimator_orig): X, y = make_blobs(n_samples=10) - X_new, y_new = estimator_orig.fit_resample(X, y) + X_new, y_new, kw = estimator_orig.fit_resample(X, y) + + +def check_resample_fails_on_non_matching_shapes(): + pass def check_resample_repeated(name, estimator_orig): @@ -2556,9 +2647,9 @@ def check_resample_repeated(name, estimator_orig): random_state=0) set_random_state(estimator_orig, random_state=0) - X_new, y_new = estimator_orig.fit_resample(X, y) + X_new, y_new, kw = estimator_orig.fit_resample(X, y) set_random_state(estimator_orig, random_state=0) - X_new2, y_new2 = estimator_orig.fit_resample(X, y) + X_new2, y_new2, kw = estimator_orig.fit_resample(X, y) assert_array_equal(X_new, X_new2) assert_array_equal(y_new, y_new2) From 38c5d4dc98b96d2b619c156581bae52533fcc4fd Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 00:43:47 +0200 Subject: [PATCH 36/46] pep --- sklearn/utils/estimator_checks.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index add7d9960552b..293800efe624d 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -759,7 +759,6 @@ def check_dict_unchanged(name, estimator_orig): 'Estimator changes __dict__ during %s' % method) - def is_public_parameter(attr): return not (attr.startswith('_') or attr.endswith('_')) @@ -854,6 +853,7 @@ def check_fit2d_predict1d(name, estimator_orig): assert_raise_message(ValueError, "Reshape your data", getattr(estimator, method), X[0]) + def check_fit_resample2d(name, estimator_orig): # check by fit resampling a 2d array rnd = np.random.RandomState(0) @@ -874,6 +874,7 @@ def check_fit_resample2d(name, estimator_orig): set_random_state(estimator, 1) X, y, kw = estimator.fit_resample(X, y) + def _apply_on_subsets(func, X): # apply function on the whole set and on mini batches result_full = func(X) @@ -1230,7 +1231,6 @@ def check_estimators_dtypes(name, estimator_orig): getattr(estimator, "fit_resample")(X_train, y) - @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_estimators_empty_data_messages(name, estimator_orig): methods = ['fit', 'fit_resample', 'fit_transform'] @@ -2137,24 +2137,27 @@ def check_estimators_overwrite_params(name, estimator_orig): set_random_state(estimator) - # Make a physical copy of the original estimator parameters before fitting. + # Make a physical copy of the original estimator parameters before + # fitting. 
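For context, the immutability property this loop enforces can be sketched standalone (a sketch only; `fit_leaves_params_untouched` is illustrative, not part of the patch):

from copy import deepcopy
import joblib

def fit_leaves_params_untouched(estimator, X, y, method='fit'):
    # joblib.hash recursively checksums arbitrary (nested) objects, so a
    # changed hash means a hyper-parameter was mutated during fitting.
    before = deepcopy(estimator.get_params())
    getattr(estimator, method)(X, y)
    after = estimator.get_params()
    return all(joblib.hash(after[k]) == joblib.hash(v)
               for k, v in before.items())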
params = estimator.get_params() original_params = deepcopy(params) # Fit the model getattr(estimator, method)(X, y) - # Compare the state of the model parameters with the original parameters + # Compare the state of the model parameters with the original + # parameters new_params = estimator.get_params() for param_name, original_value in original_params.items(): new_value = new_params[param_name] # We should never change or mutate the internal state of input - # parameters by default. To check this we use the joblib.hash function - # that introspects recursively any subobjects to compute a checksum. - # The only exception to this rule of immutable constructor parameters - # is possible RandomState instance but in this check we explicitly - # fixed the random_state params recursively to be integer seeds. + # parameters by default. To check this we use the joblib.hash + # function that introspects recursively any subobjects to compute a + # checksum. The only exception to this rule of immutable + # constructor parameters is possible RandomState instance but in + # this check we explicitly fixed the random_state params + # recursively to be integer seeds. assert_equal(joblib.hash(new_value), joblib.hash(original_value), "Estimator %s should not change or mutate " " the parameter %s from %s to %s during fit." From 59bb6c4b589c0b4f366e9d9364cb128b03d8333b Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 00:50:18 +0200 Subject: [PATCH 37/46] Local --- sklearn/utils/estimator_checks.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 293800efe624d..663c6f1f24a8c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -795,10 +795,10 @@ def check_dont_overwrite_parameters(name, estimator_orig): dict_after_fit = estimator.__dict__ public_keys_after_fit = [key for key in dict_after_fit.keys() - if is_public_parameter(key)] + if is_public_parameter(key)] attrs_added_by_fit = [key for key in public_keys_after_fit - if key not in dict_before_fit.keys()] + if key not in dict_before_fit.keys()] # check that fit doesn't add any public attribute assert not attrs_added_by_fit, ( @@ -1243,9 +1243,9 @@ def check_estimators_empty_data_messages(name, estimator_orig): # The precise message can change depending on whether X or y is # validated first. Let us test the type of exception only: with assert_raises(ValueError, msg="The estimator {} does not" - " raise an error when an empty data is used " - "to train. Perhaps use " - "check_array in train.".format(name)): + " raise an error when an empty data is used " + "to train. Perhaps use " + "check_array in train.".format(name)): getattr(e, method)(X_zero_samples, []) X_zero_features = np.empty(0).reshape(3, 0) @@ -1253,7 +1253,7 @@ def check_estimators_empty_data_messages(name, estimator_orig): # and ignored by unsupervised models y = enforce_estimator_tags_y(e, np.array([1, 0, 1])) msg = (r"0 feature\(s\) \(shape=\(\d*, 0\)\) while a minimum of \d* " - "is required.") + "is required.") assert_raises_regex( ValueError, msg, getattr(e, method), X_zero_features, y ) @@ -1264,7 +1264,7 @@ def check_estimators_nan_inf(name, estimator_orig): # Checks that Estimator X's do not contain NaN or inf. 
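Under the convention this check enforces, a resampler's `fit_resample` validates its input exactly like `fit` does; a minimal sketch, assuming validation via the existing `check_X_y` helper:

from sklearn.utils import check_X_y

def fit_resample_sketch(X, y):
    # check_X_y raises ValueError on NaN/inf by default
    # (force_all_finite=True), which is what this check requires.
    X, y = check_X_y(X, y)
    return X, y, {}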
rnd = np.random.RandomState(0) X_train_finite = pairwise_estimator_convert_X(rnd.uniform(size=(10, 3)), - estimator_orig) + estimator_orig) X_train_nan = rnd.uniform(size=(10, 3)) X_train_nan[0, 0] = np.nan X_train_inf = rnd.uniform(size=(10, 3)) @@ -1274,11 +1274,11 @@ def check_estimators_nan_inf(name, estimator_orig): y = enforce_estimator_tags_y(estimator_orig, y) error_string_fit = "Estimator doesn't check for NaN and inf in fit." error_string_fit_resample = ("Estimator doesn't check for NaN and inf in" - " fit_resample.") + " fit_resample.") error_string_predict = ("Estimator doesn't check for NaN and inf in" " predict.") error_string_transform = ("Estimator doesn't check for NaN and inf in" - " transform.") + " transform.") for X_train in [X_train_nan, X_train_inf]: # catch deprecation warnings with ignore_warnings(category=(DeprecationWarning, FutureWarning)): @@ -2158,10 +2158,10 @@ def check_estimators_overwrite_params(name, estimator_orig): # constructor parameters is possible RandomState instance but in # this check we explicitly fixed the random_state params # recursively to be integer seeds. - assert_equal(joblib.hash(new_value), joblib.hash(original_value), - "Estimator %s should not change or mutate " - " the parameter %s from %s to %s during fit." - % (name, param_name, original_value, new_value)) + assert joblib.hash(new_value) == joblib.hash(original_value), ( + "Estimator %s should not change or mutate " + " the parameter %s from %s to %s during fit." + % (name, param_name, original_value, new_value)) def check_no_attributes_set_in_init(name, estimator): From 3ad9ff8cf484bef3ae129684e128316dd461669c Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 10:06:01 +0200 Subject: [PATCH 38/46] Tests for fit are only left out if estimator is a resampler --- sklearn/preprocessing/data.py | 2 +- sklearn/utils/estimator_checks.py | 17 ++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 19e0a5509e4d1..6286c09e6810f 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -3031,7 +3031,7 @@ class NaNFilter(BaseEstimator): >>> Xr, yr, _ = NaNFilter().fit_resample(data)) >>> print(Xr) [[1. 2.]] - >>> Xr, yr, _ = NaNFilter(count=2).fit_resample(data)) + >>> Xr, yr, _ = NaNFilter(count=2).fit_resample(data) >>> print(Xr) [[ 1. 2.] [ 1. nan]] diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 03ce17dfd1abe..78de1744c71a9 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -75,7 +75,7 @@ def _yield_checks(name, estimator): tags = _safe_tags(estimator) yield check_estimators_dtypes yield check_fit_score_takes_y - if hasattr(estimator, 'fit'): + if not hasattr(estimator, 'fit_resample') or hasattr(estimator, 'fit'): yield check_sample_weights_pandas_series yield check_sample_weights_list yield check_sample_weights_invariance @@ -107,7 +107,7 @@ def _yield_checks(name, estimator): yield check_estimator_sparse_data - if hasattr(estimator, 'fit'): + if not hasattr(estimator, 'fit_resample') or hasattr(estimator, 'fit'): # Test that estimators can be pickled, and once pickled # give the same answer as before. 
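The pickling property referred to here can be stated compactly (a sketch, not part of the patch; `survives_pickling` is a hypothetical helper):

import pickle
import numpy as np

def survives_pickling(estimator, X, y):
    # Fit, round-trip through pickle, and compare predictions; the yielded
    # pickling checks assert this for every estimator exposing `fit`.
    estimator.fit(X, y)
    restored = pickle.loads(pickle.dumps(estimator))
    return np.allclose(estimator.predict(X), restored.predict(X))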
yield check_estimators_pickle @@ -272,19 +272,17 @@ def _yield_all_checks(name, estimator): if is_outlier_detector(estimator): for check in _yield_outliers_checks(name, estimator): yield check - if hasattr(estimator, 'fit'): - yield check_fit2d_predict1d - yield check_methods_subset_invariance yield check_fit2d_1sample yield check_fit2d_1feature yield check_fit1d yield check_get_params_invariance + yield check_dont_overwrite_parameters yield check_set_params - if hasattr(estimator, 'fit'): + if not hasattr(estimator, 'fit_resample') or hasattr(estimator, 'fit'): + yield check_fit2d_predict1d + yield check_methods_subset_invariance yield check_dict_unchanged - yield check_dont_overwrite_parameters - if hasattr(estimator, 'fit'): yield check_fit_idempotent @@ -1280,7 +1278,8 @@ def check_estimators_nan_inf(name, estimator_orig): estimator = clone(estimator_orig) set_random_state(estimator, 1) - if hasattr(estimator, 'fit'): + if (not hasattr(estimator, 'fit_resample') + or hasattr(estimator, 'fit')): # try to fit try: estimator.fit(X_train, y) From f4c8b7eb9e245a50c69e3e45ce9a30c692f0e099 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 10:24:54 +0200 Subject: [PATCH 39/46] pep --- sklearn/preprocessing/data.py | 2 +- sklearn/utils/estimator_checks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 6286c09e6810f..56baa8a4d95bb 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -3028,7 +3028,7 @@ class NaNFilter(BaseEstimator): >>> nan = float('nan') >>> data = [[1, 2], [1, nan], [nan, nan]] >>> y = [0, 0, 0] - >>> Xr, yr, _ = NaNFilter().fit_resample(data)) + >>> Xr, yr, _ = NaNFilter().fit_resample(data) >>> print(Xr) [[1. 2.]] >>> Xr, yr, _ = NaNFilter(count=2).fit_resample(data) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 78de1744c71a9..6b47f649bfcd4 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -1279,7 +1279,7 @@ def check_estimators_nan_inf(name, estimator_orig): set_random_state(estimator, 1) if (not hasattr(estimator, 'fit_resample') - or hasattr(estimator, 'fit')): + or hasattr(estimator, 'fit')): # try to fit try: estimator.fit(X_train, y) From 8a3e1f8ca2aca22f021985f814dd1701a5f4111a Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 10:44:37 +0200 Subject: [PATCH 40/46] Docs error --- sklearn/preprocessing/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 56baa8a4d95bb..038058302fd91 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -3028,10 +3028,10 @@ class NaNFilter(BaseEstimator): >>> nan = float('nan') >>> data = [[1, 2], [1, nan], [nan, nan]] >>> y = [0, 0, 0] - >>> Xr, yr, _ = NaNFilter().fit_resample(data) + >>> Xr, yr, _ = NaNFilter().fit_resample(data, y) >>> print(Xr) [[1. 2.]] - >>> Xr, yr, _ = NaNFilter(count=2).fit_resample(data) + >>> Xr, yr, _ = NaNFilter(count=2).fit_resample(data, y) >>> print(Xr) [[ 1. 2.] [ 1. 
nan]] From a38607c9b8fe712ec0e37510e685876f797970b8 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 12:51:16 +0200 Subject: [PATCH 41/46] Add check_X_y_kwargs --- sklearn/base.py | 9 +- sklearn/preprocessing/data.py | 11 +-- sklearn/utils/__init__.py | 4 +- sklearn/utils/estimator_checks.py | 4 + sklearn/utils/validation.py | 138 ++++++++++++++++++++++++++++++ 5 files changed, 149 insertions(+), 17 deletions(-) diff --git a/sklearn/base.py b/sklearn/base.py index 01fdc277f8b2e..ed19804fb0ef0 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -14,7 +14,7 @@ from . import __version__ from sklearn.utils import _IS_32BIT -from sklearn.utils import safe_indexing, check_X_y +from sklearn.utils import safe_indexing, check_X_y_kwargs _DEFAULT_TAGS = { 'non_deterministic': False, @@ -634,12 +634,7 @@ def fit_resample(self, X, y, **kws): dict of keyword arguments, with all outlier samples removed. """ - check_X_y(X, y) - # NOTE this is probably not the best way to do this - kws = { - kw: check_X_y(X, kws[kw], force_all_finite='allow-nan')[1] - for kw in kws - } + check_X_y_kwargs(X, y, kws) inliers = self.fit_predict(X) == 1 kwsr = { kw: safe_indexing(kws[kw], inliers) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 038058302fd91..49836cc65536d 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -20,7 +20,7 @@ from scipy.special import boxcox from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array, check_X_y, safe_indexing +from ..utils import check_array, check_X_y, check_X_y_kwargs, safe_indexing from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, @@ -3038,7 +3038,7 @@ class NaNFilter(BaseEstimator): See also -------- - :ref:`Imputation ` : removing NaNs by replacing them with values. + SimpleImputer : Transformer for completing missing values. """ def __init__(self, count=1): self.count = count @@ -3068,13 +3068,8 @@ def fit_resample(self, X, y, **kws): dict of keyword arguments, with all samples containing more than `count` NaN values removed. """ - X, y = check_X_y(X, y, force_all_finite='allow-nan') + X, y, kws = check_X_y_kwargs(X, y, kws, force_all_finite='allow-nan') - # NOTE this is probably not the best way to do this - kws = { - kw: check_X_y(X, kws[kw], force_all_finite='allow-nan')[1] - for kw in kws - } mask = np.sum(np.isnan(X), axis=1) < self.count kwsr = { kw: safe_indexing(kws[kw], mask) diff --git a/sklearn/utils/__init__.py b/sklearn/utils/__init__.py index f24d61c879135..324ce632cb2be 100644 --- a/sklearn/utils/__init__.py +++ b/sklearn/utils/__init__.py @@ -20,8 +20,8 @@ from .validation import (as_float_array, assert_all_finite, check_random_state, column_or_1d, check_array, - check_consistent_length, check_X_y, indexable, - check_symmetric, check_scalar) + check_consistent_length, check_X_y, check_X_y_kwargs, + indexable, check_symmetric, check_scalar) from .. 
import get_config diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6b47f649bfcd4..7b582e2bfed67 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2630,6 +2630,10 @@ def check_resampler_structure(name, estimator_orig): def check_resample_fails_on_non_matching_shapes(): + # check that resamplers enforce matching shapes between kwargs, X and y + pass + +def check_resample_resamples_kwargs(): pass diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index bb6cf1c8ffe00..1a44d17a7967b 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -596,6 +596,144 @@ def _check_large_sparse(X, accept_large_sparse=False): % indices_datatype) +def check_X_y_kwargs(X, y, kwargs, accept_sparse=False, + accept_large_sparse=True, dtype="numeric", order=None, + copy=False, force_all_finite=True, ensure_2d=True, + allow_nd=False, multi_output=False, ensure_min_samples=1, + ensure_min_features=1, y_numeric=False, + warn_on_dtype=None, estimator=None): + """Input validation for standard estimators. + + Checks X, y and all kwargs for consistent length, enforces X to be 2D and y + and kwargs 1D. By default, X is checked to be non-empty and containing only + finite values. Standard input checks are also applied to y, such as checking + that y does not have np.nan or np.inf targets. For multi-label y, set + multi_output=True to allow 2D and sparse y. If the dtype of X is object, + attempt converting to float, raising on failure. + + Further, kwargs are checked to not have np.nan or np.inf. + + Parameters + ---------- + X : nd-array, list or sparse matrix + Input data. + + y : nd-array, list or sparse matrix + Labels. + + accept_sparse : string, boolean or list of string (default=False) + String[s] representing allowed sparse matrix formats, such as 'csc', + 'csr', etc. If the input is sparse but not in the allowed format, + it will be converted to the first listed format. True allows the input + to be any format. False means that a sparse matrix input will + raise an error. + + accept_large_sparse : bool (default=True) + If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by + accept_sparse, accept_large_sparse will cause it to be accepted only + if its indices are stored with a 32-bit dtype. + + .. versionadded:: 0.20 + + dtype : string, type, list of types or None (default="numeric") + Data type of result. If None, the dtype of the input is preserved. + If "numeric", dtype is preserved unless array.dtype is object. + If dtype is a list of types, conversion on the first type is only + performed if the dtype of the input is not in the list. + + order : 'F', 'C' or None (default=None) + Whether an array will be forced to be fortran or c-style. + + copy : boolean (default=False) + Whether a forced copy will be triggered. If copy=False, a copy might + be triggered by a conversion. + + force_all_finite : boolean or 'allow-nan', (default=True) + Whether to raise an error on np.inf and np.nan in X. This parameter + does not influence whether y can have np.inf or np.nan values. + The possibilities are: + + - True: Force all values of X to be finite. + - False: accept both np.inf and np.nan in X. + - 'allow-nan': accept only np.nan values in X. Values cannot be + infinite. + + .. versionadded:: 0.20 + ``force_all_finite`` accepts the string ``'allow-nan'``. + + ensure_2d : boolean (default=True) + Whether to raise a value error if X is not 2D. 
+ + allow_nd : boolean (default=False) + Whether to allow X.ndim > 2. + + multi_output : boolean (default=False) + Whether to allow 2D y (array or sparse matrix). If false, y will be + validated as a vector. y cannot have np.nan or np.inf values if + multi_output=True. + + ensure_min_samples : int (default=1) + Make sure that X has a minimum number of samples in its first + axis (rows for a 2D array). + + ensure_min_features : int (default=1) + Make sure that the 2D array has some minimum number of features + (columns). The default value of 1 rejects empty datasets. + This check is only enforced when X has effectively 2 dimensions or + is originally 1D and ``ensure_2d`` is True. Setting to 0 disables + this check. + + y_numeric : boolean (default=False) + Whether to ensure that y has a numeric type. If dtype of y is object, + it is converted to float64. Should only be used for regression + algorithms. + + warn_on_dtype : boolean or None, optional (default=None) + Raise DataConversionWarning if the dtype of the input data structure + does not match the requested dtype, causing a memory copy. + + .. deprecated:: 0.21 + ``warn_on_dtype`` is deprecated in version 0.21 and will be + removed in 0.23. + + estimator : str or estimator instance (default=None) + If passed, include the name of the estimator in warning messages. + + Returns + ------- + X_converted : object + The converted and validated X. + + y_converted : object + The converted and validated y. + + kwargs_converted: dict of string -> object + The converted and validated kwargs + """ + X_converted, y_converted = check_X_y( + X, y, + accept_sparse=accept_sparse, accept_large_sparse=accept_large_sparse, + dtype=dtype, order=order, copy=copy, force_all_finite=force_all_finite, + ensure_2d=ensure_2d, allow_nd=allow_nd, multi_output=multi_output, + ensure_min_samples=ensure_min_samples, + ensure_min_features=ensure_min_features, y_numeric=y_numeric, + warn_on_dtype=warn_on_dtype, estimator=estimator + ) + kwargs_converted = { + kw: check_array( + kwargs[kw], force_all_finite=True, dtype="numeric", + ensure_2d=False + ) + for kw in kwargs + } + check_consistent_length( + X_converted, y_converted, *(kwargs_converted[kw] for kw in + kwargs_converted) + ) + + return X_converted, y_converted, kwargs_converted + + def check_X_y(X, y, accept_sparse=False, accept_large_sparse=True, dtype="numeric", order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, From 8a280477931515674e7b65d7b161fe73bf835505 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 13:02:49 +0200 Subject: [PATCH 42/46] Remove deprecated parameter from check_X_y_kwargs --- sklearn/preprocessing/data.py | 2 +- sklearn/utils/estimator_checks.py | 1 + sklearn/utils/validation.py | 25 ++++++------------------- 3 files changed, 8 insertions(+), 20 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 49836cc65536d..3778f7bc4d4f5 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -20,7 +20,7 @@ from scipy.special import boxcox from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array, check_X_y, check_X_y_kwargs, safe_indexing +from ..utils import check_array, check_X_y_kwargs, safe_indexing from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py 
index 7b582e2bfed67..ea3c25396ac0c 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -2633,6 +2633,7 @@ def check_resample_fails_on_non_matching_shapes(): # check that resamplers enforce matching shapes between kwargs, X and y pass + def check_resample_resamples_kwargs(): pass diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py index 1a44d17a7967b..110cc1c6e82ea 100644 --- a/sklearn/utils/validation.py +++ b/sklearn/utils/validation.py @@ -601,15 +601,15 @@ def check_X_y_kwargs(X, y, kwargs, accept_sparse=False, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, multi_output=False, ensure_min_samples=1, ensure_min_features=1, y_numeric=False, - warn_on_dtype=None, estimator=None): + estimator=None): """Input validation for standard estimators. Checks X, y and all kwargs for consistent length, enforces X to be 2D and y and kwargs 1D. By default, X is checked to be non-empty and containing only - finite values. Standard input checks are also applied to y, such as checking - that y does not have np.nan or np.inf targets. For multi-label y, set - multi_output=True to allow 2D and sparse y. If the dtype of X is object, - attempt converting to float, raising on failure. + finite values. Standard input checks are also applied to y, such as + checking that y does not have np.nan or np.inf targets. For multi-label y, + set multi_output=True to allow 2D and sparse y. If the dtype of X is + object, attempt converting to float, raising on failure. Further, kwargs are checked to not have np.nan or np.inf. @@ -633,8 +633,6 @@ def check_X_y_kwargs(X, y, kwargs, accept_sparse=False, accept_sparse, accept_large_sparse will cause it to be accepted only if its indices are stored with a 32-bit dtype. - .. versionadded:: 0.20 - dtype : string, type, list of types or None (default="numeric") Data type of result. If None, the dtype of the input is preserved. If "numeric", dtype is preserved unless array.dtype is object. @@ -658,9 +656,6 @@ def check_X_y_kwargs(X, y, kwargs, accept_sparse=False, - 'allow-nan': accept only np.nan values in X. Values cannot be infinite. - .. versionadded:: 0.20 - ``force_all_finite`` accepts the string ``'allow-nan'``. - ensure_2d : boolean (default=True) Whether to raise a value error if X is not 2D. @@ -688,14 +683,6 @@ def check_X_y_kwargs(X, y, kwargs, accept_sparse=False, it is converted to float64. Should only be used for regression algorithms. - warn_on_dtype : boolean or None, optional (default=None) - Raise DataConversionWarning if the dtype of the input data structure - does not match the requested dtype, causing a memory copy. - - .. deprecated:: 0.21 - ``warn_on_dtype`` is deprecated in version 0.21 and will be - removed in 0.23. - estimator : str or estimator instance (default=None) If passed, include the name of the estimator in warning messages. 
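Assuming `check_X_y_kwargs` behaves as documented above, typical usage inside a resampler looks like this (values here are hypothetical, for illustration only):

import numpy as np
from sklearn.utils import check_X_y_kwargs

X = np.arange(12, dtype=float).reshape(6, 2)
y = np.array([0, 1, 0, 1, 0, 1])
kws = {'sample_weight': np.ones(6)}

# Returns validated copies; any kwarg whose length differs from X and y
# raises ValueError via check_consistent_length.
X, y, kws = check_X_y_kwargs(X, y, kws)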
@@ -717,7 +704,7 @@ def check_X_y_kwargs(X, y, kwargs, accept_sparse=False, ensure_2d=ensure_2d, allow_nd=allow_nd, multi_output=multi_output, ensure_min_samples=ensure_min_samples, ensure_min_features=ensure_min_features, y_numeric=y_numeric, - warn_on_dtype=warn_on_dtype, estimator=estimator + estimator=estimator ) kwargs_converted = { kw: check_array( From d0b2789be5f2786f5583f5dd3c1736922fbb39b7 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 13:05:50 +0200 Subject: [PATCH 43/46] check_X_y_kwargs in fit of `ResampledTrainer` --- sklearn/compose/_resampled.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py index 1c13b69a3c08f..6ec28a1b8b7b1 100644 --- a/sklearn/compose/_resampled.py +++ b/sklearn/compose/_resampled.py @@ -2,7 +2,7 @@ from ..base import BaseEstimator, MetaEstimatorMixin, clone from ..utils.metaestimators import if_delegate_has_method -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, check_X_y_kwargs class ResampledTrainer(MetaEstimatorMixin, BaseEstimator): @@ -54,13 +54,10 @@ def __init__(self, resampler, estimator): # TODO: tags? def fit(self, X, y=None, **kw): + X, y, kw = check_X_y_kwargs(X, y, kw) self.resampler_ = clone(self.resampler) - ret = self.resampler_.fit_resample(X, y, **kw) - if len(ret) == 2: - kw = {} - X, y = ret - else: - X, y, kw = ret + X, y, kw = self.resampler_.fit_resample(X, y, **kw) + self.estimator_ = clone(self.estimator).fit(X, y, **kw) return self From 26e53b9e6c143bf6a686f21abd371abee545e1c8 Mon Sep 17 00:00:00 2001 From: Oliver Rausch Date: Wed, 3 Jul 2019 13:12:32 +0200 Subject: [PATCH 44/46] Check ResampledTrainer correctly resamples kwargs --- sklearn/compose/tests/test_resampled.py | 27 +++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/sklearn/compose/tests/test_resampled.py b/sklearn/compose/tests/test_resampled.py index 585fbc28c84ae..6478e4d8c88b0 100644 --- a/sklearn/compose/tests/test_resampled.py +++ b/sklearn/compose/tests/test_resampled.py @@ -8,25 +8,27 @@ from sklearn.pipeline import Pipeline from sklearn.compose import ResampledTrainer from sklearn.utils.estimator_checks import check_estimator -from sklearn.utils.validation import _num_samples, check_X_y +from sklearn.utils.validation import _num_samples, check_X_y_kwargs class HalfSampler(BaseEstimator): "Train with every second sample" - def fit_resample(self, X, y, **kw): - X, y = check_X_y(X, y, accept_sparse="csr") + def fit_resample(self, X, y, **kws): + X, y, kws = check_X_y_kwargs(X, y, kws, accept_sparse="csr") if _num_samples(X) > 1: - return X[::2], y[::2] - return X, y + return X[::2], y[::2], {kw: kws[kw][::2] for kw in kws} + + return X, y, kws class DataSaver(BaseEstimator): "remembers the data that it was fitted with" - def fit(self, X, y): + def fit(self, X, y, **kws): self.X = X self.y = y + self.kws = kws return self def predict(self, X): @@ -52,6 +54,19 @@ def test_correct_halfsampler(): np.testing.assert_array_equal( rt.estimator_.y, np.array([0, 2, 4, 6, 8]) ) + assert rt.estimator_.kws == {} + method(X, y, sample_weight=np.arange(10, 20), + sample_prop=np.arange(20, 30)) + + np.testing.assert_array_equal( + rt.estimator_.y, np.array([0, 2, 4, 6, 8]) + ) + np.testing.assert_array_equal( + rt.estimator_.kws['sample_weight'], np.array([10, 12, 14, 16, 18]) + ) + np.testing.assert_array_equal( + rt.estimator_.kws['sample_prop'], np.array([20, 22, 24, 26, 28]) + ) 
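End to end, the behaviour pinned down by these assertions can be exercised as below (a sketch reusing the `HalfSampler` toy resampler defined in the test above; `LogisticRegression` stands in for any estimator accepting `sample_weight`):

import numpy as np
from sklearn.compose import ResampledTrainer
from sklearn.linear_model import LogisticRegression

X = np.random.RandomState(0).uniform(size=(10, 2))
y = np.r_[np.zeros(5), np.ones(5)]

rt = ResampledTrainer(HalfSampler(), LogisticRegression())
# Every second sample survives resampling, and sample_weight is resampled
# in lockstep before being forwarded to LogisticRegression.fit.
rt.fit(X, y, sample_weight=np.arange(10.))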
 def test_pca_outlier_svm():

From 87181b97cd65ab34fa9c415de736cc596c65c306 Mon Sep 17 00:00:00 2001
From: Oliver Rausch
Date: Wed, 3 Jul 2019 13:15:10 +0200
Subject: [PATCH 45/46] Remove fit_predict

---
 sklearn/compose/_resampled.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/sklearn/compose/_resampled.py b/sklearn/compose/_resampled.py
index 6ec28a1b8b7b1..97b8a81973bc5 100644
--- a/sklearn/compose/_resampled.py
+++ b/sklearn/compose/_resampled.py
@@ -103,16 +103,6 @@ def fit_transform(X, y, **kwargs):
 
         return fit_transform
 
-    @property
-    def fit_predict(self):
-        self.estimator.predict
-
-        def fit_predict(X, y, **kwargs):
-            self.fit(X, y, **kwargs)
-            return self.estimator_.predict(X)
-
-        return fit_predict
-
     @property
     def _estimator_type(self):
         return self.estimator._estimator_type

From 6220843de8c4545745d4532adb82d16f505b2c63 Mon Sep 17 00:00:00 2001
From: Joel Nothman
Date: Thu, 20 Aug 2020 09:22:49 +1000
Subject: [PATCH 46/46] Some extensions to glossary

---
 doc/glossary.rst | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/doc/glossary.rst b/doc/glossary.rst
index de8c24d4914de..ecad1132f2356 100644
--- a/doc/glossary.rst
+++ b/doc/glossary.rst
@@ -964,7 +964,9 @@ Class APIs and Estimator Types
 
     resampler
     resamplers
-        An estimator supporting :term:`fit_resample`.
+        An estimator supporting :term:`fit_resample`. This can be used in a
+        :class:`ResampledTrainer` to resample, augment or reduce the training
+        dataset passed to another estimator.
 
     vectorizer
     vectorizers
@@ -1236,9 +1238,26 @@ Methods
         this results in :term:`data leakage`.
 
     ``fit_resample``
-        A method on :term:`resamplers` which fits the estimator on a passed
-        dataset, and returns a new dataset. In the new dataset, samples may be
-        removed or added.
+        A method whose presence in an estimator is necessary and sufficient
+        for it to be a :term:`resampler`.
+        When called, it should fit the estimator and return a new
+        dataset. In the new dataset, samples may be removed, added or modified.
+        In contrast to :term:`fit_transform`:
+
+        * X, y, and any other sample-aligned data may be generated;
+        * the samples in the returned dataset need not have any alignment or
+          correspondence to the input dataset.
+
+        This method has the signature ``fit_resample(X, y, **kw)`` and returns
+        a 3-tuple ``X_new, y_new, kw_new`` where ``kw_new`` is a dict mapping
+        names to data-aligned values that should be passed as fit parameters
+        to the subsequent estimator. Any keyword arguments passed in should be
+        resampled and returned; if the resampler is not capable of
+        resampling the keyword arguments, it should raise a TypeError.
+
+        Ordinarily, this method is only called by a :class:`ResampledTrainer`,
+        which acts like a specialised pipeline for cases where the training
+        data should be augmented or resampled.
 
     ``get_feature_names``
         Primarily for :term:`feature extractors`, but also used for other