[WIP] Resamplers #13269

Open: wants to merge 56 commits into base: main

56 commits:
f90766d
Basic OutlierResamplers
orausch Feb 26, 2019
26c4153
Add fit_resample support to pipeline.
orausch Feb 26, 2019
60708f4
Remove resample module
orausch Feb 26, 2019
630ba2b
More tests, more general _iter
orausch Feb 27, 2019
96df7ab
Fix flake
orausch Feb 27, 2019
b9623e4
Merge remote-tracking branch 'upstream/master' into fit_resample
orausch Feb 27, 2019
638d147
Remove some warnings
orausch Feb 27, 2019
ad54774
Remove props code
orausch Feb 27, 2019
4254a35
Add tests to tests_common.py
orausch Feb 28, 2019
561c47a
Changes to fit_resample
orausch Feb 28, 2019
b0907a1
Merge branch 'master' into fit_resample
orausch Feb 28, 2019
33f1fe6
Remove nondeterminism
orausch Feb 28, 2019
2e24420
Merge branch 'fit_resample' of https://github.com/orausch/scikit-lear…
orausch Feb 28, 2019
4c0aea8
Remove `sample_props` and fix pipeline docstrings
orausch Feb 28, 2019
8339128
OVerall fixes
orausch Mar 1, 2019
d9ba54f
Rename OutlierResampler -> OutlierRejection
orausch Mar 1, 2019
75f74fb
Fix failing LOF test
orausch Mar 1, 2019
4d4b946
Address review
orausch Mar 1, 2019
f649300
Glossary entries
orausch Mar 1, 2019
f12664a
Changes
orausch Mar 1, 2019
21769e5
Pack params
orausch Mar 1, 2019
fbb2b36
Fix failing test
orausch Mar 1, 2019
d04d7c7
Docs
orausch Mar 1, 2019
ec2f48a
Add missing import
orausch Mar 2, 2019
61fce47
pep8 and rejector -> detector
orausch Mar 8, 2019
de252ef
Revert pipeline to master
jnothman Jun 25, 2019
704c764
Merge branch 'master' into fit_resample
jnothman Jun 25, 2019
24de6e2
WIP ResampledTransformer
jnothman Jun 25, 2019
f999919
Merge pull request #1 from jnothman/fit_resample
orausch Jun 25, 2019
19023ea
Add missing ResampledTrainer files
jnothman Jun 26, 2019
805ad2d
Merge branch 'master' into fit_resample
jnothman Jun 26, 2019
082560a
predictor -> estimator; _required_parameters to skip common test
jnothman Jun 26, 2019
4fc5830
Fix missing import
jnothman Jun 26, 2019
6a70151
Load data in doctest
jnothman Jun 26, 2019
d450047
Note authorship on ResampledTrainer
jnothman Jun 26, 2019
ada6bad
Doctest outputs
jnothman Jun 26, 2019
e5bcb34
Add ResampledTrainer to metaestimator tests and add test_correct_half…
orausch Jun 26, 2019
d42d318
Merge branch 'master' into fit_resample
orausch Jun 26, 2019
fbdb966
Formatting
orausch Jun 26, 2019
ffbf12b
Merge branch 'fit_resample' of https://github.com/orausch/scikit-lear…
orausch Jun 26, 2019
dc00dff
pep
orausch Jun 26, 2019
c9ffe0c
Add NaNFilter
orausch Jul 2, 2019
9cf2a9c
Handle kwargs, go through all estimator_checks and fix failures due t…
orausch Jul 2, 2019
38c5d4d
pep
orausch Jul 2, 2019
ed0a431
Merge branch 'master' into fit_resample
orausch Jul 2, 2019
59bb6c4
Local
orausch Jul 2, 2019
8e54d2d
Merge branch 'fit_resample' of https://github.com/orausch/scikit-lear…
orausch Jul 2, 2019
3ad9ff8
Tests for fit are only left out if estimator is a resampler
orausch Jul 3, 2019
f4c8b7e
pep
orausch Jul 3, 2019
8a3e1f8
Docs error
orausch Jul 3, 2019
a38607c
Add check_X_y_kwargs
orausch Jul 3, 2019
8a28047
Remove deprecated parameter from check_X_y_kwargs
orausch Jul 3, 2019
d0b2789
check_X_y_kwargs in fit of `ResampledTrainer`
orausch Jul 3, 2019
26e53b9
Check ResampledTrainer correctly resamples kwargs
orausch Jul 3, 2019
87181b9
Remove fit_predict
orausch Jul 3, 2019
6220843
Some extensions to glossary
jnothman Aug 19, 2020
9 changes: 8 additions & 1 deletion doc/developers/contributing.rst
@@ -1108,7 +1108,7 @@ multiple interfaces):

:Transformer:

For filtering or modifying the data, in a supervised or unsupervised
For modifying the data, in a supervised or unsupervised
way, implements::

new_data = transformer.transform(data)
@@ -1118,6 +1118,13 @@ multiple interfaces):

new_data = transformer.fit_transform(data)

:Resampler:

For filtering or augmenting the data, in a supervised or unsupervised
way, implements::

new_X, new_y = resampler.fit_resample(data_X, data_y)

:Model:

A model that can give a `goodness of fit <https://en.wikipedia.org/wiki/Goodness_of_fit>`_
40 changes: 40 additions & 0 deletions doc/glossary.rst
@@ -918,6 +918,19 @@ Class APIs and Estimator Types
outliers have score below 0. :term:`score_samples` may provide an
unnormalized score per sample.

outlier rejector
outlier rejectors
An :term:`outlier detector` which is a resampler. It will remove
outliers from a passed dataset when :term:`fit_resample` is called.

Outlier rejectors must implement:

* :term:`fit_resample`

If the estimator implements :term:`fit_predict` according to the
:class:`OutlierMixin` API, :class:`OutlierRejectionMixin` should be used
to automatically implement correct :term:`fit_resample` behavior.

Review comment (Member): I think we should remove this and just add a note
under outlier detector that if fit_resample is provided, it should act as an
outlier rejector, returning the training data with outliers removed.

predictor
predictors
An :term:`estimator` supporting :term:`predict` and/or
@@ -949,6 +962,12 @@ Class APIs and Estimator Types
A purely :term:`transductive` transformer, such as
:class:`manifold.TSNE`, may not implement ``transform``.

resampler
resamplers
An estimator supporting :term:`fit_resample`. This can be used in a
:class:`ResampledTrainer` to resample, augment or reduce the training
dataset passed to another estimator.

vectorizer
vectorizers
See :term:`feature extractor`.
@@ -1218,6 +1237,27 @@ Methods
(i.e. training and test data together) before further modelling, as
this results in :term:`data leakage`.

``fit_resample``
A method whose presence in an estimator is necessary and sufficient for
it to be a :term:`resampler`.
When called, it should fit the estimator and return a new
dataset. In the new dataset, samples may be removed, added or modified.
In contrast to :term:`fit_transform`:

* X, y, and any other sample-aligned data may be generated;
* the samples in the returned dataset need not have any alignment or
correspondence to the input dataset.

This method has the signature ``fit_resample(X, y, **kw)`` and returns
a 3-tuple ``X_new, y_new, kw_new``, where ``kw_new`` is a dict mapping
names to sample-aligned values that should be passed as fit parameters
to the subsequent estimator. Any keyword arguments passed in should be
resampled and returned; if the resampler is not capable of resampling
the keyword arguments, it should raise a ``TypeError``.

Ordinarily, this method is only called by a :class:`ResampledTrainer`,
which acts like a specialised pipeline for cases where the training data
should be augmented or resampled.
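The contract described above can be sketched with a toy resampler. The class name and the keep-every-second-sample rule below are purely illustrative, not part of scikit-learn; the point is the ``(X_new, y_new, kw_new)`` return shape and the resampling of sample-aligned keyword arguments:

```python
import numpy as np


class EveryOtherSampler:
    """Illustrative resampler: keep every second training sample.

    Follows the fit_resample(X, y, **kw) contract sketched in the glossary:
    it returns a 3-tuple (X_new, y_new, kw_new) in which every
    sample-aligned keyword argument is resampled the same way as X and y.
    """

    def fit_resample(self, X, y, **kw):
        X, y = np.asarray(X), np.asarray(y)
        mask = np.zeros(len(X), dtype=bool)
        mask[::2] = True
        # Resample keyword arguments so they stay aligned with X_new/y_new.
        kw_new = {name: np.asarray(value)[mask] for name, value in kw.items()}
        return X[mask], y[mask], kw_new


X = np.arange(10).reshape(5, 2)
y = np.array([0, 1, 0, 1, 0])
X_new, y_new, kw_new = EveryOtherSampler().fit_resample(
    X, y, sample_weight=np.ones(5))
```

Here ``sample_weight`` comes back with three entries, matching the three retained samples, so it can be forwarded as a fit parameter to the downstream estimator.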

``get_feature_names``
Primarily for :term:`feature extractors`, but also used for other
transformers to provide string names for each column in the output of
3 changes: 3 additions & 0 deletions doc/modules/classes.rst
@@ -30,6 +30,8 @@ Base classes
base.BiclusterMixin
base.ClassifierMixin
base.ClusterMixin
base.OutlierMixin
base.OutlierRejectionMixin
base.DensityMixin
base.RegressorMixin
base.TransformerMixin
@@ -164,6 +166,7 @@ details.
:template: class.rst

compose.ColumnTransformer
compose.ResampledTrainer
compose.TransformedTargetRegressor

.. autosummary::
49 changes: 45 additions & 4 deletions doc/modules/compose.rst
@@ -5,14 +5,16 @@
Pipelines and composite estimators
==================================

Transformers are usually combined with classifiers, regressors or other
Transformers and resamplers are usually combined with classifiers, regressors
or other
estimators to build a composite estimator. The most common tool is a
:ref:`Pipeline <pipeline>`. Pipeline is often used in combination with
:ref:`FeatureUnion <feature_union>` which concatenates the output of
transformers into a composite feature space. :ref:`TransformedTargetRegressor
<transformed_target_regressor>` deals with transforming the :term:`target`
(i.e. log-transform :term:`y`). In contrast, Pipelines only transform the
observed data (:term:`X`).
observed data (:term:`X`). Additionally, :term:`resamplers` can be used to
resample the dataset used for fitting (see :ref:`pipeline_resamplers`).

.. _pipeline:

@@ -236,6 +238,46 @@ object::

* :ref:`sphx_glr_auto_examples_compose_plot_compare_reduction.py`

.. _pipeline_resamplers:

Resampling or modifying samples in training
===========================================

All transformers in a Pipeline must output a dataset with samples corresponding
to their input. Sometimes you want a process to modify the set of samples
used in training, such as balanced resampling, outlier removal, or data
augmentation/perturbation. Such processes are called resamplers, rather than
transformers, in scikit-learn, and should be composed with a predictor using
a :class:`compose.ResampledTrainer` rather than a Pipeline. Resamplers provide
a ``fit_resample`` method which is called by the ``ResampledTrainer`` when
fitting, so that the resampled data is used to train the subsequent predictor.

:term:`Outlier rejectors <outlier rejectors>` provide ``fit_resample`` methods
that remove samples from the dataset if they are classified as outliers.
Consider the following::

>>> from sklearn.compose import ResampledTrainer
>>> from sklearn.covariance import EllipticEnvelope
>>> from sklearn.linear_model import LogisticRegression
>>> resampled = ResampledTrainer(EllipticEnvelope(), LogisticRegression())
>>> from sklearn.datasets import load_iris
>>> X, y = load_iris(return_X_y=True)
>>> resampled.fit(X, y)
... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
ResampledTrainer(...)


In ``resampled``, we remove outliers before fitting our ``LogisticRegression``
model, so that the samples passed to ``fit`` come from the same distribution.
We do this to improve the quality of the fit (see :ref:`outlier_detection`).
Therefore, during ``fit``, we want our resampler to be applied.

Now assume that we would like to make predictions on some new data ``X_test``::

>>> predictions = resampled.predict(X_test)

This does not apply resampling, but provides predictions for all samples in
``X_test``.
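Resampling is not limited to removing samples; a resampler may also add them. The following sketch shows a data-augmentation resampler of the kind a ``ResampledTrainer`` could wrap. The class name and noise level are hypothetical; note that, per the glossary contract, it raises ``TypeError`` for keyword arguments it cannot resample:

```python
import numpy as np


class GaussianJitterAugmenter:
    """Illustrative resampler that appends noisy copies of each sample."""

    def __init__(self, n_copies=1, scale=0.05, random_state=0):
        self.n_copies = n_copies
        self.scale = scale
        self.random_state = random_state

    def fit_resample(self, X, y, **kw):
        if kw:
            # This toy augmenter cannot extend sample-aligned fit
            # parameters, so it refuses them, as the glossary suggests.
            raise TypeError("kwargs are not supported by this resampler")
        rng = np.random.RandomState(self.random_state)
        X, y = np.asarray(X, dtype=float), np.asarray(y)
        copies = [X] + [X + rng.normal(scale=self.scale, size=X.shape)
                        for _ in range(self.n_copies)]
        X_new = np.vstack(copies)
        y_new = np.concatenate([y] * (self.n_copies + 1))
        return X_new, y_new, {}


X = np.zeros((4, 3))
y = np.array([0, 0, 1, 1])
X_new, y_new, _ = GaussianJitterAugmenter(n_copies=2).fit_resample(X, y)
```

With ``n_copies=2``, each of the four original samples yields two jittered copies, so training would see twelve samples while prediction still operates on unmodified inputs.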

.. _transformed_target_regressor:

Transforming target in regression
@@ -327,8 +369,7 @@ is fit to the data independently. The transformers are applied in parallel,
and the feature matrices they output are concatenated side-by-side into a
larger matrix.

When you want to apply different transformations to each field of the data,
see the related class :class:`sklearn.compose.ColumnTransformer`
When you want to apply different transformations to each field of the data, see the related class :class:`sklearn.compose.ColumnTransformer`
(see :ref:`user guide <column_transformer>`).

:class:`FeatureUnion` serves the same purposes as :class:`Pipeline` -
10 changes: 10 additions & 0 deletions doc/modules/outlier_detection.rst
@@ -349,6 +349,16 @@ This strategy is illustrated below.
<http://www.dbs.ifi.lmu.de/Publikationen/Papers/LOF.pdf>`_
Proc. ACM SIGMOD

.. _outlier_rejectors:

Outlier Rejectors
-----------------

All :term:`outlier detectors` can be used as :term:`outlier rejectors`, a form
of :term:`resampler` that takes in a dataset and returns the same dataset with
the outliers removed. This is especially useful when composing a detector with
a predictor in a :class:`compose.ResampledTrainer`. See
:ref:`pipeline_resamplers` and the examples.
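The rejection behaviour amounts to masking on the detector's ``fit_predict`` output under the usual +1 inlier / -1 outlier convention. The sketch below uses a toy z-score detector rather than a scikit-learn class, so the detector, its threshold, and the helper function are all illustrative:

```python
import numpy as np


class ZScoreDetector:
    """Toy outlier detector: fit_predict returns +1 for inliers and -1 for
    outliers, following the scikit-learn outlier-detector convention."""

    def __init__(self, threshold=1.5):
        self.threshold = threshold

    def fit_predict(self, X, y=None):
        X = np.asarray(X, dtype=float)
        z = np.abs(X - X.mean(axis=0)) / X.std(axis=0)
        return np.where((z < self.threshold).all(axis=1), 1, -1)


def reject_outliers(detector, X, y):
    """What an outlier rejector's fit_resample boils down to: keep only
    the samples that fit_predict labels as inliers (+1)."""
    inliers = detector.fit_predict(X) == 1
    return X[inliers], y[inliers]


X = np.array([[0.0], [0.1], [-0.1], [0.05], [100.0]])
y = np.array([0, 0, 1, 1, 0])
X_in, y_in = reject_outliers(ZScoreDetector(), X, y)
```

Here the extreme sample at 100.0 is dropped, and ``y`` is filtered with the same mask so the labels stay aligned with the retained rows.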

.. topic:: Examples:
.. _novelty_with_lof:

Novelty detection with Local Outlier Factor
42 changes: 41 additions & 1 deletion sklearn/base.py
@@ -13,7 +13,8 @@
import numpy as np

from . import __version__
from .utils import _IS_32BIT
from sklearn.utils import _IS_32BIT
from sklearn.utils import safe_indexing, check_X_y_kwargs

_DEFAULT_TAGS = {
'non_deterministic': False,
@@ -603,6 +604,45 @@ def fit_predict(self, X, y=None):
return self.fit(X).predict(X)


class OutlierRejectionMixin:
"""Mixin class for all outlier detection resamplers in scikit-learn. Child
classes remove outliers from the dataset.
"""
_estimator_type = "outlier_rejector"

Review comment (Member): it's problematic to change the type of existing
outlier detectors.

def fit_resample(self, X, y, **kws):

Review comment (Member): I think we should just be adding this to OutlierMixin
rather than adding a new mixin and type.

"""Perform fit on X and return a new X and y consisting of only the
inliers.

Parameters
----------
X : ndarray, shape (n_samples, n_features)
Input data X.
y : ndarray, shape (n_samples,)
Input data y.

Returns
-------
X : ndarray, shape (n_samples, n_features)
The original X with outlier samples removed.
y : ndarray, shape (n_samples,)
The original y with outlier samples removed.
kws : dict of ndarray
Dict of keyword arguments, with all outlier samples removed.
"""

X, y, kws = check_X_y_kwargs(X, y, kws)
inliers = self.fit_predict(X) == 1
kwsr = {
kw: safe_indexing(kws[kw], inliers)
for kw in kws
}
return safe_indexing(X, inliers), safe_indexing(y, inliers), kwsr


class MetaEstimatorMixin:
_required_parameters = ["estimator"]
"""Mixin class for all meta estimators in scikit-learn."""
2 changes: 2 additions & 0 deletions sklearn/compose/__init__.py
@@ -7,10 +7,12 @@

from ._column_transformer import ColumnTransformer, make_column_transformer
from ._target import TransformedTargetRegressor
from ._resampled import ResampledTrainer


__all__ = [
'ColumnTransformer',
'make_column_transformer',
'TransformedTargetRegressor',
'ResampledTrainer',
]
112 changes: 112 additions & 0 deletions sklearn/compose/_resampled.py
@@ -0,0 +1,112 @@
# Author: Joel Nothman

from ..base import BaseEstimator, MetaEstimatorMixin, clone
from ..utils.metaestimators import if_delegate_has_method
from ..utils.validation import check_is_fitted, check_X_y_kwargs


class ResampledTrainer(MetaEstimatorMixin, BaseEstimator):

Review comment (Member): We should get consensus on this name. What's a
better one?

"""Composition of a resampler and an estimator

Read more in the :ref:`User Guide <pipeline_resamplers>`.

Parameters
----------
resampler : Estimator supporting fit_resample

estimator : Estimator

Attributes
----------
resampler_ : Estimator
Fitted clone of `resampler`.

estimator_ : Estimator
Fitted clone of `estimator`.

Examples
--------
>>> from sklearn.base import BaseEstimator
>>> from sklearn.compose import ResampledTrainer
>>> from sklearn.datasets import load_iris
>>> from sklearn.linear_model import LogisticRegression
>>>
>>> class HalfSampler(BaseEstimator):
... "Train with every second sample"
... def fit_resample(self, X, y, **kw):
... return X[::2], y[::2], kw
>>>
>>> est = ResampledTrainer(HalfSampler(), LogisticRegression())
>>> X, y = load_iris(return_X_y=True)
>>> est.fit(X, y)
... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
ResampledTrainer(...)
>>> est.predict(X[:2])
array([0, 0])
"""

def __init__(self, resampler, estimator):
self.resampler = resampler
self.estimator = estimator

_required_parameters = ["resampler", "estimator"]

# TODO: tags?

def fit(self, X, y=None, **kw):

Review comment (Member): TODO: docstrings

X, y, kw = check_X_y_kwargs(X, y, kw)
self.resampler_ = clone(self.resampler)
X, y, kw = self.resampler_.fit_resample(X, y, **kw)

self.estimator_ = clone(self.estimator).fit(X, y, **kw)
return self

@if_delegate_has_method(delegate="estimator")
def predict(self, X, **predict_params):
check_is_fitted(self, "estimator_")
return self.estimator_.predict(X, **predict_params)

@if_delegate_has_method(delegate="estimator")
def transform(self, X):
check_is_fitted(self, "estimator_")
return self.estimator_.transform(X)

@if_delegate_has_method(delegate="estimator")
def predict_proba(self, X):
check_is_fitted(self, "estimator_")
return self.estimator_.predict_proba(X)

@if_delegate_has_method(delegate="estimator")
def predict_log_proba(self, X):
check_is_fitted(self, "estimator_")
return self.estimator_.predict_log_proba(X)

@if_delegate_has_method(delegate="estimator")
def decision_function(self, X):
check_is_fitted(self, "estimator_")
return self.estimator_.decision_function(X)

@if_delegate_has_method(delegate="estimator")
def score(self, X, y, **kw):
check_is_fitted(self, "estimator_")
return self.estimator_.score(X, y, **kw)

@property
def fit_transform(self):
# check if the estimator has a transform function
self.estimator.transform

def fit_transform(X, y, **kwargs):
self.fit(X, y, **kwargs)
# since estimator_ exists now, we can return transform
return self.estimator_.transform(X)

return fit_transform

@property
def _estimator_type(self):
return self.estimator._estimator_type

@property
def classes_(self):
return self.estimator_.classes_
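The ``fit_transform`` property in ``ResampledTrainer`` relies on a small delegation trick: because the property body first touches ``self.estimator.transform``, accessing ``fit_transform`` raises ``AttributeError`` when the wrapped estimator has no ``transform``, so ``hasattr`` duck-typing checks give the right answer. A standalone sketch of the same pattern, with hypothetical class names:

```python
class Wrapped:
    def transform(self, X):
        return [x * 2 for x in X]


class NoTransform:
    pass


class Delegator:
    """Expose fit_transform only when the wrapped object can transform.

    Accessing the property first touches self.wrapped.transform; if that
    attribute is missing, an AttributeError propagates out of the property,
    so hasattr(delegator, "fit_transform") is False.
    """

    def __init__(self, wrapped):
        self.wrapped = wrapped

    @property
    def fit_transform(self):
        self.wrapped.transform  # raises AttributeError when unsupported

        def fit_transform(X, y=None):
            # a real implementation would fit first, then transform
            return self.wrapped.transform(X)

        return fit_transform


has_it = hasattr(Delegator(Wrapped()), "fit_transform")
lacks_it = not hasattr(Delegator(NoTransform()), "fit_transform")
```

Returning a closure from the property keeps the call signature ``fit_transform(X, y)`` while still letting attribute lookup fail early for estimators that cannot transform.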