scikit-learn · larsmans · Aug 3, 2015 · Jun 1, 2015 · Jul 30, 2015 · Jun 8, 2015
diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
@@ -1104,6 +1104,7 @@ See the :ref:`metrics` section of the user guide for further details.
    :template: class.rst
 
    preprocessing.Binarizer
+   preprocessing.FunctionTransformer
    preprocessing.Imputer
    preprocessing.KernelCenterer
    preprocessing.LabelBinarizer

diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
@@ -508,3 +508,23 @@ The features of X have been transformed from :math:`(X_1, X_2, X_3)` to :math:`(
 Note that polynomial features are used implicitily in `kernel methods <http://en.wikipedia.org/wiki/Kernel_method>`_ (e.g., :class:`sklearn.svm.SVC`, :class:`sklearn.decomposition.KernelPCA`) when using polynomial :ref:`svm_kernels`.
 
 See :ref:`example_linear_model_plot_polynomial_interpolation.py` for Ridge regression using created polynomial features.
+
+Custom Transformers
+===================
+
+Often, you will want to convert an existing Python function into a transformer
+to assist in data cleaning or processing. Users may implement a transformer from
+an arbitrary function with :class:`FunctionTransformer`. For example, one could
+apply a log transformation in a pipeline like::
+
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import FunctionTransformer
+    >>> transformer = FunctionTransformer(np.log)
+    >>> X = np.array([[1, 2], [3, 4]])
+    >>> transformer.transform(X)
+    array([[ 0.        ,  0.69314718],
+           [ 1.09861229,  1.38629436]])
+
+For a full code example that demonstrates using a :class:`FunctionTransformer`
+to do column selection,
+see :ref:`example_preprocessing_plot_function_transformer.py`.
diff --git a/examples/preprocessing/plot_function_transformer.py b/examples/preprocessing/plot_function_transformer.py
@@ -0,0 +1,69 @@
+"""
+=========================================================
+Using FunctionTransformer to select columns
+=========================================================
+
+Shows how to use a function transformer in a pipeline. If you know your
+dataset's first principal component is irrelevant for a classification task,
+you can use the FunctionTransformer to select all but the first column of the
+PCA transformed data.
+"""
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.cross_validation import train_test_split
+from sklearn.decomposition import PCA
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer
+
+
+def _generate_vector(shift=0.5, noise=15):  # ramp 0..999 plus scaled jitter
+    return np.arange(1000) + (np.random.rand(1000) - shift) * noise
+
+
+def generate_dataset():
+    """
+    Return (X, y): two noisy lines with slope ~ 1, the first (label 0)
+    with a y offset of ~100, the second (label 1) without an offset.
+    """
+    return np.vstack((
+        np.vstack((
+            _generate_vector(),
+            _generate_vector() + 100,
+        )).T,
+        np.vstack((
+            _generate_vector(),
+            _generate_vector(),
+        )).T,
+    )), np.hstack((np.zeros(1000), np.ones(1000)))
+
+
+def all_but_first_column(X):  # keep every column of X except column 0
+    return X[:, 1:]
+
+
+def drop_first_component(X, y):
+    """
+    Fit a PCA + column-selector pipeline on a train split of (X, y) and
+    return the transformed test split together with its labels.
+    """
+    pipeline = make_pipeline(
+        PCA(), FunctionTransformer(all_but_first_column),
+    )
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+    pipeline.fit(X_train, y_train)
+    return pipeline.transform(X_test), y_test
+
+
+if __name__ == '__main__':
+    X, y = generate_dataset()
+    plt.scatter(X[:, 0], X[:, 1], c=y, s=50)
+    plt.show()
+    X_transformed, y_transformed = drop_first_component(*generate_dataset())
+    plt.scatter(
+        X_transformed[:, 0],
+        np.zeros(len(X_transformed)),
+        c=y_transformed,
+        s=50,
+    )
+    plt.show()
diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py
@@ -3,6 +3,8 @@
 normalization, binarization and imputation methods.
 """
 
+from .function_transformer import FunctionTransformer
+
 from .data import Binarizer
 from .data import KernelCenterer
 from .data import MinMaxScaler
@@ -28,8 +30,10 @@
 
 from .imputation import Imputer
 
+
 __all__ = [
     'Binarizer',
+    'FunctionTransformer',
     'Imputer',
     'KernelCenterer',
     'LabelBinarizer',

diff --git a/sklearn/preprocessing/function_transformer.py b/sklearn/preprocessing/function_transformer.py
@@ -0,0 +1,66 @@
+from ..base import BaseEstimator, TransformerMixin
+from ..utils import check_array
+
+
+def _identity(X):
+    """Return X unchanged; used by FunctionTransformer when func is None.
+    """
+    return X
+
+
+class FunctionTransformer(BaseEstimator, TransformerMixin):
+    """Constructs a transformer from an arbitrary callable.
+
+    A FunctionTransformer forwards its X (and optionally y) arguments to a
+    user-defined function or function object and returns the result of this
+    function. This is useful for stateless transformations such as taking the
+    log of frequencies, doing custom scaling, etc.
+
+    A FunctionTransformer will not do any checks on its function's output.
+
+    Note: If a lambda is used as the function, then the resulting
+    transformer will not be pickleable.
+
+    Parameters
+    ----------
+    func : callable, optional default=None
+        The callable to use for the transformation. This will be passed
+        the same arguments as transform, with args and kwargs forwarded.
+        If func is None, then func will be the identity function.
+
+    validate : bool, optional default=True
+        Indicate that the input X array should be checked before calling
+        func. If validate is false, there will be no input validation.
+        If it is true, then X will be converted to a 2-dimensional NumPy
+        array or sparse matrix. If this conversion is not possible or X
+        contains NaN or infinity, an exception is raised.
+
+    accept_sparse : boolean, optional
+        Indicate that func accepts a sparse matrix as input. If validate is
+        False, this has no effect. Otherwise, if accept_sparse is false,
+        sparse matrix inputs will cause an exception to be raised.
+
+    pass_y: bool, optional default=False
+        Indicate that transform should forward the y argument to the
+        inner callable.
+
+    """
+    def __init__(self, func=None, validate=True,
+                 accept_sparse=False, pass_y=False):
+        self.func = func
+        self.validate = validate
+        self.accept_sparse = accept_sparse
+        self.pass_y = pass_y
+
+    def fit(self, X, y=None):
+        if self.validate:
+            check_array(X, self.accept_sparse)
+        return self
+
+    def transform(self, X, y=None):
+        if self.validate:
+            X = check_array(X, self.accept_sparse)
+        func = self.func if self.func is not None else _identity
+
+
+        return func(X, *((y,) if self.pass_y else ()))
diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py
@@ -0,0 +1,83 @@
+from nose.tools import assert_equal
+import numpy as np
+
+from ..function_transformer import FunctionTransformer
+
+
+def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
+    def _func(X, *args, **kwargs):  # record the call, then defer to func
+        args_store.append(X)
+        args_store.extend(args)
+        kwargs_store.update(kwargs)
+        return func(X)  # only X reaches func; extras are recorded above
+
+    return _func
+
+
+def test_delegate_to_func():
+    # (args|kwargs)_store will hold the positional and keyword arguments
+    # passed to the function inside the FunctionTransformer.
+    args_store = []
+    kwargs_store = {}
+    X = np.arange(10).reshape((5, 2))
+    np.testing.assert_array_equal(
+        FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
+        X,
+        'transform should have returned X unchanged',
+    )
+
+    # With the default pass_y=False, the function should only receive X.
+    assert_equal(
+        args_store,
+        [X],
+        'Incorrect positional arguments passed to func: {args}'.format(
+            args=args_store,
+        ),
+    )
+    assert_equal(
+        kwargs_store,
+        {},
+        'Unexpected keyword arguments passed to func: {args}'.format(
+            args=kwargs_store,
+        ),
+    )
+
+    # Reset the argument stores before the pass_y=True scenario.
+    args_store[:] = []  # python2 compatible inplace list clear.
+    kwargs_store.clear()
+    y = object()
+
+    np.testing.assert_array_equal(
+        FunctionTransformer(
+            _make_func(args_store, kwargs_store),
+            pass_y=True,
+        ).transform(X, y),
+        X,
+        'transform should have returned X unchanged',
+    )
+
+    # With pass_y=True, the function should have received X and y.
+    assert_equal(
+        args_store,
+        [X, y],
+        'Incorrect positional arguments passed to func: {args}'.format(
+            args=args_store,
+        ),
+    )
+    assert_equal(
+        kwargs_store,
+        {},
+        'Unexpected keyword arguments passed to func: {args}'.format(
+            args=kwargs_store,
+        ),
+    )
+
+
+def test_np_log():
+    X = np.arange(10).reshape((5, 2))  # NOTE: X has a 0, so log gives -inf
+
+    # Check that FunctionTransformer(np.log) matches calling np.log directly.
+    np.testing.assert_array_equal(
+        FunctionTransformer(np.log).transform(X),
+        np.log(X),
+    )
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -138,7 +138,8 @@ def _yield_transformer_checks(name, Transformer):
                     'PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']:
         yield check_transformer_data_not_an_array
     # these don't actually fit the data, so don't raise errors
-    if name not in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
+    if name not in ['AdditiveChi2Sampler', 'Binarizer',
+                    'FunctionTransformer', 'Normalizer']:
         # basic tests
         yield check_transformer_general
         yield check_transformers_unfitted