diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst
index ff5708efeca7c..b310ba3ea3dbf 100644
--- a/doc/modules/classes.rst
+++ b/doc/modules/classes.rst
@@ -1104,6 +1104,7 @@ See the :ref:`metrics` section of the user guide for further details.
    :template: class.rst
 
    preprocessing.Binarizer
+   preprocessing.FunctionTransformer
    preprocessing.Imputer
    preprocessing.KernelCenterer
    preprocessing.LabelBinarizer
diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst
index 469726dedbea7..842f071185830 100644
--- a/doc/modules/preprocessing.rst
+++ b/doc/modules/preprocessing.rst
@@ -508,3 +508,23 @@ The features of X have been transformed from :math:`(X_1, X_2, X_3)` to :math:`(
 Note that polynomial features are used implicitly in `kernel methods `_ (e.g., :class:`sklearn.svm.SVC`, :class:`sklearn.decomposition.KernelPCA`) when using polynomial :ref:`svm_kernels`.
 
 See :ref:`example_linear_model_plot_polynomial_interpolation.py` for Ridge regression using created polynomial features.
+
+Custom Transformers
+===================
+
+Often, you will want to convert an existing Python function into a transformer
+to assist in data cleaning or processing. You can implement a transformer from
+an arbitrary function with :class:`FunctionTransformer`. For example, one could
+apply a log transformation in a pipeline like::
+
+    >>> import numpy as np
+    >>> from sklearn.preprocessing import FunctionTransformer
+    >>> transformer = FunctionTransformer(np.log)
+    >>> X = np.array([[1, 2], [3, 4]])
+    >>> transformer.transform(X)
+    array([[ 0.        ,  0.69314718],
+           [ 1.09861229,  1.38629436]])
+
+For a full code example that demonstrates using a :class:`FunctionTransformer`
+to do column selection, see
+:ref:`example_preprocessing_plot_function_transformer.py`.
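As an aside for readers of this patch (not part of the diff itself), the
``np.log`` doctest above shows the transformer in isolation; the sketch below
illustrates how it composes with other estimators in a pipeline. The data and
the surrounding estimators are made up for illustration, and ``np.log1p`` is
assumed only because it stays finite at zero::

    import numpy as np

    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import FunctionTransformer, StandardScaler

    # Hypothetical count-style features; log1p keeps the zero entries finite.
    X = np.array([[0, 10], [1, 100], [2, 1000], [3, 10000]])
    y = np.array([0, 0, 1, 1])

    pipe = make_pipeline(
        FunctionTransformer(np.log1p),  # stateless log(1 + x) transform
        StandardScaler(),
        LogisticRegression(),
    )
    pipe.fit(X, y)
    print(pipe.predict(X))

Because the transformer is stateless, ``fit`` only performs the optional input
validation; all of the work happens in ``transform``.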
+ """ + pipeline = make_pipeline( + PCA(), FunctionTransformer(all_but_first_column), + ) + X_train, X_test, y_train, y_test = train_test_split(X, y) + pipeline.fit(X_train, y_train) + return pipeline.transform(X_test), y_test + + +if __name__ == '__main__': + X, y = generate_dataset() + plt.scatter(X[:, 0], X[:, 1], c=y, s=50) + plt.show() + X_transformed, y_transformed = drop_first_component(*generate_dataset()) + plt.scatter( + X_transformed[:, 0], + np.zeros(len(X_transformed)), + c=y_transformed, + s=50, + ) + plt.show() diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 380f5704a963e..2ffc645857c52 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -3,6 +3,8 @@ normalization, binarization and imputation methods. """ +from .function_transformer import FunctionTransformer + from .data import Binarizer from .data import KernelCenterer from .data import MinMaxScaler @@ -28,8 +30,10 @@ from .imputation import Imputer + __all__ = [ 'Binarizer', + 'FunctionTransformer', 'Imputer', 'KernelCenterer', 'LabelBinarizer', diff --git a/sklearn/preprocessing/function_transformer.py b/sklearn/preprocessing/function_transformer.py new file mode 100644 index 0000000000000..c814b14bf377e --- /dev/null +++ b/sklearn/preprocessing/function_transformer.py @@ -0,0 +1,66 @@ +from ..base import BaseEstimator, TransformerMixin +from ..utils import check_array + + +def _identity(X): + """The identity function. + """ + return X + + +class FunctionTransformer(BaseEstimator, TransformerMixin): + """Constructs a transformer from an arbitrary callable. + + A FunctionTransformer forwards its X (and optionally y) arguments to a + user-defined function or function object and returns the result of this + function. This is useful for stateless transformations such as taking the + log of frequencies, doing custom scaling, etc. + + A FunctionTransformer will not do any checks on its function's output. + + Note: If a lambda is used as the function, then the resulting + transformer will not be pickleable. + + Parameters + ---------- + func : callable, optional default=None + The callable to use for the transformation. This will be passed + the same arguments as transform, with args and kwargs forwarded. + If func is None, then func will be the identity function. + + validate : bool, optional default=True + Indicate that the input X array should be checked before calling + func. If validate is false, there will be no input validation. + If it is true, then X will be converted to a 2-dimensional NumPy + array or sparse matrix. If this conversion is not possible or X + contains NaN or infinity, an exception is raised. + + accept_sparse : boolean, optional + Indicate that func accepts a sparse matrix as input. If validate is + False, this has no effect. Otherwise, if accept_sparse is false, + sparse matrix inputs will cause an exception to be raised. + + pass_y: bool, optional default=False + Indicate that transform should forward the y argument to the + inner callable. 
diff --git a/sklearn/preprocessing/tests/test_function_transformer.py b/sklearn/preprocessing/tests/test_function_transformer.py
new file mode 100644
index 0000000000000..e02e7580ce5eb
--- /dev/null
+++ b/sklearn/preprocessing/tests/test_function_transformer.py
@@ -0,0 +1,83 @@
+from nose.tools import assert_equal
+import numpy as np
+
+from ..function_transformer import FunctionTransformer
+
+
+def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
+    def _func(X, *args, **kwargs):
+        args_store.append(X)
+        args_store.extend(args)
+        kwargs_store.update(kwargs)
+        return func(X)
+
+    return _func
+
+
+def test_delegate_to_func():
+    # (args|kwargs)_store will hold the positional and keyword arguments
+    # passed to the function inside the FunctionTransformer.
+    args_store = []
+    kwargs_store = {}
+    X = np.arange(10).reshape((5, 2))
+    np.testing.assert_array_equal(
+        FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
+        X,
+        'transform should have returned X unchanged',
+    )
+
+    # The function should only have received X.
+    assert_equal(
+        args_store,
+        [X],
+        'Incorrect positional arguments passed to func: {args}'.format(
+            args=args_store,
+        ),
+    )
+    assert_equal(
+        kwargs_store,
+        {},
+        'Unexpected keyword arguments passed to func: {args}'.format(
+            args=kwargs_store,
+        ),
+    )
+
+    # Reset the argument stores.
+    args_store[:] = []  # Python 2 compatible in-place list clear.
+    kwargs_store.clear()
+    y = object()
+
+    np.testing.assert_array_equal(
+        FunctionTransformer(
+            _make_func(args_store, kwargs_store),
+            pass_y=True,
+        ).transform(X, y),
+        X,
+        'transform should have returned X unchanged',
+    )
+
+    # The function should have received X and y.
+    assert_equal(
+        args_store,
+        [X, y],
+        'Incorrect positional arguments passed to func: {args}'.format(
+            args=args_store,
+        ),
+    )
+    assert_equal(
+        kwargs_store,
+        {},
+        'Unexpected keyword arguments passed to func: {args}'.format(
+            args=kwargs_store,
+        ),
+    )
+
+
+def test_np_log():
+    X = np.arange(10).reshape((5, 2))
+
+    # Test that the numpy.log example still works.
+    np.testing.assert_array_equal(
+        FunctionTransformer(np.log).transform(X),
+        np.log(X),
+    )
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 63c396203106a..3255d9c58790a 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -138,7 +138,8 @@ def _yield_transformer_checks(name, Transformer):
                     'PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']:
         yield check_transformer_data_not_an_array
     # these don't actually fit the data, so don't raise errors
-    if name not in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
+    if name not in ['AdditiveChi2Sampler', 'Binarizer',
+                    'FunctionTransformer', 'Normalizer']:
         # basic tests
         yield check_transformer_general
         yield check_transformers_unfitted
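To make the docstring's pickling caveat concrete, here is a short sketch (not
part of the patch) assuming the wrapped callable is defined at module level; a
lambda in its place would make ``pickle.dumps`` fail::

    import pickle

    import numpy as np

    from sklearn.preprocessing import FunctionTransformer

    def double(X):
        # A module-level function pickles by reference, unlike a lambda.
        return 2 * X

    transformer = FunctionTransformer(double)
    restored = pickle.loads(pickle.dumps(transformer))

    X = np.arange(4).reshape(2, 2)
    print(np.array_equal(restored.transform(X), 2 * X))  # True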