1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -1104,6 +1104,7 @@ See the :ref:`metrics` section of the user guide for further details.
:template: class.rst

preprocessing.Binarizer
preprocessing.FunctionTransformer
preprocessing.Imputer
preprocessing.KernelCenterer
preprocessing.LabelBinarizer
20 changes: 20 additions & 0 deletions doc/modules/preprocessing.rst
@@ -508,3 +508,23 @@ The features of X have been transformed from :math:`(X_1, X_2, X_3)` to :math:`(
Note that polynomial features are used implicitly in `kernel methods <http://en.wikipedia.org/wiki/Kernel_method>`_ (e.g., :class:`sklearn.svm.SVC`, :class:`sklearn.decomposition.KernelPCA`) when using polynomial :ref:`svm_kernels`.

See :ref:`example_linear_model_plot_polynomial_interpolation.py` for Ridge regression using created polynomial features.

Custom Transformers
===================

Often, you will want to convert an existing Python function into a transformer
to assist in data cleaning or processing. Users may implement a transformer from
an arbitrary function with :class:`FunctionTransformer`. For example, one could
apply a log transformation in a pipeline like::
Member commented:
I think a log1p would make for a more useful example (since it never produces any infinities if fed frequency data). I'll fix that in master.

>>> import numpy as np
>>> from sklearn.preprocessing import FunctionTransformer
>>> transformer = FunctionTransformer(np.log)
>>> X = np.array([[1, 2], [3, 4]])
>>> transformer.transform(X)
array([[ 0.        ,  0.69314718],
       [ 1.09861229,  1.38629436]])

For a full code example that demonstrates using a :class:`FunctionTransformer`
to do column selection,
see :ref:`example_preprocessing_plot_function_transformer.py`
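
A quick aside on the reviewer's log1p suggestion above: a minimal sketch (not part of this diff) of the same example using np.log1p, which maps zero counts to 0 instead of -inf:

>>> import numpy as np
>>> from sklearn.preprocessing import FunctionTransformer
>>> transformer = FunctionTransformer(np.log1p)
>>> X = np.array([[0, 1], [2, 3]])
>>> transformer.transform(X)
array([[ 0.        ,  0.69314718],
       [ 1.09861229,  1.38629436]])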
69 changes: 69 additions & 0 deletions examples/preprocessing/plot_function_transformer.py
@@ -0,0 +1,69 @@
"""
=========================================================
Using FunctionTransformer to select columns
=========================================================

Shows how to use a function transformer in a pipeline. If you know your
dataset's first principal component is irrelevant for a classification task,
you can use the FunctionTransformer to select all but the first column of the
PCA transformed data.
"""
import matplotlib.pyplot as plt
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


def _generate_vector(shift=0.5, noise=15):
    return np.arange(1000) + (np.random.rand(1000) - shift) * noise


def generate_dataset():
    """
    This dataset is two lines with a slope ~ 1, where one has
    a y offset of ~100
    """
    return np.vstack((
        np.vstack((
            _generate_vector(),
            _generate_vector() + 100,
        )).T,
        np.vstack((
            _generate_vector(),
            _generate_vector(),
        )).T,
    )), np.hstack((np.zeros(1000), np.ones(1000)))


def all_but_first_column(X):
    return X[:, 1:]


def drop_first_component(X, y):
    """
    Create a pipeline with PCA and the column selector and use it to
    transform the dataset.
    """
    pipeline = make_pipeline(
        PCA(), FunctionTransformer(all_but_first_column),
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline.fit(X_train, y_train)
    return pipeline.transform(X_test), y_test


if __name__ == '__main__':
    X, y = generate_dataset()
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50)
    plt.show()
    X_transformed, y_transformed = drop_first_component(*generate_dataset())
    plt.scatter(
        X_transformed[:, 0],
        np.zeros(len(X_transformed)),
        c=y_transformed,
        s=50,
    )
    plt.show()
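
As a hedged extension of this example (not part of the PR), the same selection step can feed a classifier directly; LogisticRegression is an illustrative choice here, and the helpers (make_pipeline, PCA, FunctionTransformer, all_but_first_column, train_test_split, generate_dataset) are the ones defined in the script above.

from sklearn.linear_model import LogisticRegression


def classify_without_first_component(X, y):
    # Sketch only: the same PCA + column-selection idea, but ending in a
    # classifier instead of returning the transformed data.
    pipeline = make_pipeline(
        PCA(), FunctionTransformer(all_but_first_column), LogisticRegression(),
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline.fit(X_train, y_train)
    return pipeline.score(X_test, y_test)


# Accuracy on the held-out split of the synthetic dataset, e.g.:
# print(classify_without_first_component(*generate_dataset()))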
4 changes: 4 additions & 0 deletions sklearn/preprocessing/__init__.py
@@ -3,6 +3,8 @@
normalization, binarization and imputation methods.
"""

from .function_transformer import FunctionTransformer

from .data import Binarizer
from .data import KernelCenterer
from .data import MinMaxScaler
@@ -28,8 +30,10 @@

from .imputation import Imputer


__all__ = [
    'Binarizer',
    'FunctionTransformer',
    'Imputer',
    'KernelCenterer',
    'LabelBinarizer',
66 changes: 66 additions & 0 deletions sklearn/preprocessing/function_transformer.py
@@ -0,0 +1,66 @@
from ..base import BaseEstimator, TransformerMixin
from ..utils import check_array


def _identity(X):
    """The identity function.
    """
    return X


class FunctionTransformer(BaseEstimator, TransformerMixin):
    """Constructs a transformer from an arbitrary callable.

    A FunctionTransformer forwards its X (and optionally y) arguments to a
    user-defined function or function object and returns the result of this
    function. This is useful for stateless transformations such as taking the
    log of frequencies, doing custom scaling, etc.

    A FunctionTransformer will not do any checks on its function's output.

    Note: If a lambda is used as the function, then the resulting
    transformer will not be pickleable.

    Parameters
    ----------
    func : callable, optional default=None
        The callable to use for the transformation. This will be passed
        the same arguments as transform, with args and kwargs forwarded.
        If func is None, then func will be the identity function.

    validate : bool, optional default=True
        Indicate that the input X array should be checked before calling
        func. If validate is false, there will be no input validation.
        If it is true, then X will be converted to a 2-dimensional NumPy
        array or sparse matrix. If this conversion is not possible or X
        contains NaN or infinity, an exception is raised.

    accept_sparse : boolean, optional
        Indicate that func accepts a sparse matrix as input. If validate is
        False, this has no effect. Otherwise, if accept_sparse is false,
        sparse matrix inputs will cause an exception to be raised.

    pass_y : bool, optional default=False
        Indicate that transform should forward the y argument to the
        inner callable.

    """
    def __init__(self, func=None, validate=True,
                 accept_sparse=False, pass_y=False):
        self.func = func
        self.validate = validate
        self.accept_sparse = accept_sparse
        self.pass_y = pass_y

    def fit(self, X, y=None):
        if self.validate:
            check_array(X, self.accept_sparse)
        return self

    def transform(self, X, y=None):
        if self.validate:
            X = check_array(X, self.accept_sparse)
        func = self.func if self.func is not None else _identity

        return func(X, *((y,) if self.pass_y else ()))
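
A short usage sketch (not part of the diff) exercising the parameters documented above. scale_by_label_mean is a made-up helper, and the pass_y behaviour follows the class as added in this PR, so the sketch is tied to the code above rather than to any particular later scikit-learn release.

import numpy as np
from scipy import sparse
from sklearn.preprocessing import FunctionTransformer

X = np.arange(6).reshape(3, 2)

# func=None: the identity function is applied, after input validation.
print(FunctionTransformer().transform(X))


# pass_y=True: y is forwarded to the callable as an extra positional argument.
def scale_by_label_mean(X, y):
    return X / np.mean(y)


ft = FunctionTransformer(scale_by_label_mean, pass_y=True)
print(ft.transform(X, np.array([1.0, 2.0, 3.0])))

# accept_sparse=True: validated sparse input is passed through to func as-is.
ft_sparse = FunctionTransformer(lambda X: X * 2, accept_sparse=True)
print(ft_sparse.transform(sparse.csr_matrix(X)).toarray())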
83 changes: 83 additions & 0 deletions sklearn/preprocessing/tests/test_function_transformer.py
@@ -0,0 +1,83 @@
from nose.tools import assert_equal
import numpy as np

from ..function_transformer import FunctionTransformer
Member commented:
Test should use public import (full path). In fact, I think the module should be called _function_transformer so people don't import it directly.

Member Author replied:
good point.


def _make_func(args_store, kwargs_store, func=lambda X, *a, **k: X):
    def _func(X, *args, **kwargs):
        args_store.append(X)
        args_store.extend(args)
        kwargs_store.update(kwargs)
        return func(X)

    return _func


def test_delegate_to_func():
    # (args|kwargs)_store will hold the positional and keyword arguments
    # passed to the function inside the FunctionTransformer.
    args_store = []
    kwargs_store = {}
    X = np.arange(10).reshape((5, 2))
    np.testing.assert_array_equal(
        FunctionTransformer(_make_func(args_store, kwargs_store)).transform(X),
        X,
        'transform should have returned X unchanged',
    )

    # The function should only have received X.
    assert_equal(
        args_store,
        [X],
        'Incorrect positional arguments passed to func: {args}'.format(
            args=args_store,
        ),
    )
    assert_equal(
        kwargs_store,
        {},
        'Unexpected keyword arguments passed to func: {args}'.format(
            args=kwargs_store,
        ),
    )

    # Reset the argument stores.
    args_store[:] = []  # Python 2 compatible in-place list clear.
    kwargs_store.clear()
    y = object()

    np.testing.assert_array_equal(
        FunctionTransformer(
            _make_func(args_store, kwargs_store),
            pass_y=True,
        ).transform(X, y),
        X,
        'transform should have returned X unchanged',
    )

    # The function should have received X and y.
    assert_equal(
        args_store,
        [X, y],
        'Incorrect positional arguments passed to func: {args}'.format(
            args=args_store,
        ),
    )
    assert_equal(
        kwargs_store,
        {},
        'Unexpected keyword arguments passed to func: {args}'.format(
            args=kwargs_store,
        ),
    )


def test_np_log():
    X = np.arange(10).reshape((5, 2))

    # Test that the numpy.log example still works.
    np.testing.assert_array_equal(
        FunctionTransformer(np.log).transform(X),
        np.log(X),
    )
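
For reference, the public import path the reviewer asks for would look like this; a sketch of the requested change, not something present in this diff:

# Instead of the relative import used above:
#     from ..function_transformer import FunctionTransformer
# the test would go through the public package path:
from sklearn.preprocessing import FunctionTransformer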
3 changes: 2 additions & 1 deletion sklearn/utils/estimator_checks.py
@@ -138,7 +138,8 @@ def _yield_transformer_checks(name, Transformer):
                    'PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD']:
        yield check_transformer_data_not_an_array
    # these don't actually fit the data, so don't raise errors
    if name not in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
    if name not in ['AdditiveChi2Sampler', 'Binarizer',
                    'FunctionTransformer', 'Normalizer']:
        # basic tests
        yield check_transformer_general
        yield check_transformers_unfitted
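
The exclusion is reasonable because the transformer is stateless, as the comment above notes; a small illustration (not part of the diff), showing that transform works without a prior fit:

import numpy as np
from sklearn.preprocessing import FunctionTransformer

# No fit() call is needed before transform(): the transformer keeps no state.
print(FunctionTransformer(np.sqrt).transform(np.array([[1.0, 4.0], [9.0, 16.0]])))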