diff --git a/examples/preprocessing/plot_count_featurizer.py b/examples/preprocessing/plot_count_featurizer.py
new file mode 100644
index 0000000000000..c0ecafd6a8f8d
--- /dev/null
+++ b/examples/preprocessing/plot_count_featurizer.py
@@ -0,0 +1,138 @@
+"""
+=========================================================
+Using CountFeaturizer to featurize frequencies
+=========================================================
+
+Shows how to use CountFeaturizer to transform categorical variables
+into frequency features. CountFeaturizer can often reduce
+training time, classification time, and classification error.
+"""
+from __future__ import print_function
+
+import time
+from collections import OrderedDict
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.datasets import make_classification
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.pipeline import FeatureUnion
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import FunctionTransformer
+from sklearn.preprocessing.data import CountFeaturizer
+from sklearn.preprocessing.data import OneHotEncoder
+
+RANDOM_STATE = 30
+
+n_datapoints = 1000
+n_informative = 30
+n_features = 30
+n_redundant = 0
+
+# Generate a binary classification dataset.
+X, y = make_classification(n_samples=n_datapoints, n_features=n_features,
+                           n_clusters_per_class=1,
+                           n_informative=n_informative,
+                           n_redundant=n_redundant,
+                           random_state=RANDOM_STATE)
+
+# Only the selected features are treated as "categorical".
+discretized_features = [0, 1]
+non_discretized_features = \
+    list(set(range(n_features)) - set(discretized_features))
+# Column indices kept after counting: the non-discrete features plus the
+# first count column appended by CountFeaturizer.
+non_discretized_features_count = \
+    list(set(range(n_features + 1)) - set(discretized_features))
+
+
+def select_non_discrete(X, count=False):
+    """Selects the non-discrete features."""
+    if count:
+        return X[:, non_discretized_features_count]
+    return X[:, non_discretized_features]
+
+
+def select_discrete(X):
+    """Selects the discrete features."""
+    return X[:, discretized_features]
+
+
+def process_discrete(X):
+    """Casts the selected features to non-negative integers so that they
+    can be treated as categorical."""
+    for feature in discretized_features:
+        X_transform_col = (X[:, feature]).astype(int)
+        col_min = min(np.amin(X_transform_col), 0)
+        X[:, feature] = X_transform_col - col_min
+    return X
+
+
+time_start = time.time()
+pipeline_cf = make_pipeline(
+    FunctionTransformer(func=process_discrete),
+    CountFeaturizer(inclusion=discretized_features),
+    FunctionTransformer(func=lambda X: select_non_discrete(X, count=True)))
+X_count = pipeline_cf.fit_transform(X, y=y)
+cf_time_preprocessing = time.time() - time_start
+
+time_start = time.time()
+pipeline_ohe_nd = make_pipeline(FunctionTransformer(func=select_non_discrete))
+pipeline_ohe_d = make_pipeline(
+    FunctionTransformer(func=select_discrete),
+    FunctionTransformer(func=process_discrete),
+    OneHotEncoder())
+pipeline_ohe = FeatureUnion(
+    [("discrete", pipeline_ohe_d), ("nondiscrete", pipeline_ohe_nd)])
+X_one_hot = pipeline_ohe.fit_transform(X, y=y).todense()
+ohe_time_preprocessing = time.time() - time_start
+
+
+def get_classifier():
+    return RandomForestClassifier(warm_start=True, max_features="log2",
+                                  oob_score=True, random_state=RANDOM_STATE)
+
+
+clf = get_classifier()
+labels = ["CountFeaturizer + RandomForestClassifier",
+          "OneHotEncoder + RandomForestClassifier",
+          "Only RandomForestClassifier"]
+error_rate = OrderedDict((label, []) for label in labels)
+
+min_estimators = (15 * n_datapoints // 500)
+max_estimators = (175 * n_datapoints // 500)
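+
+# With warm_start=True, each call to fit() below keeps the trees already
+# grown and only adds new ones, so sweeping n_estimators is much cheaper
+# than refitting from scratch; oob_score=True tracks the out-of-bag error
+# at each ensemble size without needing a held-out set.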
+time_start = time.time()
+
+for i in range(min_estimators, max_estimators + 1):
+    clf.set_params(n_estimators=i)
+    clf.fit(X_count, y)
+    oob_error = 1 - clf.oob_score_
+    error_rate[labels[0]].append((i, oob_error))
+
+print("Time taken on CountFeaturizer: ",
+      (time.time() - time_start + cf_time_preprocessing))
+clf = get_classifier()
+time_start = time.time()
+
+for i in range(min_estimators, max_estimators + 1):
+    clf.set_params(n_estimators=i)
+    clf.fit(X_one_hot, y)
+    oob_error = 1 - clf.oob_score_
+    error_rate[labels[1]].append((i, oob_error))
+
+print("Time taken on OneHotEncoder: ",
+      (time.time() - time_start + ohe_time_preprocessing))
+clf = get_classifier()
+time_start = time.time()
+
+for i in range(min_estimators, max_estimators + 1):
+    clf.set_params(n_estimators=i)
+    clf.fit(X, y)
+    oob_error = 1 - clf.oob_score_
+    error_rate[labels[2]].append((i, oob_error))
+
+print("Time taken on No Encoding: ", (time.time() - time_start))
+
+# Generate the "OOB error rate" vs. "n_estimators" plot.
+for label, clf_err in error_rate.items():
+    xs, ys = zip(*clf_err)
+    plt.plot(xs, ys, label=label)
+
+plt.xlim(min_estimators, max_estimators)
+plt.xlabel("n_estimators")
+plt.ylabel("OOB error rate")
+plt.legend(loc="upper right")
+plt.show()
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index bbd2fae10c0ec..68c4ba146659d 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -9,7 +9,9 @@
 
 from __future__ import division
 
+from collections import defaultdict
 from itertools import chain, combinations
+import functools
 import numbers
 import warnings
 from itertools import combinations_with_replacement as combinations_w_r
@@ -22,6 +24,7 @@
 from ..externals import six
 from ..externals.six import string_types
 from ..utils import check_array
+from ..utils import check_X_y
 from ..utils.extmath import row_norms
 from ..utils.extmath import _incremental_mean_and_var
 from ..utils.fixes import _argmax
@@ -2866,6 +2869,229 @@ def power_transform(X, method='box-cox', standardize=True, copy=True):
     return pt.fit_transform(X)
 
 
+def _get_nested_counter(remaining, y_dim, inclusion_size):
+    """A nested dictionary with 'remaining' layers and a 2D array at the
+    end."""
+    if remaining == 1:
+        return np.zeros((y_dim, inclusion_size))
+    return defaultdict(
+        functools.partial(
+            _get_nested_counter, remaining - 1, y_dim, inclusion_size))
+
+
+class CountFeaturizer(BaseEstimator, TransformerMixin):
+    """Adds a feature representing each feature value's count in training
+
+    Specifically, for each data point 'X_i' in the dataset 'X', this
+    transformer appends a new set of columns 'count_X_i' to the end of
+    'X_i', where 'count_X_i' is the number of occurrences of 'X_i' in the
+    training dataset under the equality criterion given by 'inclusion'.
+
+    If a 'y' argument is given during fit, the counts added during
+    transform are conditional on 'y'. The number of columns added is then
+    the number of distinct values 'y' can take on.
+
+    This preprocessing step is useful when the number of occurrences
+    of a particular piece of data is helpful in computing the prediction.
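+
+    For example, if the row [1, 0, 2] occurs three times in the training
+    data, each occurrence of [1, 0, 2] seen at transform time gains a
+    count feature with the value 3 (split into per-class counts when 'y'
+    is given during fit).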
+
+    Parameters
+    ----------
+    inclusion : 'all', 'each', list, or numpy.ndarray
+        The inclusion criteria for counting
+
+        - 'all' (default) : Every feature is concatenated and counted
+        - 'each' : Each feature has its own set of counts
+        - list of indices : Only the given list of features is
+          concatenated and counted
+        - list of lists of indices : Each list of features in the outer
+          list is concatenated and counted, with its own set of counts
+
+    Attributes
+    ----------
+    count_cache_ : nested defaultdict
+        The per-class counts of each example learned during 'fit'
+
+    classes_ : list of lists of (index, value) tuples
+        For each column of 'y', the enumerated sorted unique values that
+        column can take on
+
+    n_input_features_ : int
+        The number of columns of 'X' seen during 'fit'.
+        Used to check that the 'X' given to 'transform' has a compatible
+        number of columns
+
+    n_output_features_ : int
+        The number of columns of 'y' seen during 'fit'.
+        If no 'y' is given, this is 1 and the counts are not conditional
+        on any label
+
+    Examples
+    --------
+    Given a dataset with one feature and eight samples, we let the
+    transformer count the number of occurrences of each data point in the
+    dataset. Note how the first column duplicates the input data, the
+    second column corresponds to the count of ``y=0`` and the third column
+    corresponds to the count of ``y=1``.
+
+    >>> from sklearn.preprocessing.data import CountFeaturizer
+    >>> X = [[0], [0], [0], [0], [1], [1], [1], [1]]
+    >>> y = [0, 1, 1, 1, 0, 0, 0, 0]
+    >>> cf = CountFeaturizer().fit(X, y)
+    >>> cf.transform(X)  # doctest: +NORMALIZE_WHITESPACE
+    array([[ 0.,  1.,  3.],
+           [ 0.,  1.,  3.],
+           [ 0.,  1.,  3.],
+           [ 0.,  1.,  3.],
+           [ 1.,  4.,  0.],
+           [ 1.,  4.,  0.],
+           [ 1.,  4.,  0.],
+           [ 1.,  4.,  0.]])
+
+    References
+    ----------
+    https://blogs.technet.microsoft.com/machinelearning/2015/02/17/big-learning-made-easy-with-counts/  # noqa
+    https://msdn.microsoft.com/en-us/library/azure/dn913056.aspx
+    """
+    def __init__(self, inclusion='all'):
+        self.inclusion = inclusion
+
+    @staticmethod
+    def _valid_data_type(type_check):
+        return isinstance(type_check, (np.ndarray, list))
+
+    @staticmethod
+    def _check_inclusion(inclusion, n_input_features=1):
+        if inclusion is None:
+            raise ValueError("Inclusion cannot be None")
+        if isinstance(inclusion, string_types) and inclusion == "all":
+            return np.array([range(n_input_features)])
+        elif isinstance(inclusion, string_types) and inclusion == "each":
+            return np.array([[i] for i in range(n_input_features)])
+        elif CountFeaturizer._valid_data_type(inclusion):
+            if len(inclusion) == 0:
+                raise ValueError("Inclusion size must not be 0")
+            if CountFeaturizer._valid_data_type(inclusion[0]):
+                return inclusion
+            else:
+                return [inclusion]
+        else:
+            raise ValueError("Illegal value for inclusion parameter")
+
+    def fit(self, X, y=None):
+        """Fits the CountFeaturizer to X, y
+
+        Stores the counts of each example in X, conditional on y.
+        A 1-dimensional y is reshaped internally to a single column.
+
+        Parameters
+        ----------
+        X : array
+            The data set to learn the counts from, conditional on 'y'.
+            X must not be 1-dimensional
+
+        y : array-like, optional
+            If provided, a separate column is output for each value of 'y',
+            counting the occurrences of 'X' conditioned on that 'y' value
+
+        Returns
+        -------
+        self
+        """
+
+        if y is not None:
+            X, y = check_X_y(X, y, multi_output=True)
+            if len(y.shape) == 1:
+                self.n_output_features_ = 1
+                y = np.reshape(y, (-1, 1))
+            else:
+                self.n_output_features_ = len(y[0])
+        else:
+            X = check_array(X)
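+            # With no 'y' given, a single all-zero pseudo-label is used, so
+            # exactly one (unconditional) count column is produced per
+            # inclusion group.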
+            y = np.zeros((len(X), 1))
+            self.n_output_features_ = 1
+
+        self.n_input_features_ = len(X[0])
+        inclusion_used = \
+            CountFeaturizer._check_inclusion(
+                self.inclusion, n_input_features=self.n_input_features_)
+        len_data = len(X)
+        len_inclusion = len(inclusion_used)
+        self.count_cache_ = \
+            _get_nested_counter(3, self.n_output_features_, len_inclusion)
+        classes_unsorted = [set() for _ in range(self.n_output_features_)]
+
+        for inclusion_i in range(len_inclusion):
+            for i in range(len_data):
+                X_key = tuple(X[i].take(inclusion_used[inclusion_i]))
+                for j in range(self.n_output_features_):
+                    y_key = y[i, j]
+                    self.count_cache_[X_key][y_key][j, inclusion_i] += 1
+                    classes_unsorted[j].add(y_key)
+
+        self.classes_ = \
+            [list(enumerate(sorted(ys))) for ys in classes_unsorted]
+
+        return self
+
+    def transform(self, X):
+        """Transforms X to include the counts learned during 'fit'
+
+        Augments 'X' with new columns containing the counts of each
+        example, conditional on 'y'.
+
+        Parameters
+        ----------
+        X : array
+            The 'X' to augment with count columns.
+            'X' must not be 1-dimensional
+
+        Returns
+        -------
+        transformed : numpy.ndarray
+            The transformed input
+
+        Notes
+        -----
+        The data returned from the transformation will always be a
+        numpy.ndarray
+        """
+
+        check_is_fitted(self, ['count_cache_', 'n_input_features_'])
+
+        X = check_array(X)
+        len_data = len(X)
+        len_classes = 0
+        for ys in self.classes_:
+            len_classes += len(ys)
+
+        num_features = len(X[0])
+        if self.n_input_features_ != num_features:
+            raise ValueError("Dimension mismatch in X during transform")
+        inclusion_used = \
+            CountFeaturizer._check_inclusion(
+                self.inclusion, n_input_features=self.n_input_features_)
+
+        # The number of added columns is the number of unique y values
+        # multiplied by the number of inclusion lists.
+        num_added_cols = len_classes * len(inclusion_used)
+        transformed = np.zeros((len_data, num_features + num_added_cols))
+        transformed[:, :-num_added_cols] = X
+
+        # Count columns are laid out after the original features as one
+        # block of 'len_classes' columns per inclusion group; within a
+        # block, columns follow the (y column, sorted y value) order
+        # recorded in 'classes_'.
+        for inclusion_i in range(len(inclusion_used)):
+            col_offset_y = 0
+            col_offset_inclusion = inclusion_i * len_classes
+            for j in range(self.n_output_features_):
+                for y_ind, y_key in self.classes_[j]:
+                    for i in range(len_data):
+                        X_key = tuple(X[i].take(inclusion_used[inclusion_i]))
+                        transformed[i, num_features + y_ind +
+                                    col_offset_y + col_offset_inclusion] = \
+                            self.count_cache_[X_key][y_key][j, inclusion_i]
+                col_offset_y += len(self.classes_[j])
+
+        return transformed
+
+
 class CategoricalEncoder(BaseEstimator, TransformerMixin):
     """Encode categorical features as a numeric array.
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index f4358e48fc0b8..f88c2871a1e36 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -44,6 +44,7 @@
 from sklearn.preprocessing.data import normalize
 from sklearn.preprocessing.data import OneHotEncoder
 from sklearn.preprocessing.data import CategoricalEncoder
+from sklearn.preprocessing.data import CountFeaturizer
 from sklearn.preprocessing.data import StandardScaler
 from sklearn.preprocessing.data import scale
 from sklearn.preprocessing.data import MinMaxScaler
@@ -2377,3 +2378,190 @@ def test_power_transformer_lambda_zero():
     pt.lambdas_ = np.array([0])
     X_trans = pt.transform(X)
     assert_array_almost_equal(pt.inverse_transform(X_trans), X)
+
+
+def test_count_featurizer():
+    # test CountFeaturizer fit-transform on standard data
+
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2], [1, 0, 2]])
+    cf_1 = CountFeaturizer()
+    assert_array_equal(
+        cf_1.fit_transform(X),
+        np.array([[0, 2, 1, 1], [1, 0, 3, 1], [1, 0, 2, 2], [1, 0, 2, 2]]))
+
+
+def test_count_featurizer_diff_fit():
+    # test a CountFeaturizer fitted on data different from
+    # what it transforms
+
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2], [1, 0, 2]])
+    X2 = np.array([[0, 2, 1], [0, 2, 1], [0, 2, 1], [1, 0, 2]])
+    cf_1 = CountFeaturizer().fit(X)
+    assert_array_equal(
+        cf_1.transform(X2),
+        np.array([[0, 2, 1, 1], [0, 2, 1, 1], [0, 2, 1, 1], [1, 0, 2, 2]]))
+
+
+def test_count_featurizer_diff_fit_y():
+    # test a CountFeaturizer fitted on data different from
+    # what it transforms, with a y passed to fit
+
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2], [1, 0, 2]])
+    X2 = np.array([[0, 2, 1], [0, 2, 1], [0, 2, 1], [1, 0, 2]])
+    y = np.array([0, 0, 1, 0])
+    cf_1 = CountFeaturizer().fit(X, y=y)
+    assert_array_equal(
+        cf_1.transform(X2),
+        np.array([[0, 2, 1, 1, 0], [0, 2, 1, 1, 0],
+                  [0, 2, 1, 1, 0], [1, 0, 2, 1, 1]]))
+
+
+def test_count_featurizer_diff_fit_multi_y():
+    # test a CountFeaturizer fitted on data different from
+    # what it transforms, now with multi-dimensional y
+
+    X = np.array([[0], [0], [1], [1]])
+    X2 = np.array([[1], [1], [0], [0]])
+    y = np.array([[0, 1], [0, 1], [0, 0], [0, 1]])
+    cf = CountFeaturizer().fit(X, y=y)
+    assert_array_equal(
+        cf.transform(X2),
+        np.array([[1, 2, 1, 1], [1, 2, 1, 1],
+                  [0, 2, 0, 2], [0, 2, 0, 2]]))
+
+    X2 = np.array([[1], [1], [1], [1]])
+    cf = CountFeaturizer().fit(X, y=y)
+    assert_array_equal(
+        cf.transform(X2),
+        np.array([[1, 2, 1, 1], [1, 2, 1, 1],
+                  [1, 2, 1, 1], [1, 2, 1, 1]]))
+
+
+def test_count_featurizer_inclusion():
+    # test with the inclusion parameter set
+
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2], [1, 0, 2]])
+    X2 = np.array([[0, 2, 1], [0, 2, 3], [1, 0, 5], [1, 1, 5]])
+    cf_inclusion_1 = CountFeaturizer(inclusion=[0])
+    cf_inclusion_2 = CountFeaturizer(inclusion=[0, 1, 2])
+    cf_inclusion_3 = CountFeaturizer(inclusion=[0, 1])
+    assert_array_equal(
+        cf_inclusion_1.fit_transform(X),
+        np.array([[0, 2, 1, 1], [1, 0, 3, 3], [1, 0, 2, 3], [1, 0, 2, 3]]))
+
+    assert_array_equal(
+        cf_inclusion_2.fit_transform(X),
+        np.array([[0, 2, 1, 1], [1, 0, 3, 1], [1, 0, 2, 2], [1, 0, 2, 2]]))
+
+    assert_array_equal(
+        cf_inclusion_3.fit_transform(X2),
+        np.array([[0, 2, 1, 2], [0, 2, 3, 2], [1, 0, 5, 1], [1, 1, 5, 1]]))
+
+
+def test_count_featurizer_y():
+    # test with the y parameter set
+
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2], [1, 0, 2]])
+    y = np.array([0, 0, 1, 0])
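+    # Two count columns are appended: how often each row occurs among
+    # y == 0 samples and among y == 1 samples, with the y values
+    # enumerated in sorted order.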
+    cf_1 = CountFeaturizer()
+    assert_array_equal(
+        cf_1.fit_transform(X, y=y),
+        np.array([[0, 2, 1, 1, 0], [1, 0, 3, 1, 0],
+                  [1, 0, 2, 1, 1], [1, 0, 2, 1, 1]]))
+
+
+def test_count_featurizer_y_inclusion():
+    # test with both the inclusion and y parameters set
+
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2], [1, 0, 2]])
+    y = np.array([0, 0, 1, 0])
+    cf_inclusion_1 = CountFeaturizer(inclusion=[0])
+    cf_inclusion_2 = CountFeaturizer(inclusion=[0, 1, 2])
+    assert_array_equal(
+        cf_inclusion_1.fit_transform(X, y=y),
+        np.array([[0, 2, 1, 1, 0], [1, 0, 3, 2, 1],
+                  [1, 0, 2, 2, 1], [1, 0, 2, 2, 1]]))
+
+    assert_array_equal(
+        cf_inclusion_2.fit_transform(X, y=y),
+        np.array([[0, 2, 1, 1, 0], [1, 0, 3, 1, 0],
+                  [1, 0, 2, 1, 1], [1, 0, 2, 1, 1]]))
+
+
+def test_count_featurizer_multi_y():
+    # test with multi-dimensional y parameter
+
+    X = np.array([[0], [0], [1], [1]])
+    y = np.array([[0, 1], [0, 1], [0, 0], [0, 1]])
+    cf = CountFeaturizer()
+    assert_array_equal(
+        cf.fit_transform(X, y=y),
+        np.array([[0, 2, 0, 2], [0, 2, 0, 2],
+                  [1, 2, 1, 1], [1, 2, 1, 1]]))
+
+
+def test_count_featurizer_multi_inclusion():
+    # test with 2D inclusion
+
+    X = np.array([[0, 0], [0, 0], [1, 0], [1, 0]])
+    y = np.array([[0], [0], [0], [1]])
+    cf = CountFeaturizer(inclusion=np.array([[0], [1]]))
+    assert_array_equal(
+        cf.fit_transform(X, y=y),
+        np.array([[0, 0, 2, 0, 3, 1], [0, 0, 2, 0, 3, 1],
+                  [1, 0, 1, 1, 3, 1], [1, 0, 1, 1, 3, 1]]))
+
+    # 'each' should behave the same as [[0], [1]]
+    cf = CountFeaturizer(inclusion="each")
+    assert_array_equal(
+        cf.fit_transform(X, y=y),
+        np.array([[0, 0, 2, 0, 3, 1], [0, 0, 2, 0, 3, 1],
+                  [1, 0, 1, 1, 3, 1], [1, 0, 1, 1, 3, 1]]))
+
+    cf = CountFeaturizer(inclusion=[[0]])
+    assert_array_equal(
+        cf.fit_transform(X, y=y),
+        np.array([[0, 0, 2, 0], [0, 0, 2, 0],
+                  [1, 0, 1, 1], [1, 0, 1, 1]]))
+
+    cf = CountFeaturizer(inclusion="each")
+    y = np.array([[0, 0], [0, 1], [0, 2], [1, 3]])
+    assert_array_equal(
+        cf.fit_transform(X, y=y),
+        np.array([[0, 0, 2, 0, 1, 1, 0, 0, 3, 1, 1, 1, 1, 1],
+                  [0, 0, 2, 0, 1, 1, 0, 0, 3, 1, 1, 1, 1, 1],
+                  [1, 0, 1, 1, 0, 0, 1, 1, 3, 1, 1, 1, 1, 1],
+                  [1, 0, 1, 1, 0, 0, 1, 1, 3, 1, 1, 1, 1, 1]]))
+
+
+def test_count_featurizer_none_inclusion():
+    # test with None as the inclusion criterion; should raise an error
+
+    cf = CountFeaturizer(inclusion=None)
+    none_inclusion_msg = "Inclusion cannot be None"
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2], [1, 0, 2]])
+    assert_raise_message(ValueError, none_inclusion_msg,
+                         cf.fit_transform, X)
+
+
+def test_count_featurizer_empty_inclusion():
+    # test with an empty inclusion criterion; should raise an error
+
+    cf = CountFeaturizer(inclusion=[])
+    zero_inclusion_msg = "Inclusion size must not be 0"
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2], [1, 0, 2]])
+    assert_raise_message(ValueError, zero_inclusion_msg,
+                         cf.fit_transform, X)
+
+
+def test_count_featurizer_illegal_inclusion():
+    # test with illegal inclusion criteria; should raise an error
+
+    cf = CountFeaturizer(inclusion="foo bar baz")
+    cf2 = CountFeaturizer(inclusion=0)
+    illegal_inclusion_msg = "Illegal value for inclusion parameter"
+    X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2], [1, 0, 2]])
+    assert_raise_message(ValueError, illegal_inclusion_msg,
+                         cf.fit_transform, X)
+    assert_raise_message(ValueError, illegal_inclusion_msg,
+                         cf2.fit_transform, X)
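+
+
+def test_count_featurizer_pipeline_sketch():
+    # Illustrative sketch, not part of the original PR: since
+    # CountFeaturizer implements fit/transform, it should compose with
+    # make_pipeline like any other transformer. LogisticRegression is an
+    # arbitrary choice of downstream estimator.
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.pipeline import make_pipeline
+    X = np.array([[0], [0], [1], [1]])
+    y = np.array([0, 1, 0, 0])
+    pipe = make_pipeline(CountFeaturizer(), LogisticRegression())
+    pipe.fit(X, y)
+    # One original column plus one count column per class value of y.
+    assert pipe.named_steps['countfeaturizer'].transform(X).shape == (4, 3)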