From 38537ed2483ae948e481c3c05613e22765e803cb Mon Sep 17 00:00:00 2001 From: Arjun-Jauhari Date: Tue, 28 Mar 2017 12:38:40 -0400 Subject: [PATCH 01/14] Rebase and commit the OrdinalEncoder implementation --- sklearn/preprocessing/data.py | 218 ++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index b4549e09e6291..7ecaf4d717dcc 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2873,3 +2873,221 @@ def inverse_transform(self, X): X_tr[mask, idx] = None return X_tr + + +class OrdinalEncoder(BaseEstimator, TransformerMixin): + """Encode ordinal integer features using a unary scheme. + + The input to this transformer should be a matrix of integers, denoting + the values taken on by ordinal (discrete) features. The output will be + a matrix where all the columns with index lower than feature value will + be active. It is assumed that input features take on values in the range + [0, n_values). + + This encoding is needed for feeding ordinal data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_values : 'auto', int or array of ints + Number of values per feature. + + - 'auto' : determine value range from training data. + - int : number of ordinal values per feature. + Each feature value should be in ``range(n_values)`` + - array : ``n_values[i]`` is the number of ordinal values in + ``X[:, i]``. Each feature value should be + in ``range(n_values[i])`` + + ordinal_features : "all" or array of indices or mask + Specify what features are treated as ordinal. + + - 'all' (default): All features are treated as ordinal. + - array of indices: Array of ordinal feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-ordinal features are always stacked to the right of the matrix. 
+ + dtype : number type, default=np.float + Desired dtype of output. + + sparse : boolean, default=True + Will return sparse matrix if set True else will return an array. + + handle_unknown : str, 'error' or 'ignore' + Whether to raise an error or ignore if a unknown ordinal feature is + present during transform. + + Attributes + ---------- + feature_indices_ : array of shape (n_features,) + Indices to feature ranges. + Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + + n_values_ : array of shape (n_features,) + Maximum number of values per feature. + + Examples + -------- + Given a dataset with three features and four samples, we let the encoder + find the maximum value per feature and transform the data to a binary + Ordinal encoding. + + >>> from sklearn.preprocessing import OrdinalEncoder + >>> enc = OrdinalEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ +[1, 0, 2]]) # doctest: +ELLIPSIS + OrdinalEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + n_values='auto', ordinal_features='all', sparse=True) + >>> enc.n_values_ + array([2, 3, 4]) + >>> enc.feature_indices_ + array([0, 1, 3, 6]) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 0., 1., 0., 1., 0., 0.]]) + + See also + -------- + sklearn.feature_extraction.DictVectorizer : performs a Ordinal encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : performs an approximate Ordinal + encoding of dictionary items or strings. + sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all + fashion. + sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of + iterables and a multilabel format, e.g. a (samples x classes) binary + matrix indicating the presence of a class label. + sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 + and n_classes-1. 
+ """ + def __init__(self, n_values="auto", ordinal_features="all", + dtype=np.float64, sparse=True, handle_unknown='error'): + self.n_values = n_values + self.ordinal_features = ordinal_features + self.dtype = dtype + self.sparse = sparse + self.handle_unknown = handle_unknown + + def fit(self, X, y=None): + """Fit OrdinalEncoder to X. + + Parameters + ---------- + X : array-like, shape [n_samples, n_feature] + Input array of type int. + """ + self.fit_transform(X) + return self + + def _fit_transform(self, X): + """Assumes X contains only ordinal features.""" + X = check_array(X, dtype=np.int) + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + if (isinstance(self.n_values, six.string_types) and + self.n_values == 'auto'): + n_values = np.max(X, axis=0) + 1 + elif isinstance(self.n_values, numbers.Integral): + if (np.max(X, axis=0) >= self.n_values).any(): + raise ValueError("Feature out of bounds for n_values=%d" + % self.n_values) + n_values = np.empty(n_features, dtype=np.int) + n_values.fill(self.n_values) + else: + try: + n_values = np.asarray(self.n_values, dtype=int) + except (ValueError, TypeError): + raise TypeError("Wrong type for parameter `n_values`. 
Expected" + " 'auto', int or array of ints, got %r" + % self.n_values) + if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if n_values is an array," + " it has to be of shape (n_features,).") + + self.n_values_ = n_values + n_values = np.hstack([[0], n_values - 1]) + indices = np.cumsum(n_values) + self.feature_indices_ = indices + + column_start = np.tile(indices[:-1], n_samples) + column_end = (X + indices[:-1]).ravel() + column_indices = np.hstack([np.arange(s, e) for s, e + in zip(column_start, column_end)]) + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + X.sum(axis=1)) + data = np.ones(X.sum()) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + + return out if self.sparse else out.toarray() + + def fit_transform(self, X, y=None): + """Fit OrdinalEncoder to X, then transform X. + + Equivalent to self.fit(X).transform(X), but more convenient and more + efficient. See fit for the parameters, transform for the return value. + """ + return _transform_selected(X, self._fit_transform, + self.ordinal_features, copy=True) + + def _transform(self, X): + """Assumes X contains only ordinal features.""" + X = check_array(X, dtype=np.int) + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + + indices = self.feature_indices_ + if n_features != indices.shape[0] - 1: + raise ValueError("X has different shape than during fitting." + " Expected %d, got %d." + % (indices.shape[0] - 1, n_features)) + + # We use only those ordinal features of X that are known using fit. + # i.e lesser than n_values_ using mask. + # This means, if self.handle_unknown is "ignore", the row_indices and + # col_indices corresponding to the unknown ordinal feature are + # ignored. 
+ mask = (X < self.n_values_).ravel() + if np.any(~mask): + if self.handle_unknown not in ['error', 'ignore']: + raise ValueError("handle_unknown should be either 'error' or " + "'ignore' got %s" % self.handle_unknown) + if self.handle_unknown == 'error': + raise ValueError("unknown ordinal feature present %s " + "during transform." % X.ravel()[~mask]) + + column_start = np.tile(indices[:-1], n_samples)[mask] + column_end = (X + indices[:-1]).ravel()[mask] + column_indices = np.hstack([np.arange(s, e) for s, e + in zip(column_start, column_end)]) + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + np.where(mask.reshape(X.shape), X, + 0).sum(axis=1)) + data = np.ones(X.ravel()[mask].sum()) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + + return out if self.sparse else out.toarray() + + def transform(self, X): + """Transform X using Ordinal encoding. + + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + Input array of type int. + + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array, dtype=int + Transformed input. 
+ """ + return _transform_selected(X, self._transform, + self.ordinal_features, copy=True) From a85008280434461ca8eb8b00fff9c03aff1e6e88 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Sat, 24 Jun 2017 16:25:39 -0400 Subject: [PATCH 02/14] Updating name to UnaryEncoder and adding single quote in error string --- sklearn/preprocessing/__init__.py | 2 ++ sklearn/preprocessing/data.py | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 0f5054e57f608..3a24e7aec8234 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -23,6 +23,7 @@ from .data import quantile_transform from .data import OneHotEncoder from .data import CategoricalEncoder +from .data import UnaryEncoder from .data import PolynomialFeatures @@ -60,4 +61,5 @@ 'minmax_scale', 'label_binarize', 'quantile_transform', + 'UnaryEncoder' ] diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 7ecaf4d717dcc..8c24b5f3d97a5 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -59,6 +59,7 @@ 'maxabs_scale', 'minmax_scale', 'quantile_transform', + 'UnaryEncoder' ] @@ -1999,7 +2000,7 @@ def _fit_transform(self, X): except (ValueError, TypeError): raise TypeError("Wrong type for parameter `n_values`. 
Expected" " 'auto', int or array of ints, got %r" - % type(X)) + % self.n_values) if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: raise ValueError("Shape mismatch: if n_values is an array," " it has to be of shape (n_features,).") @@ -2061,8 +2062,8 @@ def _transform(self, X): mask = (X < self.n_values_).ravel() if np.any(~mask): if self.handle_unknown not in ['error', 'ignore']: - raise ValueError("handle_unknown should be either error or " - "unknown got %s" % self.handle_unknown) + raise ValueError("handle_unknown should be either 'error' or " + "'ignore' got %s" % self.handle_unknown) if self.handle_unknown == 'error': raise ValueError("unknown categorical feature present %s " "during transform." % X.ravel()[~mask]) @@ -2875,7 +2876,7 @@ def inverse_transform(self, X): return X_tr -class OrdinalEncoder(BaseEstimator, TransformerMixin): +class UnaryEncoder(BaseEstimator, TransformerMixin): """Encode ordinal integer features using a unary scheme. The input to this transformer should be a matrix of integers, denoting @@ -2936,11 +2937,11 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): find the maximum value per feature and transform the data to a binary Ordinal encoding. - >>> from sklearn.preprocessing import OrdinalEncoder - >>> enc = OrdinalEncoder() + >>> from sklearn.preprocessing import UnaryEncoder + >>> enc = UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ [1, 0, 2]]) # doctest: +ELLIPSIS - OrdinalEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', n_values='auto', ordinal_features='all', sparse=True) >>> enc.n_values_ array([2, 3, 4]) @@ -2972,7 +2973,7 @@ def __init__(self, n_values="auto", ordinal_features="all", self.handle_unknown = handle_unknown def fit(self, X, y=None): - """Fit OrdinalEncoder to X. + """Fit UnaryEncoder to X. 
Parameters ---------- @@ -3027,7 +3028,7 @@ def _fit_transform(self, X): return out if self.sparse else out.toarray() def fit_transform(self, X, y=None): - """Fit OrdinalEncoder to X, then transform X. + """Fit UnaryEncoder to X, then transform X. Equivalent to self.fit(X).transform(X), but more convenient and more efficient. See fit for the parameters, transform for the return value. From 661951c3ede340d3dd0517e4a7cbbfc8bd6a9a79 Mon Sep 17 00:00:00 2001 From: Ruxandra Burtica Date: Sun, 25 Jun 2017 14:23:24 +0300 Subject: [PATCH 03/14] Merged changes from #9216 --- doc/modules/classes.rst | 1 + doc/modules/preprocessing.rst | 47 +++++++ sklearn/preprocessing/data.py | 28 ++++- sklearn/preprocessing/tests/test_data.py | 151 +++++++++++++++++++++++ sklearn/utils/testing.py | 2 +- 5 files changed, 222 insertions(+), 7 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index f1a2e973d187f..1a61edb3a2845 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1199,6 +1199,7 @@ Model validation preprocessing.Normalizer preprocessing.OneHotEncoder preprocessing.CategoricalEncoder + preprocessing.UnaryEncoder preprocessing.PolynomialFeatures preprocessing.QuantileTransformer preprocessing.RobustScaler diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 969a2921b4061..549555e141f40 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -537,6 +537,53 @@ columns for this feature will be all zeros See :ref:`dict_feature_extraction` for categorical features that are represented as a dict, not as scalars. +.. _preprocessing_ordinal_features: + +Encoding ordinal features +============================= +Often categorical features have a clear ordering. For example a person could have features +``["short", "tall"]``, +``["low income", "medium income", "high income"]``, +``["elementary school graduate", "high school graduate", "some college", "college graduate"]``. 
+Even though these features can be ordered, we shouldn't necessarily assign scores to them, +as the difference between categories one and two is not the same as the difference +between categories two and three. + +One possibility to convert these ordinal features to features that can be used +with scikit-learn estimators is to use a unary encoding, which is +implemented in :class:`UnaryEncoder`. This estimator transforms each +ordinal feature with ``m`` possible values into ``m - 1`` binary features, where the ith +feature is active if x > i (for i = 0, ... k - 1). + +Continuing the example above:: + + >>> enc = preprocessing.UnaryEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + n_values='auto', ordinal_features='all', sparse=True) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 0., 1., 0., 1., 0., 0.]]) + +By default, how many values each feature can take is inferred automatically from the dataset. +It is possible to specify this explicitly using the parameter ``n_values``. +There are two genders, three possible continents and four web browsers in our +dataset. +Then we fit the estimator, and transform a data point. +In the result, the first number encodes the height, the next two numbers the income level, +and the next set of three numbers the education level. + +Note that, if there is a possibilty that the training data might have missing categorical +features, one has to explicitly set ``n_values``. For example, + + >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4]) + >>> # Note that there are missing categorical values for the 2nd and 3rd + >>> # features + >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + n_values=[2, 3, 4], ordinal_features='all', sparse=True) + >>> enc.transform([[1, 1, 2]]).toarray() + array([[ 1., 1., 0., 1., 1., 0.]]) + .. 
_imputation: Imputation of missing values diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 8c24b5f3d97a5..64db56c1fbe68 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1955,6 +1955,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): matrix indicating the presence of a class label. sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 and n_classes-1. + sklearn.preprocessing.UnaryEncoder: encodes ordinal integer features + using a unary scheme. """ def __init__(self, n_values="auto", categorical_features="all", dtype=np.float64, sparse=True, handle_unknown='error'): @@ -2881,14 +2883,14 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): The input to this transformer should be a matrix of integers, denoting the values taken on by ordinal (discrete) features. The output will be - a matrix where all the columns with index lower than feature value will - be active. It is assumed that input features take on values in the range + a sparse matrix where each column corresponds to one possible value of one + feature. It is assumed that input features take on values in the range [0, n_values). - This encoding is needed for feeding ordinal data to many scikit-learn + This encoding is needed for feeding ordinal features to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -2923,10 +2925,16 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Attributes ---------- + active_features_ : array + Indices for active features, meaning values that actually occur + in the training set. All featurs are available when n_values is + ``'auto'``. + feature_indices_ : array of shape (n_features,) Indices to feature ranges. 
Feature ``i`` in the original data is mapped to features from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by `active_features_` afterwards) n_values_ : array of shape (n_features,) Maximum number of values per feature. @@ -2935,18 +2943,20 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): -------- Given a dataset with three features and four samples, we let the encoder find the maximum value per feature and transform the data to a binary - Ordinal encoding. + unary encoding. >>> from sklearn.preprocessing import UnaryEncoder >>> enc = UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ [1, 0, 2]]) # doctest: +ELLIPSIS UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values='auto', ordinal_features='all', sparse=True) + n_values='auto', ordinal_features='all', sparse=True) >>> enc.n_values_ array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) + >>> enc.active_features_ + array([0, 1, 2, 3, 4, 5]) >>> enc.transform([[0, 1, 1]]).toarray() array([[ 0., 1., 0., 1., 0., 0.]]) @@ -2956,6 +2966,8 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): dictionary items (also handles string-valued features). sklearn.feature_extraction.FeatureHasher : performs an approximate Ordinal encoding of dictionary items or strings. + sklearn.preprocessing.OneHotEncoder: encodes categorical integer features + using a one-hot aka one-of-K scheme. sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all fashion. 
sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of @@ -3025,6 +3037,10 @@ def _fit_transform(self, X): shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() + if (isinstance(self.n_values, six.string_types) and + self.n_values == 'auto'): + self.active_features_ = np.arange(out.shape[1]) + return out if self.sparse else out.toarray() def fit_transform(self, X, y=None): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index e715ceacfac25..34246081094fc 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -42,6 +42,7 @@ from sklearn.preprocessing.data import normalize from sklearn.preprocessing.data import OneHotEncoder from sklearn.preprocessing.data import CategoricalEncoder +from sklearn.preprocessing.data import UnaryEncoder from sklearn.preprocessing.data import StandardScaler from sklearn.preprocessing.data import scale from sklearn.preprocessing.data import MinMaxScaler @@ -2224,3 +2225,153 @@ def test_quantile_transform_valid_axis(): assert_raises_regex(ValueError, "axis should be either equal to 0 or 1" ". 
Got axis=2", quantile_transform, X.T, axis=2) + + +def _generate_random_features_matrix(n_values=3, size=10): + rng = np.random.RandomState(0) + X = rng.randint(n_values, size=(size, n_values)) + return X + + +def test_unary_encoder(): + X = np.arange(5).reshape(-1, 1) + enc = UnaryEncoder(5) + Xt = enc.fit_transform(X).toarray() + assert_array_equal(Xt, [[0, 0, 0, 0], # 0 + [1, 0, 0, 0], # 1 + [1, 1, 0, 0], # 2 + [1, 1, 1, 0], # 3 + [1, 1, 1, 1]]) # 4 + Xt2 = enc.transform(X).toarray() + assert_array_equal(Xt2, Xt) + + +def test_unary_encoder_stack(): + # multiple input features stack to same output + n_values = np.random.randint(2, 10) + size = np.random.randint(1, 10) + + encoder = UnaryEncoder(n_values, sparse=False) + X_multi = _generate_random_features_matrix(n_values, size) + X_multi_t = encoder.fit_transform(X_multi) + assert_equal(X_multi_t.shape, (size, n_values * (n_values - 1))) + + expected = np.hstack([encoder.fit_transform(X_multi[:, i:(i + 1)]) + for i in range(X_multi.shape[1])]) + assert_array_equal(expected, X_multi_t) + + +def test_unary_encoder_dense_sparse(): + # test dense output in comparison to sparse results. + n_values = np.random.randint(1, 10) + size = np.random.randint(1, 10) + + sparse_encoder = UnaryEncoder(n_values) + dense_encoder = UnaryEncoder(n_values, sparse=False) + + X = _generate_random_features_matrix(n_values, size) + X_trans_sparse = sparse_encoder.fit_transform(X) + X_trans_dense = dense_encoder.fit_transform(X) + + assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) + + +def test_unary_encoder_handle_unknown(): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + y = np.array([[4, 1, 1]]) + + # Test that encoder raises error for unknown features. + encoder = UnaryEncoder(handle_unknown='error') + encoder.fit(X) + assert_raises(ValueError, encoder.transform, y) + + # Test the ignore option, ignores unknown features. 
+ encoder = UnaryEncoder(handle_unknown='ignore') + encoder.fit(X) + assert_array_equal( + encoder.transform(y).toarray(), + np.array([[0., 1., 0., 1., 0., 0.]])) + + # Raise error if handle_unknown is neither ignore or error. + encoder = UnaryEncoder(handle_unknown='42') + encoder.fit(X) + assert_raises(ValueError, encoder.transform, y) + + +def test_unary_encoder_errors(): + n_values = np.random.randint(2, 10) + size = np.random.randint(1, 10) + delta = np.random.randint(1, 10) + + encoder = UnaryEncoder(n_values) + X = _generate_random_features_matrix(n_values, size) + encoder.fit(X) + + # test that an error is raised when different shape + larger_n_values = n_values + delta + X_too_large = _generate_random_features_matrix(larger_n_values, size) + assert_raises(ValueError, encoder.transform, X_too_large) + error_msg = ("X has different shape than during fitting." + " Expected {}, got {}.".format(n_values, larger_n_values)) + assert_raises_regex(ValueError, error_msg, encoder.transform, X_too_large) + assert_raises(ValueError, UnaryEncoder(n_values=1).fit_transform, X) + + # test that an error is raised when out of bounds + X[0][0] = n_values + delta + X_out_of_bounds = X + assert_raises(ValueError, encoder.transform, X_out_of_bounds) + error_msg = ("unknown ordinal feature present \[{}\] " + "during transform.".format(n_values + delta)) + assert_raises_regex(ValueError, error_msg, encoder.transform, + X_out_of_bounds) + + # test exception on wrong init param + assert_raises(TypeError, UnaryEncoder(n_values=np.int).fit, X) + + # test negative input to fit + encoder = UnaryEncoder() + assert_raises(ValueError, encoder.fit, [[0], [-1]]) + + # test negative input to transform + encoder.fit([[0], [1]]) + assert_raises(ValueError, encoder.transform, [[0], [-1]]) + + +def test_unary_encoder_edge_cases(): + EDGE_CASES = [ + ( + [[0], [1], [2]], + np.array([[0, 0], [1, 0], [1, 1]]), + ), + ( + [[0], [0], [1]], + np.array([[0], [0], [1]]), + ), + ( + [[0, 0], [0, 0], [0, 
1]], + np.array([[0], [0], [1]]), + ), + ] + + for input_matrix, expected_matrix in EDGE_CASES: + transformed = UnaryEncoder().fit_transform(input_matrix) + assert_array_equal(transformed.toarray(), expected_matrix) + + +def test_unary_encoder_n_values_int(): + # Test UnaryEncoder's fit and transform. + n_values = np.random.randint(2, 10) + size = np.random.randint(1, 10) + delta = np.random.randint(1, 10) + + encoder_n_values = n_values + delta + unary_n_values = encoder_n_values - 1 + enc = UnaryEncoder(n_values=encoder_n_values) + + X = _generate_random_features_matrix(n_values, size) + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (size, unary_n_values * len(X[0]))) + assert_array_equal( + enc.feature_indices_, + np.arange(0, unary_n_values * len(X[0]) + 1, unary_n_values) + ) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 90994b71b782b..3026451becd3d 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -523,7 +523,7 @@ def uninstall_mldata_mock(): 'TfidfVectorizer', 'IsotonicRegression', 'OneHotEncoder', 'RandomTreesEmbedding', 'CategoricalEncoder', 'FeatureHasher', 'DummyClassifier', 'DummyRegressor', - 'TruncatedSVD', 'PolynomialFeatures', + 'TruncatedSVD', 'PolynomialFeatures', 'UnaryEncoder', 'GaussianRandomProjectionHash', 'HashingVectorizer', 'CheckingClassifier', 'PatchExtractor', 'CountVectorizer', # GradientBoosting base estimators, maybe should From eb8bc9459bcf415f690be51851543f1e12bc29d7 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Tue, 28 Nov 2017 19:37:14 -0500 Subject: [PATCH 04/14] Removing active_features_ attribute from UnaryEncoder as it is not needed --- sklearn/preprocessing/data.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 64db56c1fbe68..bf21f23ad8e81 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2925,16 +2925,10 @@ class UnaryEncoder(BaseEstimator, 
TransformerMixin): Attributes ---------- - active_features_ : array - Indices for active features, meaning values that actually occur - in the training set. All featurs are available when n_values is - ``'auto'``. - feature_indices_ : array of shape (n_features,) Indices to feature ranges. Feature ``i`` in the original data is mapped to features from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - (and then potentially masked by `active_features_` afterwards) n_values_ : array of shape (n_features,) Maximum number of values per feature. @@ -2955,8 +2949,6 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) - >>> enc.active_features_ - array([0, 1, 2, 3, 4, 5]) >>> enc.transform([[0, 1, 1]]).toarray() array([[ 0., 1., 0., 1., 0., 0.]]) @@ -3037,10 +3029,6 @@ def _fit_transform(self, X): shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() - if (isinstance(self.n_values, six.string_types) and - self.n_values == 'auto'): - self.active_features_ = np.arange(out.shape[1]) - return out if self.sparse else out.toarray() def fit_transform(self, X, y=None): From 88d5eb4a596a5eeec594aff6a52c3bfcf4cd197d Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Wed, 29 Nov 2017 08:37:28 -0500 Subject: [PATCH 05/14] Limiting the lines in documentation to less that 80 chars --- doc/modules/preprocessing.rst | 69 ++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 549555e141f40..4e260c33aabb8 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -541,48 +541,51 @@ as a dict, not as scalars. Encoding ordinal features ============================= -Often categorical features have a clear ordering. For example a person could have features +Often categorical features have a clear ordering. 
For example a person could +have features ``["short", "tall"]``, ``["low income", "medium income", "high income"]``, -``["elementary school graduate", "high school graduate", "some college", "college graduate"]``. -Even though these features can be ordered, we shouldn't necessarily assign scores to them, -as the difference between categories one and two is not the same as the difference -between categories two and three. +``["elementary school graduate", "high school graduate", "some college", + "college graduate"]``. +Even though these features can be ordered, we shouldn't necessarily assign +scores to them, as the difference between categories one and two is not the +same as the difference between categories two and three. One possibility to convert these ordinal features to features that can be used with scikit-learn estimators is to use a unary encoding, which is implemented in :class:`UnaryEncoder`. This estimator transforms each -ordinal feature with ``m`` possible values into ``m - 1`` binary features, where the ith -feature is active if x > i (for i = 0, ... k - 1). +ordinal feature with ``m`` possible values into ``m - 1`` binary features, +where the ith feature is active if x > i (for i = 0, ..., m - 2). Continuing the example above:: - >>> enc = preprocessing.UnaryEncoder() - >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values='auto', ordinal_features='all', sparse=True) - >>> enc.transform([[0, 1, 1]]).toarray() - array([[ 0., 1., 0., 1., 0., 0.]]) - -By default, how many values each feature can take is inferred automatically from the dataset. -It is possible to specify this explicitly using the parameter ``n_values``. -There are two genders, three possible continents and four web browsers in our -dataset. -Then we fit the estimator, and transform a data point. 
-In the result, the first number encodes the height, the next two numbers the income level, -and the next set of three numbers the education level. - -Note that, if there is a possibilty that the training data might have missing categorical -features, one has to explicitly set ``n_values``. For example, - - >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4]) - >>> # Note that there are missing categorical values for the 2nd and 3rd - >>> # features - >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values=[2, 3, 4], ordinal_features='all', sparse=True) - >>> enc.transform([[1, 1, 2]]).toarray() - array([[ 1., 1., 0., 1., 1., 0.]]) + >>> enc = preprocessing.UnaryEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + n_values='auto', ordinal_features='all', sparse=True) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 0., 1., 0., 1., 0., 0.]]) + +By default, how many values each feature can take is inferred automatically +from the dataset. It is possible to specify this explicitly using the parameter +``n_values``. +* There are two height categories, three income levels and four education + levels in our dataset. +* Then we fit the estimator, and transform a data point. +* In the result, the first number encodes the height, the next two numbers the + income level, and the next set of three numbers the education level. + +Note that, if there is a possibility that the training data might have missing +categorical features, one has to explicitly set ``n_values``. For example:: + + >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4]) + >>> # Note that there are missing categorical values for the 2nd and 3rd + >>> # features + >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<... 
'numpy.float64'>, handle_unknown='error', + n_values=[2, 3, 4], ordinal_features='all', sparse=True) + >>> enc.transform([[1, 1, 2]]).toarray() + array([[ 1., 1., 0., 1., 1., 0.]]) .. _imputation: From cd21cbfdd2b1d43e1bb73e1f387206fcfe393368 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Wed, 29 Nov 2017 18:38:44 -0500 Subject: [PATCH 06/14] Updated documentation. Changed the default value of sparse parameter to False --- doc/modules/preprocessing.rst | 10 ++++++---- sklearn/preprocessing/data.py | 33 ++++++++++++++------------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4e260c33aabb8..cc461afd5c0b9 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -543,10 +543,12 @@ Encoding ordinal features ============================= Often categorical features have a clear ordering. For example a person could have features -``["short", "tall"]``, -``["low income", "medium income", "high income"]``, -``["elementary school graduate", "high school graduate", "some college", - "college graduate"]``. + +* ``["short", "tall"]`` +* ``["low income", "medium income", "high income"]`` +* ``["elementary school graduate", "high school graduate", "some college", + "college graduate"]`` + Even though these features can be ordered, we shouldn't necessarily assign scores to them, as the difference between categories one and two is not the same as the difference between categories two and three. diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bf21f23ad8e81..93a4910b75add 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2885,7 +2885,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): the values taken on by ordinal (discrete) features. The output will be a sparse matrix where each column corresponds to one possible value of one feature. 
It is assumed that input features take on values in the range - [0, n_values). + 0 to (n_values - 1). This encoding is needed for feeding ordinal features to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. @@ -2916,7 +2916,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): dtype : number type, default=np.float Desired dtype of output. - sparse : boolean, default=True + sparse : boolean, default=False Will return sparse matrix if set True else will return an array. handle_unknown : str, 'error' or 'ignore' @@ -2941,35 +2941,26 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import UnaryEncoder >>> enc = UnaryEncoder() - >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ -[1, 0, 2]]) # doctest: +ELLIPSIS + >>> enc.fit([[0, 0, 3], + [1, 1, 0], + [0, 2, 1], + [1, 0, 2]]) # doctest: +ELLIPSIS UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values='auto', ordinal_features='all', sparse=True) + n_values='auto', ordinal_features='all', sparse=False) >>> enc.n_values_ array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) - >>> enc.transform([[0, 1, 1]]).toarray() - array([[ 0., 1., 0., 1., 0., 0.]]) + >>> enc.transform([[0, 1, 2]]).toarray() + array([[ 0., 1., 0., 1., 1., 0.]]) See also -------- - sklearn.feature_extraction.DictVectorizer : performs a Ordinal encoding of - dictionary items (also handles string-valued features). - sklearn.feature_extraction.FeatureHasher : performs an approximate Ordinal - encoding of dictionary items or strings. sklearn.preprocessing.OneHotEncoder: encodes categorical integer features using a one-hot aka one-of-K scheme. - sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all - fashion. - sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of - iterables and a multilabel format, e.g. a (samples x classes) binary - matrix indicating the presence of a class label. 
- sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 - and n_classes-1. """ def __init__(self, n_values="auto", ordinal_features="all", - dtype=np.float64, sparse=True, handle_unknown='error'): + dtype=np.float64, sparse=False, handle_unknown='error'): self.n_values = n_values self.ordinal_features = ordinal_features self.dtype = dtype @@ -2983,6 +2974,8 @@ def fit(self, X, y=None): ---------- X : array-like, shape [n_samples, n_feature] Input array of type int. + All feature values should be non-negative otherwise will raise a + ValueError. """ self.fit_transform(X) return self @@ -3088,6 +3081,8 @@ def transform(self, X): ---------- X : array-like, shape [n_samples, n_features] Input array of type int. + All feature values should be non-negative otherwise will raise a + ValueError. Returns ------- From 81af018d763720470ad650de2c73dee33da5338e Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Wed, 29 Nov 2017 19:34:54 -0500 Subject: [PATCH 07/14] Updated test cases to accommodate change in default value of sparse parameter. 
Also added a new test case test_unary_encoder_n_values_array --- sklearn/preprocessing/tests/test_data.py | 36 +++++++++++++++++++----- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 34246081094fc..d7d1b8b703000 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2236,13 +2236,13 @@ def _generate_random_features_matrix(n_values=3, size=10): def test_unary_encoder(): X = np.arange(5).reshape(-1, 1) enc = UnaryEncoder(5) - Xt = enc.fit_transform(X).toarray() + Xt = enc.fit_transform(X) assert_array_equal(Xt, [[0, 0, 0, 0], # 0 [1, 0, 0, 0], # 1 [1, 1, 0, 0], # 2 [1, 1, 1, 0], # 3 [1, 1, 1, 1]]) # 4 - Xt2 = enc.transform(X).toarray() + Xt2 = enc.transform(X) assert_array_equal(Xt2, Xt) @@ -2266,8 +2266,8 @@ def test_unary_encoder_dense_sparse(): n_values = np.random.randint(1, 10) size = np.random.randint(1, 10) - sparse_encoder = UnaryEncoder(n_values) - dense_encoder = UnaryEncoder(n_values, sparse=False) + sparse_encoder = UnaryEncoder(n_values, sparse=True) + dense_encoder = UnaryEncoder(n_values) X = _generate_random_features_matrix(n_values, size) X_trans_sparse = sparse_encoder.fit_transform(X) @@ -2289,7 +2289,7 @@ def test_unary_encoder_handle_unknown(): encoder = UnaryEncoder(handle_unknown='ignore') encoder.fit(X) assert_array_equal( - encoder.transform(y).toarray(), + encoder.transform(y), np.array([[0., 1., 0., 1., 0., 0.]])) # Raise error if handle_unknown is neither ignore or error. @@ -2355,11 +2355,11 @@ def test_unary_encoder_edge_cases(): for input_matrix, expected_matrix in EDGE_CASES: transformed = UnaryEncoder().fit_transform(input_matrix) - assert_array_equal(transformed.toarray(), expected_matrix) + assert_array_equal(transformed, expected_matrix) def test_unary_encoder_n_values_int(): - # Test UnaryEncoder's fit and transform. + # Test UnaryEncoder's n_values parameter when set as an int. 
n_values = np.random.randint(2, 10) size = np.random.randint(1, 10) delta = np.random.randint(1, 10) @@ -2375,3 +2375,25 @@ def test_unary_encoder_n_values_int(): enc.feature_indices_, np.arange(0, unary_n_values * len(X[0]) + 1, unary_n_values) ) + + +def test_unary_encoder_n_values_array(): + # Test UnaryEncoder's n_values parameter when set as an array. + n_features = np.random.randint(2, 10) + size = np.random.randint(1, 10) + delta = np.random.randint(1, 10) + + n_values_array = [n_features] * n_features + enc = UnaryEncoder(n_values=n_values_array) + X = _generate_random_features_matrix(n_features, size) + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (size, sum(n_values_array) - n_features)) + + n_values_array = np.random.randint(2, 10, n_features + delta) + enc = UnaryEncoder(n_values=n_values_array) + X = _generate_random_features_matrix(n_features, size) + assert_raises(ValueError, enc.fit_transform, X) + + enc = UnaryEncoder(n_values=[]) + X = _generate_random_features_matrix(n_features, size) + assert_raises(ValueError, enc.fit_transform, X) From f4ba31003a8a47c0e832ff06c1611c2ca07261d4 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Thu, 30 Nov 2017 20:50:49 -0500 Subject: [PATCH 08/14] Commit to accommodate all the requested changes 1. Fixed test failures 2. Updated docs 3. UnaryEncoder: Changed handle_unknown to handle_greater Updated docs UnaryEncoder: Changed handle_unknown to handle_greater --- doc/modules/preprocessing.rst | 6 ++ sklearn/preprocessing/data.py | 80 +++++++++++++++--------- sklearn/preprocessing/tests/test_data.py | 18 +++--- 3 files changed, 64 insertions(+), 40 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index cc461afd5c0b9..9e925b6a562ac 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -559,6 +559,12 @@ implemented in :class:`UnaryEncoder`. 
This estimator transforms each ordinal feature with ``m`` possible values into ``m - 1`` binary features, where the ith feature is active if x > i (for i = 0, ... k - 1). +**NOTE**: This encoding is likely to help when used with linear models and +kernel-based models like SVMs with the standard kernels. On the other hand, this +transformation is unlikely to help when using with tree-based models, +since those already work on the basis of a particular feature value being +< or > than a threshold, unlike linear and kernel-based models. + Continuing the example above:: >>> enc = preprocessing.UnaryEncoder() diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 93a4910b75add..d24d649d4a847 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2002,7 +2002,7 @@ def _fit_transform(self, X): except (ValueError, TypeError): raise TypeError("Wrong type for parameter `n_values`. Expected" " 'auto', int or array of ints, got %r" - % self.n_values) + % type(X)) if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: raise ValueError("Shape mismatch: if n_values is an array," " it has to be of shape (n_features,).") @@ -2881,14 +2881,18 @@ def inverse_transform(self, X): class UnaryEncoder(BaseEstimator, TransformerMixin): """Encode ordinal integer features using a unary scheme. - The input to this transformer should be a matrix of integers, denoting - the values taken on by ordinal (discrete) features. The output will be - a sparse matrix where each column corresponds to one possible value of one - feature. It is assumed that input features take on values in the range + The input to this transformer should be a matrix of non-negative integers, + denoting the values taken on by ordinal (discrete) features. The output + will be a matrix where each column corresponds to one possible value of + one feature. It is assumed that input features take on values in the range 0 to (n_values - 1). 
This encoding is needed for feeding ordinal features to many scikit-learn - estimators, notably linear models and SVMs with the standard kernels. + estimators, notably linear models and kernel-based models like SVMs with + the standard kernels. + This transformation is unlikely to help when using with tree-based models, + since those already work on the basis of a particular feature value being + < or > than a threshold, unlike linear and kernel-based models. Read more in the :ref:`User Guide `. @@ -2919,9 +2923,10 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): sparse : boolean, default=False Will return sparse matrix if set True else will return an array. - handle_unknown : str, 'error' or 'ignore' - Whether to raise an error or ignore if a unknown ordinal feature is - present during transform. + handle_greater : str, 'error' or 'clip' + Whether to raise an error or clip if a greater ordinal feature value is + present during transform as compare to largest feature value seen + during fit. Attributes ---------- @@ -2942,16 +2947,16 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import UnaryEncoder >>> enc = UnaryEncoder() >>> enc.fit([[0, 0, 3], - [1, 1, 0], - [0, 2, 1], - [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + ... [1, 1, 0], + ... [0, 2, 1], + ... [1, 0, 2]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', n_values='auto', ordinal_features='all', sparse=False) >>> enc.n_values_ array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) - >>> enc.transform([[0, 1, 2]]).toarray() + >>> enc.transform([[0, 1, 2]]) array([[ 0., 1., 0., 1., 1., 0.]]) See also @@ -2960,12 +2965,12 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): using a one-hot aka one-of-K scheme. 
""" def __init__(self, n_values="auto", ordinal_features="all", - dtype=np.float64, sparse=False, handle_unknown='error'): + dtype=np.float64, sparse=False, handle_greater='error'): self.n_values = n_values self.ordinal_features = ordinal_features self.dtype = dtype self.sparse = sparse - self.handle_unknown = handle_unknown + self.handle_greater = handle_greater def fit(self, X, y=None): """Fit UnaryEncoder to X. @@ -3028,7 +3033,20 @@ def fit_transform(self, X, y=None): """Fit UnaryEncoder to X, then transform X. Equivalent to self.fit(X).transform(X), but more convenient and more - efficient. See fit for the parameters, transform for the return value. + efficient. + + Parameters + ---------- + X : array-like, shape [n_samples, n_feature] + Input array of type int. + All feature values should be non-negative otherwise will raise a + ValueError. + + Returns + ------- + X_out : sparse matrix or a 2-d array + Transformed input. + """ return _transform_selected(X, self._fit_transform, self.ordinal_features, copy=True) @@ -3046,28 +3064,28 @@ def _transform(self, X): " Expected %d, got %d." % (indices.shape[0] - 1, n_features)) - # We use only those ordinal features of X that are known using fit. - # i.e lesser than n_values_ using mask. - # This means, if self.handle_unknown is "ignore", the row_indices and - # col_indices corresponding to the unknown ordinal feature are - # ignored. + # We clip those ordinal features of X that are greater than n_values_ + # using mask. + # This means, if self.handle_greater is "ignore", the row_indices and + # col_indices corresponding to the greater ordinal feature are all + # filled with ones. 
mask = (X < self.n_values_).ravel() if np.any(~mask): - if self.handle_unknown not in ['error', 'ignore']: - raise ValueError("handle_unknown should be either 'error' or " - "'ignore' got %s" % self.handle_unknown) - if self.handle_unknown == 'error': + if self.handle_greater not in ['error', 'clip']: + raise ValueError("handle_greater should be either 'error' or " + "'clip' got %s" % self.handle_greater) + if self.handle_greater == 'error': raise ValueError("unknown ordinal feature present %s " "during transform." % X.ravel()[~mask]) - column_start = np.tile(indices[:-1], n_samples)[mask] - column_end = (X + indices[:-1]).ravel()[mask] + X_ceil = np.where(mask.reshape(X.shape), X, self.n_values_ - 1) + column_start = np.tile(indices[:-1], n_samples) + column_end = (indices[:-1] + X_ceil).ravel() column_indices = np.hstack([np.arange(s, e) for s, e in zip(column_start, column_end)]) row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - np.where(mask.reshape(X.shape), X, - 0).sum(axis=1)) - data = np.ones(X.ravel()[mask].sum()) + X_ceil.sum(axis=1)) + data = np.ones(X_ceil.ravel().sum()) out = sparse.coo_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index d7d1b8b703000..b4310e090f478 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2276,24 +2276,24 @@ def test_unary_encoder_dense_sparse(): assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) -def test_unary_encoder_handle_unknown(): - X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) +def test_unary_encoder_handle_greater(): + X = np.array([[0, 2, 1], [1, 0, 3], [2, 0, 2]]) y = np.array([[4, 1, 1]]) - # Test that encoder raises error for unknown features. - encoder = UnaryEncoder(handle_unknown='error') + # Test that encoder raises error for greater features. 
+ encoder = UnaryEncoder(handle_greater='error') encoder.fit(X) assert_raises(ValueError, encoder.transform, y) - # Test the ignore option, ignores unknown features. - encoder = UnaryEncoder(handle_unknown='ignore') + # Test the ignore option, clips greater features. + encoder = UnaryEncoder(handle_greater='clip') encoder.fit(X) assert_array_equal( encoder.transform(y), - np.array([[0., 1., 0., 1., 0., 0.]])) + np.array([[1., 1., 1., 0., 1., 0., 0.]])) - # Raise error if handle_unknown is neither ignore or error. - encoder = UnaryEncoder(handle_unknown='42') + # Raise error if handle_greater is neither ignore or error. + encoder = UnaryEncoder(handle_greater='42') encoder.fit(X) assert_raises(ValueError, encoder.transform, y) From 0706c29bb215d59c4b41aa7afb0c02d40654c1e4 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Fri, 1 Dec 2017 01:23:26 -0500 Subject: [PATCH 09/14] Fixing DocTestFailure --- doc/modules/preprocessing.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 9e925b6a562ac..809179f3f9748 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -569,9 +569,9 @@ Continuing the example above:: >>> enc = preprocessing.UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values='auto', ordinal_features='all', sparse=True) - >>> enc.transform([[0, 1, 1]]).toarray() + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + n_values='auto', ordinal_features='all', sparse=False) + >>> enc.transform([[0, 1, 1]]) array([[ 0., 1., 0., 1., 0., 0.]]) By default, how many values each feature can take is inferred automatically @@ -590,9 +590,9 @@ categorical features, one has to explicitly set ``n_values``. 
For example,:: >>> # Note that there are missing categorical values for the 2nd and 3rd >>> # features >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values=[2, 3, 4], ordinal_features='all', sparse=True) - >>> enc.transform([[1, 1, 2]]).toarray() + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + n_values=[2, 3, 4], ordinal_features='all', sparse=False) + >>> enc.transform([[1, 1, 2]]) array([[ 1., 1., 0., 1., 1., 0.]]) .. _imputation: From b642a7e1f7db6471841457d3d8acafdee8fd1230 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Sun, 3 Dec 2017 00:10:36 -0500 Subject: [PATCH 10/14] Refactoring the code. Now fit_transform is equivalent to fit + transform. Parameter checking is done in the fit method. Plus, new test cases. Updated docs Updating implementation of UnaryEncoder checking handle_greater parameter value in fit New test cases --- doc/modules/preprocessing.rst | 12 +++--- sklearn/preprocessing/data.py | 47 +++++++++------------ sklearn/preprocessing/tests/test_data.py | 54 +++++++++++++++++++++--- 3 files changed, 74 insertions(+), 39 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 809179f3f9748..e5633f7c7972b 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -559,11 +559,13 @@ implemented in :class:`UnaryEncoder`. This estimator transforms each ordinal feature with ``m`` possible values into ``m - 1`` binary features, where the ith feature is active if x > i (for i = 0, ... k - 1). -**NOTE**: This encoding is likely to help when used with linear models and -kernel-based models like SVMs with the standard kernels. 
On the other hand, this -transformation is unlikely to help when using with tree-based models, -since those already work on the basis of a particular feature value being -< or > than a threshold, unlike linear and kernel-based models. +.. 
note:: + + This encoding is likely to help when used with linear models and + kernel-based models like SVMs with the standard kernels. On the other hand, this + transformation is unlikely to help when using with tree-based models, + since those already work on the basis of a particular feature value being + < or > than a threshold, unlike linear and kernel-based models. Continuing the example above:: diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index d24d649d4a847..d199ba431f0b0 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2924,9 +2924,8 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Will return sparse matrix if set True else will return an array. handle_greater : str, 'error' or 'clip' - Whether to raise an error or clip if a greater ordinal feature value is - present during transform as compare to largest feature value seen - during fit. + Whether to raise an error or clip if an ordinal feature >= n_values is + passed in. Attributes ---------- @@ -2982,15 +2981,19 @@ def fit(self, X, y=None): All feature values should be non-negative otherwise will raise a ValueError. 
""" - self.fit_transform(X) + _transform_selected(X, self._fit, self.ordinal_features, copy=True) return self - def _fit_transform(self, X): + def _fit(self, X): """Assumes X contains only ordinal features.""" X = check_array(X, dtype=np.int) + if self.handle_greater not in ['error', 'clip']: + raise ValueError("handle_greater should be either 'error' or " + "'clip' got %s" % self.handle_greater) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape + if (isinstance(self.n_values, six.string_types) and self.n_values == 'auto'): n_values = np.max(X, axis=0) + 1 @@ -3016,24 +3019,18 @@ def _fit_transform(self, X): indices = np.cumsum(n_values) self.feature_indices_ = indices - column_start = np.tile(indices[:-1], n_samples) - column_end = (X + indices[:-1]).ravel() - column_indices = np.hstack([np.arange(s, e) for s, e - in zip(column_start, column_end)]) - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - X.sum(axis=1)) - data = np.ones(X.sum()) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() + mask = (X < self.n_values_).ravel() + if np.any(~mask): + if self.handle_greater == 'error': + raise ValueError("unknown ordinal feature present %s " + % X.ravel()[~mask]) - return out if self.sparse else out.toarray() + return X def fit_transform(self, X, y=None): """Fit UnaryEncoder to X, then transform X. - Equivalent to self.fit(X).transform(X), but more convenient and more - efficient. + Equivalent to self.fit(X).transform(X), but more convenient. Parameters ---------- @@ -3048,8 +3045,8 @@ def fit_transform(self, X, y=None): Transformed input. 
""" - return _transform_selected(X, self._fit_transform, - self.ordinal_features, copy=True) + + return self.fit(X).transform(X) def _transform(self, X): """Assumes X contains only ordinal features.""" @@ -3065,15 +3062,11 @@ def _transform(self, X): % (indices.shape[0] - 1, n_features)) # We clip those ordinal features of X that are greater than n_values_ - # using mask. - # This means, if self.handle_greater is "ignore", the row_indices and - # col_indices corresponding to the greater ordinal feature are all - # filled with ones. + # using mask if self.handle_greater is "clip". + # This means, the row_indices and col_indices corresponding to the + # greater ordinal feature are all filled with ones. mask = (X < self.n_values_).ravel() if np.any(~mask): - if self.handle_greater not in ['error', 'clip']: - raise ValueError("handle_greater should be either 'error' or " - "'clip' got %s" % self.handle_greater) if self.handle_greater == 'error': raise ValueError("unknown ordinal feature present %s " "during transform." % X.ravel()[~mask]) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index b4310e090f478..caae825de8830 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2280,22 +2280,47 @@ def test_unary_encoder_handle_greater(): X = np.array([[0, 2, 1], [1, 0, 3], [2, 0, 2]]) y = np.array([[4, 1, 1]]) - # Test that encoder raises error for greater features. + # Test that encoder raises error for greater features during transform. encoder = UnaryEncoder(handle_greater='error') encoder.fit(X) assert_raises(ValueError, encoder.transform, y) - # Test the ignore option, clips greater features. + encoder = UnaryEncoder(handle_greater='error') + assert_array_equal(encoder.fit_transform(y), + np.array([[1., 1., 1., 1., 1., 1.]])) + + # Test that encoder raises error for greater features during fit when + # n_values is explicitly set. 
+ encoder = UnaryEncoder(handle_greater='error', n_values=[2, 3, 4]) + assert_raises(ValueError, encoder.fit, X) + + encoder = UnaryEncoder(handle_greater='error', n_values=[2, 3, 4]) + assert_raises(ValueError, encoder.fit_transform, X) + + encoder = UnaryEncoder(handle_greater='error', n_values=[5, 2, 2]) + encoder.fit(y) + assert_array_equal(encoder.transform(y), + np.array([[1., 1., 1., 1., 1., 1.]])) + + encoder = UnaryEncoder(handle_greater='error', n_values=[5, 2, 2]) + assert_array_equal(encoder.fit_transform(y), + np.array([[1., 1., 1., 1., 1., 1.]])) + + # Test the clip option. encoder = UnaryEncoder(handle_greater='clip') encoder.fit(X) assert_array_equal( encoder.transform(y), np.array([[1., 1., 1., 0., 1., 0., 0.]])) - # Raise error if handle_greater is neither ignore or error. + encoder = UnaryEncoder(handle_greater='clip', n_values=[3, 2, 2]) + assert_array_equal( + encoder.fit_transform(y), + np.array([[1., 1., 1., 1.]])) + + # Raise error if handle_greater is neither clip nor error. 
encoder = UnaryEncoder(handle_greater='42') - encoder.fit(X) - assert_raises(ValueError, encoder.transform, y) + assert_raises(ValueError, encoder.fit, y) def test_unary_encoder_errors(): @@ -2375,6 +2400,10 @@ def test_unary_encoder_n_values_int(): enc.feature_indices_, np.arange(0, unary_n_values * len(X[0]) + 1, unary_n_values) ) + assert_array_equal( + enc.n_values_, + np.array([encoder_n_values] * len(X[0])) + ) def test_unary_encoder_n_values_array(): @@ -2383,17 +2412,28 @@ def test_unary_encoder_n_values_array(): size = np.random.randint(1, 10) delta = np.random.randint(1, 10) - n_values_array = [n_features] * n_features - enc = UnaryEncoder(n_values=n_values_array) + # Test ideal case is working fine X = _generate_random_features_matrix(n_features, size) + n_values_array = list(np.max(X, axis=0) + 1) + enc = UnaryEncoder(n_values=n_values_array) X_trans = enc.fit_transform(X) assert_equal(X_trans.shape, (size, sum(n_values_array) - n_features)) + assert_array_equal( + enc.feature_indices_, + np.cumsum(np.array([1] + n_values_array) - 1) + ) + assert_array_equal( + enc.n_values_, + np.array(n_values_array) + ) + # Test that fit_transform raises error when len(n_values) != n_features n_values_array = np.random.randint(2, 10, n_features + delta) enc = UnaryEncoder(n_values=n_values_array) X = _generate_random_features_matrix(n_features, size) assert_raises(ValueError, enc.fit_transform, X) + # Test that fit_transform raises error when len(n_values) != n_features enc = UnaryEncoder(n_values=[]) X = _generate_random_features_matrix(n_features, size) assert_raises(ValueError, enc.fit_transform, X) From 367dba4cafca50cc36efbfd49ec3ee1e2e14573a Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Sun, 3 Dec 2017 19:22:27 -0500 Subject: [PATCH 11/14] Minor change in mask calculation. 
Removing fit_transform for UnaryEncoder and relying on one defined in TransformerMixin Update handle_greater=error error message Removing fit_transform for UnaryEncoder and relying on one defined in TransformerMixin Update handle_greater=error error message --- sklearn/preprocessing/data.py | 37 ++++++++--------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index d199ba431f0b0..28727d4302bf2 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -3019,35 +3019,14 @@ def _fit(self, X): indices = np.cumsum(n_values) self.feature_indices_ = indices - mask = (X < self.n_values_).ravel() - if np.any(~mask): + mask = (X >= self.n_values_).ravel() + if np.any(mask): if self.handle_greater == 'error': - raise ValueError("unknown ordinal feature present %s " - % X.ravel()[~mask]) + raise ValueError("handle_greater='error' but %d feature values" + " exceed n_values" % np.count_nonzero(mask)) return X - def fit_transform(self, X, y=None): - """Fit UnaryEncoder to X, then transform X. - - Equivalent to self.fit(X).transform(X), but more convenient. - - Parameters - ---------- - X : array-like, shape [n_samples, n_feature] - Input array of type int. - All feature values should be non-negative otherwise will raise a - ValueError. - - Returns - ------- - X_out : sparse matrix or a 2-d array - Transformed input. - - """ - - return self.fit(X).transform(X) - def _transform(self, X): """Assumes X contains only ordinal features.""" X = check_array(X, dtype=np.int) @@ -3065,13 +3044,13 @@ def _transform(self, X): # using mask if self.handle_greater is "clip". # This means, the row_indices and col_indices corresponding to the # greater ordinal feature are all filled with ones. 
- mask = (X < self.n_values_).ravel() - if np.any(~mask): + mask = (X >= self.n_values_).ravel() + if np.any(mask): if self.handle_greater == 'error': raise ValueError("unknown ordinal feature present %s " - "during transform." % X.ravel()[~mask]) + "during transform." % X.ravel()[mask]) - X_ceil = np.where(mask.reshape(X.shape), X, self.n_values_ - 1) + X_ceil = np.where(mask.reshape(X.shape), self.n_values_ - 1, X) column_start = np.tile(indices[:-1], n_samples) column_end = (indices[:-1] + X_ceil).ravel() column_indices = np.hstack([np.arange(s, e) for s, e From 9f3205dfcef9fe6ba56dc1fb982da9f1db66f057 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Mon, 4 Dec 2017 00:38:57 -0500 Subject: [PATCH 12/14] Adding warn as a new option for handle_greater parameter. Updating warn mode Making warn as default mode --- sklearn/preprocessing/data.py | 28 ++++++++++++++---------- sklearn/preprocessing/tests/test_data.py | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 28727d4302bf2..3296ae53bfd82 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2923,9 +2923,9 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): sparse : boolean, default=False Will return sparse matrix if set True else will return an array. - handle_greater : str, 'error' or 'clip' - Whether to raise an error or clip if an ordinal feature >= n_values is - passed in. + handle_greater : str, 'warn' or 'error' or 'clip', default='warn' + Whether to raise an error or clip or warn if an + ordinal feature >= n_values is passed in. Attributes ---------- @@ -2949,7 +2949,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): ... [1, 1, 0], ... [0, 2, 1], ... [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + UnaryEncoder(dtype=<... 
'numpy.float64'>, handle_greater='warn', n_values='auto', ordinal_features='all', sparse=False) >>> enc.n_values_ array([2, 3, 4]) @@ -2964,7 +2964,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): using a one-hot aka one-of-K scheme. """ def __init__(self, n_values="auto", ordinal_features="all", - dtype=np.float64, sparse=False, handle_greater='error'): + dtype=np.float64, sparse=False, handle_greater='warn'): self.n_values = n_values self.ordinal_features = ordinal_features self.dtype = dtype @@ -2987,9 +2987,9 @@ def fit(self, X, y=None): def _fit(self, X): """Assumes X contains only ordinal features.""" X = check_array(X, dtype=np.int) - if self.handle_greater not in ['error', 'clip']: - raise ValueError("handle_greater should be either 'error' or " - "'clip' got %s" % self.handle_greater) + if self.handle_greater not in ['warn', 'error', 'clip']: + raise ValueError("handle_greater should be either 'warn', 'error' " + "or 'clip' got %s" % self.handle_greater) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape @@ -3023,7 +3023,7 @@ def _fit(self, X): if np.any(mask): if self.handle_greater == 'error': raise ValueError("handle_greater='error' but %d feature values" - " exceed n_values" % np.count_nonzero(mask)) + " exceed n_values." % np.count_nonzero(mask)) return X @@ -3046,9 +3046,13 @@ def _transform(self, X): # greater ordinal feature are all filled with ones. mask = (X >= self.n_values_).ravel() if np.any(mask): - if self.handle_greater == 'error': - raise ValueError("unknown ordinal feature present %s " - "during transform." % X.ravel()[mask]) + if self.handle_greater == 'warn': + warnings.warn("Found feature values which " + "exceeds n_values during transform.") + elif self.handle_greater == 'error': + raise ValueError("Found feature values %s which exceeds " + "n_values during transform." 
+ % X.ravel()[mask]) X_ceil = np.where(mask.reshape(X.shape), self.n_values_ - 1, X) column_start = np.tile(indices[:-1], n_samples) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index caae825de8830..11a7948d138ae 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2345,7 +2345,7 @@ def test_unary_encoder_errors(): X[0][0] = n_values + delta X_out_of_bounds = X assert_raises(ValueError, encoder.transform, X_out_of_bounds) - error_msg = ("unknown ordinal feature present \[{}\] " + error_msg = ("Found feature values \[{}\] which exceeds n_values " "during transform.".format(n_values + delta)) assert_raises_regex(ValueError, error_msg, encoder.transform, X_out_of_bounds) From c23ec8d9290aa6cab5d70133bace50c1543e3e4c Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Mon, 4 Dec 2017 01:02:36 -0500 Subject: [PATCH 13/14] Updating test case to take care of new handle_greater='warn' as default and updating docs Updating docs --- doc/modules/preprocessing.rst | 4 ++-- sklearn/preprocessing/tests/test_data.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e5633f7c7972b..1cf46e69695bb 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -571,7 +571,7 @@ Continuing the example above:: >>> enc = preprocessing.UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', n_values='auto', ordinal_features='all', sparse=False) >>> enc.transform([[0, 1, 1]]) array([[ 0., 1., 0., 1., 0., 0.]]) @@ -592,7 +592,7 @@ categorical features, one has to explicitly set ``n_values``. 
For example,:: >>> # Note that there are missing categorical values for the 2nd and 3rd >>> # features >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', n_values=[2, 3, 4], ordinal_features='all', sparse=False) >>> enc.transform([[1, 1, 2]]) array([[ 1., 1., 0., 1., 1., 0.]]) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 11a7948d138ae..3627a1e0967e6 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2342,6 +2342,9 @@ def test_unary_encoder_errors(): assert_raises(ValueError, UnaryEncoder(n_values=1).fit_transform, X) # test that an error is raised when out of bounds + encoder = UnaryEncoder(n_values, handle_greater='error') + X = _generate_random_features_matrix(n_values, size) + encoder.fit(X) X[0][0] = n_values + delta X_out_of_bounds = X assert_raises(ValueError, encoder.transform, X_out_of_bounds) From 9d4753a7ddefdcd0da9a2d4fd0e5f85ea0e71b98 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Mon, 4 Dec 2017 20:17:56 -0500 Subject: [PATCH 14/14] Fixing concerns. 
Explaining handle_greater options Adding test cases to test warn option of handle_greater parameter Updated warning message updated feature_matrix generation function and made test deterministic making test cases clearer by using n_features n_values and n_features cleanup --- sklearn/preprocessing/data.py | 26 ++++--- sklearn/preprocessing/tests/test_data.py | 93 +++++++++++++++--------- 2 files changed, 73 insertions(+), 46 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 3296ae53bfd82..a87d389d0d916 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2923,10 +2923,15 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): sparse : boolean, default=False Will return sparse matrix if set True else will return an array. - handle_greater : str, 'warn' or 'error' or 'clip', default='warn' + handle_greater : str, 'warn' or 'error' or 'clip' Whether to raise an error or clip or warn if an ordinal feature >= n_values is passed in. + - 'warn' (default): same as clip but with warning. + - 'error': raise error if feature >= n_values is passed in. + - 'clip': all the feature values >= n_values are clipped to + (n_values-1) during transform. + Attributes ---------- feature_indices_ : array of shape (n_features,) @@ -2998,9 +3003,6 @@ def _fit(self, X): self.n_values == 'auto'): n_values = np.max(X, axis=0) + 1 elif isinstance(self.n_values, numbers.Integral): - if (np.max(X, axis=0) >= self.n_values).any(): - raise ValueError("Feature out of bounds for n_values=%d" - % self.n_values) n_values = np.empty(n_features, dtype=np.int) n_values.fill(self.n_values) else: @@ -3022,8 +3024,9 @@ def _fit(self, X): mask = (X >= self.n_values_).ravel() if np.any(mask): if self.handle_greater == 'error': - raise ValueError("handle_greater='error' but %d feature values" - " exceed n_values." % np.count_nonzero(mask)) + raise ValueError("handle_greater='error' but found %d feature" + " values which exceeds n_values." 
+ % np.count_nonzero(mask)) return X @@ -3047,12 +3050,13 @@ def _transform(self, X): mask = (X >= self.n_values_).ravel() if np.any(mask): if self.handle_greater == 'warn': - warnings.warn("Found feature values which " - "exceeds n_values during transform.") + warnings.warn("Found %d feature values which exceeds " + "n_values during transform, clipping them." + % np.count_nonzero(mask)) elif self.handle_greater == 'error': - raise ValueError("Found feature values %s which exceeds " - "n_values during transform." - % X.ravel()[mask]) + raise ValueError("handle_greater='error' but found %d feature" + " values which exceeds n_values during " + "transform." % np.count_nonzero(mask)) X_ceil = np.where(mask.reshape(X.shape), self.n_values_ - 1, X) column_start = np.tile(indices[:-1], n_samples) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 3627a1e0967e6..d94a761a911ad 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2227,9 +2227,10 @@ def test_quantile_transform_valid_axis(): ". 
Got axis=2", quantile_transform, X.T, axis=2) -def _generate_random_features_matrix(n_values=3, size=10): - rng = np.random.RandomState(0) - X = rng.randint(n_values, size=(size, n_values)) +def _generate_random_features_matrix(n_samples=10, n_features=3, + n_values_max=3): + rng = np.random.RandomState(6) + X = rng.randint(n_values_max, size=(n_samples, n_features)) return X @@ -2248,13 +2249,13 @@ def test_unary_encoder(): def test_unary_encoder_stack(): # multiple input features stack to same output - n_values = np.random.randint(2, 10) - size = np.random.randint(1, 10) + n_values = rng.randint(2, 10) + size = rng.randint(1, 10) encoder = UnaryEncoder(n_values, sparse=False) - X_multi = _generate_random_features_matrix(n_values, size) + X_multi = _generate_random_features_matrix(size, n_features, n_values) X_multi_t = encoder.fit_transform(X_multi) - assert_equal(X_multi_t.shape, (size, n_values * (n_values - 1))) + assert_equal(X_multi_t.shape, (size, n_features * (n_values - 1))) expected = np.hstack([encoder.fit_transform(X_multi[:, i:(i + 1)]) for i in range(X_multi.shape[1])]) @@ -2263,13 +2264,13 @@ def test_unary_encoder_stack(): def test_unary_encoder_dense_sparse(): # test dense output in comparison to sparse results. - n_values = np.random.randint(1, 10) - size = np.random.randint(1, 10) + n_values = rng.randint(1, 10) + size = rng.randint(1, 10) sparse_encoder = UnaryEncoder(n_values, sparse=True) dense_encoder = UnaryEncoder(n_values) - X = _generate_random_features_matrix(n_values, size) + X = _generate_random_features_matrix(size, n_features, n_values) X_trans_sparse = sparse_encoder.fit_transform(X) X_trans_dense = dense_encoder.fit_transform(X) @@ -2318,38 +2319,60 @@ def test_unary_encoder_handle_greater(): encoder.fit_transform(y), np.array([[1., 1., 1., 1.]])) + # Test the warn option. 
+ encoder = UnaryEncoder() + encoder.fit(X) + w = ('Found 1 feature values which exceeds n_values during transform, ' + 'clipping them.') + y_transformed = assert_warns_message(UserWarning, w, encoder.transform, y) + assert_array_equal( + y_transformed, + np.array([[1., 1., 1., 0., 1., 0., 0.]])) + + encoder = UnaryEncoder(n_values=[3, 2, 2]) + y_transformed = assert_warns_message(UserWarning, w, + encoder.fit_transform, y) + assert_array_equal( + y_transformed, + np.array([[1., 1., 1., 1.]])) + + encoder = UnaryEncoder(n_values=[5, 2, 2]) + assert_array_equal( + encoder.fit_transform(y), + np.array([[1., 1., 1., 1., 1., 1.]])) + # Raise error if handle_greater is neither clip nor error. encoder = UnaryEncoder(handle_greater='42') assert_raises(ValueError, encoder.fit, y) def test_unary_encoder_errors(): - n_values = np.random.randint(2, 10) - size = np.random.randint(1, 10) - delta = np.random.randint(1, 10) + n_values = rng.randint(2, 10) + size = rng.randint(1, 10) + delta = rng.randint(1, 10) encoder = UnaryEncoder(n_values) - X = _generate_random_features_matrix(n_values, size) + X = _generate_random_features_matrix(size, n_features, n_values) encoder.fit(X) # test that an error is raised when different shape - larger_n_values = n_values + delta - X_too_large = _generate_random_features_matrix(larger_n_values, size) + larger_n_features = n_features + delta + X_too_large = _generate_random_features_matrix(size, larger_n_features, + n_values) assert_raises(ValueError, encoder.transform, X_too_large) error_msg = ("X has different shape than during fitting." 
- " Expected {}, got {}.".format(n_values, larger_n_values)) + " Expected {}, got {}.".format(n_features, larger_n_features)) assert_raises_regex(ValueError, error_msg, encoder.transform, X_too_large) - assert_raises(ValueError, UnaryEncoder(n_values=1).fit_transform, X) # test that an error is raised when out of bounds encoder = UnaryEncoder(n_values, handle_greater='error') - X = _generate_random_features_matrix(n_values, size) + X = _generate_random_features_matrix(size, n_features, n_values) encoder.fit(X) X[0][0] = n_values + delta X_out_of_bounds = X assert_raises(ValueError, encoder.transform, X_out_of_bounds) - error_msg = ("Found feature values \[{}\] which exceeds n_values " - "during transform.".format(n_values + delta)) + error_msg = ("handle_greater='error' but found 1 feature values which " + "exceeds n_values during transform.") assert_raises_regex(ValueError, error_msg, encoder.transform, X_out_of_bounds) @@ -2388,35 +2411,35 @@ def test_unary_encoder_edge_cases(): def test_unary_encoder_n_values_int(): # Test UnaryEncoder's n_values parameter when set as an int. 
- n_values = np.random.randint(2, 10) - size = np.random.randint(1, 10) - delta = np.random.randint(1, 10) + n_values = rng.randint(2, 10) + size = rng.randint(1, 10) + delta = rng.randint(1, 10) encoder_n_values = n_values + delta unary_n_values = encoder_n_values - 1 enc = UnaryEncoder(n_values=encoder_n_values) - X = _generate_random_features_matrix(n_values, size) + X = _generate_random_features_matrix(size, n_features, n_values) X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (size, unary_n_values * len(X[0]))) + assert_equal(X_trans.shape, (size, unary_n_values * n_features)) assert_array_equal( enc.feature_indices_, - np.arange(0, unary_n_values * len(X[0]) + 1, unary_n_values) + np.arange(0, (unary_n_values * n_features) + 1, unary_n_values) ) assert_array_equal( enc.n_values_, - np.array([encoder_n_values] * len(X[0])) + np.array([encoder_n_values] * n_features) ) def test_unary_encoder_n_values_array(): # Test UnaryEncoder's n_values parameter when set as an array. 
- n_features = np.random.randint(2, 10) - size = np.random.randint(1, 10) - delta = np.random.randint(1, 10) + n_values = rng.randint(2, 10) + size = rng.randint(1, 10) + delta = rng.randint(1, 10) # Test ideal case is working fine - X = _generate_random_features_matrix(n_features, size) + X = _generate_random_features_matrix(size, n_features, n_values) n_values_array = list(np.max(X, axis=0) + 1) enc = UnaryEncoder(n_values=n_values_array) X_trans = enc.fit_transform(X) @@ -2431,12 +2454,12 @@ def test_unary_encoder_n_values_array(): ) # Test that fit_transform raises error when len(n_values) != n_features - n_values_array = np.random.randint(2, 10, n_features + delta) + n_values_array = rng.randint(2, 10, n_features + delta) enc = UnaryEncoder(n_values=n_values_array) - X = _generate_random_features_matrix(n_features, size) + X = _generate_random_features_matrix(size, n_features, n_values) assert_raises(ValueError, enc.fit_transform, X) # Test that fit_transform raises error when len(n_values) != n_features enc = UnaryEncoder(n_values=[]) - X = _generate_random_features_matrix(n_features, size) + X = _generate_random_features_matrix(size, n_features, n_values) assert_raises(ValueError, enc.fit_transform, X)