From 38537ed2483ae948e481c3c05613e22765e803cb Mon Sep 17 00:00:00 2001 From: Arjun-Jauhari Date: Tue, 28 Mar 2017 12:38:40 -0400 Subject: [PATCH 01/34] Rebase and commit the OrdinalEncoder implementation --- sklearn/preprocessing/data.py | 218 ++++++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index b4549e09e6291..7ecaf4d717dcc 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2873,3 +2873,221 @@ def inverse_transform(self, X): X_tr[mask, idx] = None return X_tr + + +class OrdinalEncoder(BaseEstimator, TransformerMixin): + """Encode ordinal integer features using a unary scheme. + + The input to this transformer should be a matrix of integers, denoting + the values taken on by ordinal (discrete) features. The output will be + a matrix where all the columns with index lower than feature value will + be active. It is assumed that input features take on values in the range + [0, n_values). + + This encoding is needed for feeding ordinal data to many scikit-learn + estimators, notably linear models and SVMs with the standard kernels. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + n_values : 'auto', int or array of ints + Number of values per feature. + + - 'auto' : determine value range from training data. + - int : number of ordinal values per feature. + Each feature value should be in ``range(n_values)`` + - array : ``n_values[i]`` is the number of ordinal values in + ``X[:, i]``. Each feature value should be + in ``range(n_values[i])`` + + ordinal_features : "all" or array of indices or mask + Specify what features are treated as ordinal. + + - 'all' (default): All features are treated as ordinal. + - array of indices: Array of ordinal feature indices. + - mask: Array of length n_features and with dtype=bool. + + Non-ordinal features are always stacked to the right of the matrix. 
+ + dtype : number type, default=np.float + Desired dtype of output. + + sparse : boolean, default=True + Will return sparse matrix if set True else will return an array. + + handle_unknown : str, 'error' or 'ignore' + Whether to raise an error or ignore if a unknown ordinal feature is + present during transform. + + Attributes + ---------- + feature_indices_ : array of shape (n_features,) + Indices to feature ranges. + Feature ``i`` in the original data is mapped to features + from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + + n_values_ : array of shape (n_features,) + Maximum number of values per feature. + + Examples + -------- + Given a dataset with three features and four samples, we let the encoder + find the maximum value per feature and transform the data to a binary + Ordinal encoding. + + >>> from sklearn.preprocessing import OrdinalEncoder + >>> enc = OrdinalEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ +[1, 0, 2]]) # doctest: +ELLIPSIS + OrdinalEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + n_values='auto', ordinal_features='all', sparse=True) + >>> enc.n_values_ + array([2, 3, 4]) + >>> enc.feature_indices_ + array([0, 1, 3, 6]) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 0., 1., 0., 1., 0., 0.]]) + + See also + -------- + sklearn.feature_extraction.DictVectorizer : performs a Ordinal encoding of + dictionary items (also handles string-valued features). + sklearn.feature_extraction.FeatureHasher : performs an approximate Ordinal + encoding of dictionary items or strings. + sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all + fashion. + sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of + iterables and a multilabel format, e.g. a (samples x classes) binary + matrix indicating the presence of a class label. + sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 + and n_classes-1. 
+ """ + def __init__(self, n_values="auto", ordinal_features="all", + dtype=np.float64, sparse=True, handle_unknown='error'): + self.n_values = n_values + self.ordinal_features = ordinal_features + self.dtype = dtype + self.sparse = sparse + self.handle_unknown = handle_unknown + + def fit(self, X, y=None): + """Fit OrdinalEncoder to X. + + Parameters + ---------- + X : array-like, shape [n_samples, n_feature] + Input array of type int. + """ + self.fit_transform(X) + return self + + def _fit_transform(self, X): + """Assumes X contains only ordinal features.""" + X = check_array(X, dtype=np.int) + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + if (isinstance(self.n_values, six.string_types) and + self.n_values == 'auto'): + n_values = np.max(X, axis=0) + 1 + elif isinstance(self.n_values, numbers.Integral): + if (np.max(X, axis=0) >= self.n_values).any(): + raise ValueError("Feature out of bounds for n_values=%d" + % self.n_values) + n_values = np.empty(n_features, dtype=np.int) + n_values.fill(self.n_values) + else: + try: + n_values = np.asarray(self.n_values, dtype=int) + except (ValueError, TypeError): + raise TypeError("Wrong type for parameter `n_values`. 
Expected" + " 'auto', int or array of ints, got %r" + % self.n_values) + if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if n_values is an array," + " it has to be of shape (n_features,).") + + self.n_values_ = n_values + n_values = np.hstack([[0], n_values - 1]) + indices = np.cumsum(n_values) + self.feature_indices_ = indices + + column_start = np.tile(indices[:-1], n_samples) + column_end = (X + indices[:-1]).ravel() + column_indices = np.hstack([np.arange(s, e) for s, e + in zip(column_start, column_end)]) + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + X.sum(axis=1)) + data = np.ones(X.sum()) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + + return out if self.sparse else out.toarray() + + def fit_transform(self, X, y=None): + """Fit OrdinalEncoder to X, then transform X. + + Equivalent to self.fit(X).transform(X), but more convenient and more + efficient. See fit for the parameters, transform for the return value. + """ + return _transform_selected(X, self._fit_transform, + self.ordinal_features, copy=True) + + def _transform(self, X): + """Assumes X contains only ordinal features.""" + X = check_array(X, dtype=np.int) + if np.any(X < 0): + raise ValueError("X needs to contain only non-negative integers.") + n_samples, n_features = X.shape + + indices = self.feature_indices_ + if n_features != indices.shape[0] - 1: + raise ValueError("X has different shape than during fitting." + " Expected %d, got %d." + % (indices.shape[0] - 1, n_features)) + + # We use only those ordinal features of X that are known using fit. + # i.e lesser than n_values_ using mask. + # This means, if self.handle_unknown is "ignore", the row_indices and + # col_indices corresponding to the unknown ordinal feature are + # ignored. 
+ mask = (X < self.n_values_).ravel() + if np.any(~mask): + if self.handle_unknown not in ['error', 'ignore']: + raise ValueError("handle_unknown should be either 'error' or " + "'ignore' got %s" % self.handle_unknown) + if self.handle_unknown == 'error': + raise ValueError("unknown ordinal feature present %s " + "during transform." % X.ravel()[~mask]) + + column_start = np.tile(indices[:-1], n_samples)[mask] + column_end = (X + indices[:-1]).ravel()[mask] + column_indices = np.hstack([np.arange(s, e) for s, e + in zip(column_start, column_end)]) + row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), + np.where(mask.reshape(X.shape), X, + 0).sum(axis=1)) + data = np.ones(X.ravel()[mask].sum()) + out = sparse.coo_matrix((data, (row_indices, column_indices)), + shape=(n_samples, indices[-1]), + dtype=self.dtype).tocsr() + + return out if self.sparse else out.toarray() + + def transform(self, X): + """Transform X using Ordinal encoding. + + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + Input array of type int. + + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array, dtype=int + Transformed input. 
+ """ + return _transform_selected(X, self._transform, + self.ordinal_features, copy=True) From a85008280434461ca8eb8b00fff9c03aff1e6e88 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Sat, 24 Jun 2017 16:25:39 -0400 Subject: [PATCH 02/34] Updating name to UnaryEncoder and adding single quote in error string --- sklearn/preprocessing/__init__.py | 2 ++ sklearn/preprocessing/data.py | 19 ++++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 0f5054e57f608..3a24e7aec8234 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -23,6 +23,7 @@ from .data import quantile_transform from .data import OneHotEncoder from .data import CategoricalEncoder +from .data import UnaryEncoder from .data import PolynomialFeatures @@ -60,4 +61,5 @@ 'minmax_scale', 'label_binarize', 'quantile_transform', + 'UnaryEncoder' ] diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 7ecaf4d717dcc..8c24b5f3d97a5 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -59,6 +59,7 @@ 'maxabs_scale', 'minmax_scale', 'quantile_transform', + 'UnaryEncoder' ] @@ -1999,7 +2000,7 @@ def _fit_transform(self, X): except (ValueError, TypeError): raise TypeError("Wrong type for parameter `n_values`. 
Expected" " 'auto', int or array of ints, got %r" - % type(X)) + % self.n_values) if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: raise ValueError("Shape mismatch: if n_values is an array," " it has to be of shape (n_features,).") @@ -2061,8 +2062,8 @@ def _transform(self, X): mask = (X < self.n_values_).ravel() if np.any(~mask): if self.handle_unknown not in ['error', 'ignore']: - raise ValueError("handle_unknown should be either error or " - "unknown got %s" % self.handle_unknown) + raise ValueError("handle_unknown should be either 'error' or " + "'ignore' got %s" % self.handle_unknown) if self.handle_unknown == 'error': raise ValueError("unknown categorical feature present %s " "during transform." % X.ravel()[~mask]) @@ -2875,7 +2876,7 @@ def inverse_transform(self, X): return X_tr -class OrdinalEncoder(BaseEstimator, TransformerMixin): +class UnaryEncoder(BaseEstimator, TransformerMixin): """Encode ordinal integer features using a unary scheme. The input to this transformer should be a matrix of integers, denoting @@ -2936,11 +2937,11 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): find the maximum value per feature and transform the data to a binary Ordinal encoding. - >>> from sklearn.preprocessing import OrdinalEncoder - >>> enc = OrdinalEncoder() + >>> from sklearn.preprocessing import UnaryEncoder + >>> enc = UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ [1, 0, 2]]) # doctest: +ELLIPSIS - OrdinalEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', n_values='auto', ordinal_features='all', sparse=True) >>> enc.n_values_ array([2, 3, 4]) @@ -2972,7 +2973,7 @@ def __init__(self, n_values="auto", ordinal_features="all", self.handle_unknown = handle_unknown def fit(self, X, y=None): - """Fit OrdinalEncoder to X. + """Fit UnaryEncoder to X. 
Parameters ---------- @@ -3027,7 +3028,7 @@ def _fit_transform(self, X): return out if self.sparse else out.toarray() def fit_transform(self, X, y=None): - """Fit OrdinalEncoder to X, then transform X. + """Fit UnaryEncoder to X, then transform X. Equivalent to self.fit(X).transform(X), but more convenient and more efficient. See fit for the parameters, transform for the return value. From 661951c3ede340d3dd0517e4a7cbbfc8bd6a9a79 Mon Sep 17 00:00:00 2001 From: Ruxandra Burtica Date: Sun, 25 Jun 2017 14:23:24 +0300 Subject: [PATCH 03/34] Merged changes from #9216 --- doc/modules/classes.rst | 1 + doc/modules/preprocessing.rst | 47 +++++++ sklearn/preprocessing/data.py | 28 ++++- sklearn/preprocessing/tests/test_data.py | 151 +++++++++++++++++++++++ sklearn/utils/testing.py | 2 +- 5 files changed, 222 insertions(+), 7 deletions(-) diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index f1a2e973d187f..1a61edb3a2845 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1199,6 +1199,7 @@ Model validation preprocessing.Normalizer preprocessing.OneHotEncoder preprocessing.CategoricalEncoder + preprocessing.UnaryEncoder preprocessing.PolynomialFeatures preprocessing.QuantileTransformer preprocessing.RobustScaler diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 969a2921b4061..549555e141f40 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -537,6 +537,53 @@ columns for this feature will be all zeros See :ref:`dict_feature_extraction` for categorical features that are represented as a dict, not as scalars. +.. _preprocessing_ordinal_features: + +Encoding ordinal features +============================= +Often categorical features have a clear ordering. For example a person could have features +``["short", "tall"]``, +``["low income", "medium income", "high income"]``, +``["elementary school graduate", "high school graduate", "some college", "college graduate"]``. 
+Even though these features can be ordered, we shouldn't necessarily assign scores to them, +as the difference between categories one and two is not the same as the difference +between categories two and three. + +One possibility to convert these ordinal features to features that can be used +with scikit-learn estimators is to use a unary encoding, which is +implemented in :class:`UnaryEncoder`. This estimator transforms each +ordinal feature with ``m`` possible values into ``m - 1`` binary features, where the ith +feature is active if x > i (for i = 0, ... k - 1). + +Continuing the example above:: + + >>> enc = preprocessing.UnaryEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + n_values='auto', ordinal_features='all', sparse=True) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 0., 1., 0., 1., 0., 0.]]) + +By default, how many values each feature can take is inferred automatically from the dataset. +It is possible to specify this explicitly using the parameter ``n_values``. +There are two genders, three possible continents and four web browsers in our +dataset. +Then we fit the estimator, and transform a data point. +In the result, the first number encodes the height, the next two numbers the income level, +and the next set of three numbers the education level. + +Note that, if there is a possibilty that the training data might have missing categorical +features, one has to explicitly set ``n_values``. For example, + + >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4]) + >>> # Note that there are missing categorical values for the 2nd and 3rd + >>> # features + >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + n_values=[2, 3, 4], ordinal_features='all', sparse=True) + >>> enc.transform([[1, 1, 2]]).toarray() + array([[ 1., 1., 0., 1., 1., 0.]]) + .. 
_imputation: Imputation of missing values diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 8c24b5f3d97a5..64db56c1fbe68 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1955,6 +1955,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): matrix indicating the presence of a class label. sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 and n_classes-1. + sklearn.preprocessing.UnaryEncoder: encodes ordinal integer features + using a unary scheme. """ def __init__(self, n_values="auto", categorical_features="all", dtype=np.float64, sparse=True, handle_unknown='error'): @@ -2881,14 +2883,14 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): The input to this transformer should be a matrix of integers, denoting the values taken on by ordinal (discrete) features. The output will be - a matrix where all the columns with index lower than feature value will - be active. It is assumed that input features take on values in the range + a sparse matrix where each column corresponds to one possible value of one + feature. It is assumed that input features take on values in the range [0, n_values). - This encoding is needed for feeding ordinal data to many scikit-learn + This encoding is needed for feeding ordinal features to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. - Read more in the :ref:`User Guide `. + Read more in the :ref:`User Guide `. Parameters ---------- @@ -2923,10 +2925,16 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Attributes ---------- + active_features_ : array + Indices for active features, meaning values that actually occur + in the training set. All featurs are available when n_values is + ``'auto'``. + feature_indices_ : array of shape (n_features,) Indices to feature ranges. 
Feature ``i`` in the original data is mapped to features from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` + (and then potentially masked by `active_features_` afterwards) n_values_ : array of shape (n_features,) Maximum number of values per feature. @@ -2935,18 +2943,20 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): -------- Given a dataset with three features and four samples, we let the encoder find the maximum value per feature and transform the data to a binary - Ordinal encoding. + unary encoding. >>> from sklearn.preprocessing import UnaryEncoder >>> enc = UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ [1, 0, 2]]) # doctest: +ELLIPSIS UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values='auto', ordinal_features='all', sparse=True) + n_values='auto', ordinal_features='all', sparse=True) >>> enc.n_values_ array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) + >>> enc.active_features_ + array([0, 1, 2, 3, 4, 5]) >>> enc.transform([[0, 1, 1]]).toarray() array([[ 0., 1., 0., 1., 0., 0.]]) @@ -2956,6 +2966,8 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): dictionary items (also handles string-valued features). sklearn.feature_extraction.FeatureHasher : performs an approximate Ordinal encoding of dictionary items or strings. + sklearn.preprocessing.OneHotEncoder: encodes categorical integer features + using a one-hot aka one-of-K scheme. sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all fashion. 
sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of @@ -3025,6 +3037,10 @@ def _fit_transform(self, X): shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() + if (isinstance(self.n_values, six.string_types) and + self.n_values == 'auto'): + self.active_features_ = np.arange(out.shape[1]) + return out if self.sparse else out.toarray() def fit_transform(self, X, y=None): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index e715ceacfac25..34246081094fc 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -42,6 +42,7 @@ from sklearn.preprocessing.data import normalize from sklearn.preprocessing.data import OneHotEncoder from sklearn.preprocessing.data import CategoricalEncoder +from sklearn.preprocessing.data import UnaryEncoder from sklearn.preprocessing.data import StandardScaler from sklearn.preprocessing.data import scale from sklearn.preprocessing.data import MinMaxScaler @@ -2224,3 +2225,153 @@ def test_quantile_transform_valid_axis(): assert_raises_regex(ValueError, "axis should be either equal to 0 or 1" ". 
Got axis=2", quantile_transform, X.T, axis=2) + + +def _generate_random_features_matrix(n_values=3, size=10): + rng = np.random.RandomState(0) + X = rng.randint(n_values, size=(size, n_values)) + return X + + +def test_unary_encoder(): + X = np.arange(5).reshape(-1, 1) + enc = UnaryEncoder(5) + Xt = enc.fit_transform(X).toarray() + assert_array_equal(Xt, [[0, 0, 0, 0], # 0 + [1, 0, 0, 0], # 1 + [1, 1, 0, 0], # 2 + [1, 1, 1, 0], # 3 + [1, 1, 1, 1]]) # 4 + Xt2 = enc.transform(X).toarray() + assert_array_equal(Xt2, Xt) + + +def test_unary_encoder_stack(): + # multiple input features stack to same output + n_values = np.random.randint(2, 10) + size = np.random.randint(1, 10) + + encoder = UnaryEncoder(n_values, sparse=False) + X_multi = _generate_random_features_matrix(n_values, size) + X_multi_t = encoder.fit_transform(X_multi) + assert_equal(X_multi_t.shape, (size, n_values * (n_values - 1))) + + expected = np.hstack([encoder.fit_transform(X_multi[:, i:(i + 1)]) + for i in range(X_multi.shape[1])]) + assert_array_equal(expected, X_multi_t) + + +def test_unary_encoder_dense_sparse(): + # test dense output in comparison to sparse results. + n_values = np.random.randint(1, 10) + size = np.random.randint(1, 10) + + sparse_encoder = UnaryEncoder(n_values) + dense_encoder = UnaryEncoder(n_values, sparse=False) + + X = _generate_random_features_matrix(n_values, size) + X_trans_sparse = sparse_encoder.fit_transform(X) + X_trans_dense = dense_encoder.fit_transform(X) + + assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) + + +def test_unary_encoder_handle_unknown(): + X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) + y = np.array([[4, 1, 1]]) + + # Test that encoder raises error for unknown features. + encoder = UnaryEncoder(handle_unknown='error') + encoder.fit(X) + assert_raises(ValueError, encoder.transform, y) + + # Test the ignore option, ignores unknown features. 
+ encoder = UnaryEncoder(handle_unknown='ignore') + encoder.fit(X) + assert_array_equal( + encoder.transform(y).toarray(), + np.array([[0., 1., 0., 1., 0., 0.]])) + + # Raise error if handle_unknown is neither ignore or error. + encoder = UnaryEncoder(handle_unknown='42') + encoder.fit(X) + assert_raises(ValueError, encoder.transform, y) + + +def test_unary_encoder_errors(): + n_values = np.random.randint(2, 10) + size = np.random.randint(1, 10) + delta = np.random.randint(1, 10) + + encoder = UnaryEncoder(n_values) + X = _generate_random_features_matrix(n_values, size) + encoder.fit(X) + + # test that an error is raised when different shape + larger_n_values = n_values + delta + X_too_large = _generate_random_features_matrix(larger_n_values, size) + assert_raises(ValueError, encoder.transform, X_too_large) + error_msg = ("X has different shape than during fitting." + " Expected {}, got {}.".format(n_values, larger_n_values)) + assert_raises_regex(ValueError, error_msg, encoder.transform, X_too_large) + assert_raises(ValueError, UnaryEncoder(n_values=1).fit_transform, X) + + # test that an error is raised when out of bounds + X[0][0] = n_values + delta + X_out_of_bounds = X + assert_raises(ValueError, encoder.transform, X_out_of_bounds) + error_msg = ("unknown ordinal feature present \[{}\] " + "during transform.".format(n_values + delta)) + assert_raises_regex(ValueError, error_msg, encoder.transform, + X_out_of_bounds) + + # test exception on wrong init param + assert_raises(TypeError, UnaryEncoder(n_values=np.int).fit, X) + + # test negative input to fit + encoder = UnaryEncoder() + assert_raises(ValueError, encoder.fit, [[0], [-1]]) + + # test negative input to transform + encoder.fit([[0], [1]]) + assert_raises(ValueError, encoder.transform, [[0], [-1]]) + + +def test_unary_encoder_edge_cases(): + EDGE_CASES = [ + ( + [[0], [1], [2]], + np.array([[0, 0], [1, 0], [1, 1]]), + ), + ( + [[0], [0], [1]], + np.array([[0], [0], [1]]), + ), + ( + [[0, 0], [0, 0], [0, 
1]], + np.array([[0], [0], [1]]), + ), + ] + + for input_matrix, expected_matrix in EDGE_CASES: + transformed = UnaryEncoder().fit_transform(input_matrix) + assert_array_equal(transformed.toarray(), expected_matrix) + + +def test_unary_encoder_n_values_int(): + # Test UnaryEncoder's fit and transform. + n_values = np.random.randint(2, 10) + size = np.random.randint(1, 10) + delta = np.random.randint(1, 10) + + encoder_n_values = n_values + delta + unary_n_values = encoder_n_values - 1 + enc = UnaryEncoder(n_values=encoder_n_values) + + X = _generate_random_features_matrix(n_values, size) + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (size, unary_n_values * len(X[0]))) + assert_array_equal( + enc.feature_indices_, + np.arange(0, unary_n_values * len(X[0]) + 1, unary_n_values) + ) diff --git a/sklearn/utils/testing.py b/sklearn/utils/testing.py index 90994b71b782b..3026451becd3d 100644 --- a/sklearn/utils/testing.py +++ b/sklearn/utils/testing.py @@ -523,7 +523,7 @@ def uninstall_mldata_mock(): 'TfidfVectorizer', 'IsotonicRegression', 'OneHotEncoder', 'RandomTreesEmbedding', 'CategoricalEncoder', 'FeatureHasher', 'DummyClassifier', 'DummyRegressor', - 'TruncatedSVD', 'PolynomialFeatures', + 'TruncatedSVD', 'PolynomialFeatures', 'UnaryEncoder', 'GaussianRandomProjectionHash', 'HashingVectorizer', 'CheckingClassifier', 'PatchExtractor', 'CountVectorizer', # GradientBoosting base estimators, maybe should From eb8bc9459bcf415f690be51851543f1e12bc29d7 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Tue, 28 Nov 2017 19:37:14 -0500 Subject: [PATCH 04/34] Removing active_features_ attribute from UnaryEncoder as it is not needed --- sklearn/preprocessing/data.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 64db56c1fbe68..bf21f23ad8e81 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2925,16 +2925,10 @@ class UnaryEncoder(BaseEstimator, 
TransformerMixin): Attributes ---------- - active_features_ : array - Indices for active features, meaning values that actually occur - in the training set. All featurs are available when n_values is - ``'auto'``. - feature_indices_ : array of shape (n_features,) Indices to feature ranges. Feature ``i`` in the original data is mapped to features from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - (and then potentially masked by `active_features_` afterwards) n_values_ : array of shape (n_features,) Maximum number of values per feature. @@ -2955,8 +2949,6 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) - >>> enc.active_features_ - array([0, 1, 2, 3, 4, 5]) >>> enc.transform([[0, 1, 1]]).toarray() array([[ 0., 1., 0., 1., 0., 0.]]) @@ -3037,10 +3029,6 @@ def _fit_transform(self, X): shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() - if (isinstance(self.n_values, six.string_types) and - self.n_values == 'auto'): - self.active_features_ = np.arange(out.shape[1]) - return out if self.sparse else out.toarray() def fit_transform(self, X, y=None): From 88d5eb4a596a5eeec594aff6a52c3bfcf4cd197d Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Wed, 29 Nov 2017 08:37:28 -0500 Subject: [PATCH 05/34] Limiting the lines in documentation to less that 80 chars --- doc/modules/preprocessing.rst | 69 ++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 549555e141f40..4e260c33aabb8 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -541,48 +541,51 @@ as a dict, not as scalars. Encoding ordinal features ============================= -Often categorical features have a clear ordering. For example a person could have features +Often categorical features have a clear ordering. 
For example a person could +have features ``["short", "tall"]``, ``["low income", "medium income", "high income"]``, -``["elementary school graduate", "high school graduate", "some college", "college graduate"]``. -Even though these features can be ordered, we shouldn't necessarily assign scores to them, -as the difference between categories one and two is not the same as the difference -between categories two and three. +``["elementary school graduate", "high school graduate", "some college", + "college graduate"]``. +Even though these features can be ordered, we shouldn't necessarily assign +scores to them, as the difference between categories one and two is not the +same as the difference between categories two and three. One possibility to convert these ordinal features to features that can be used with scikit-learn estimators is to use a unary encoding, which is implemented in :class:`UnaryEncoder`. This estimator transforms each -ordinal feature with ``m`` possible values into ``m - 1`` binary features, where the ith -feature is active if x > i (for i = 0, ... k - 1). +ordinal feature with ``m`` possible values into ``m - 1`` binary features, +where the ith feature is active if x > i (for i = 0, ..., m - 2). Continuing the example above:: - >>> enc = preprocessing.UnaryEncoder() - >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values='auto', ordinal_features='all', sparse=True) - >>> enc.transform([[0, 1, 1]]).toarray() - array([[ 0., 1., 0., 1., 0., 0.]]) - -By default, how many values each feature can take is inferred automatically from the dataset. -It is possible to specify this explicitly using the parameter ``n_values``. -There are two genders, three possible continents and four web browsers in our -dataset. -Then we fit the estimator, and transform a data point.
-In the result, the first number encodes the height, the next two numbers the income level, -and the next set of three numbers the education level. - -Note that, if there is a possibilty that the training data might have missing categorical -features, one has to explicitly set ``n_values``. For example, - - >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4]) - >>> # Note that there are missing categorical values for the 2nd and 3rd - >>> # features - >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values=[2, 3, 4], ordinal_features='all', sparse=True) - >>> enc.transform([[1, 1, 2]]).toarray() - array([[ 1., 1., 0., 1., 1., 0.]]) + >>> enc = preprocessing.UnaryEncoder() + >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + n_values='auto', ordinal_features='all', sparse=True) + >>> enc.transform([[0, 1, 1]]).toarray() + array([[ 0., 1., 0., 1., 0., 0.]]) + +By default, how many values each feature can take is inferred automatically +from the dataset. It is possible to specify this explicitly using the parameter +``n_values``. +* There are two height categories, three income levels and four education + levels in our dataset. +* Then we fit the estimator, and transform a data point. +* In the result, the first number encodes the height, the next two numbers the + income level, and the next set of three numbers the education level. + +Note that, if there is a possibility that the training data might have missing +categorical features, one has to explicitly set ``n_values``. For example:: + + >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4]) + >>> # Note that there are missing categorical values for the 2nd and 3rd + >>> # features + >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<...
'numpy.float64'>, handle_unknown='error', + n_values=[2, 3, 4], ordinal_features='all', sparse=True) + >>> enc.transform([[1, 1, 2]]).toarray() + array([[ 1., 1., 0., 1., 1., 0.]]) .. _imputation: From cd21cbfdd2b1d43e1bb73e1f387206fcfe393368 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Wed, 29 Nov 2017 18:38:44 -0500 Subject: [PATCH 06/34] Updated documentation. Changed the default value of sparse parameter to False --- doc/modules/preprocessing.rst | 10 ++++++---- sklearn/preprocessing/data.py | 33 ++++++++++++++------------------- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4e260c33aabb8..cc461afd5c0b9 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -543,10 +543,12 @@ Encoding ordinal features ============================= Often categorical features have a clear ordering. For example a person could have features -``["short", "tall"]``, -``["low income", "medium income", "high income"]``, -``["elementary school graduate", "high school graduate", "some college", - "college graduate"]``. + +* ``["short", "tall"]`` +* ``["low income", "medium income", "high income"]`` +* ``["elementary school graduate", "high school graduate", "some college", + "college graduate"]`` + Even though these features can be ordered, we shouldn't necessarily assign scores to them, as the difference between categories one and two is not the same as the difference between categories two and three. diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bf21f23ad8e81..93a4910b75add 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2885,7 +2885,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): the values taken on by ordinal (discrete) features. The output will be a sparse matrix where each column corresponds to one possible value of one feature. 
It is assumed that input features take on values in the range - [0, n_values). + 0 to (n_values - 1). This encoding is needed for feeding ordinal features to many scikit-learn estimators, notably linear models and SVMs with the standard kernels. @@ -2916,7 +2916,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): dtype : number type, default=np.float Desired dtype of output. - sparse : boolean, default=True + sparse : boolean, default=False Will return sparse matrix if set True else will return an array. handle_unknown : str, 'error' or 'ignore' @@ -2941,35 +2941,26 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import UnaryEncoder >>> enc = UnaryEncoder() - >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], \ -[1, 0, 2]]) # doctest: +ELLIPSIS + >>> enc.fit([[0, 0, 3], + [1, 1, 0], + [0, 2, 1], + [1, 0, 2]]) # doctest: +ELLIPSIS UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values='auto', ordinal_features='all', sparse=True) + n_values='auto', ordinal_features='all', sparse=False) >>> enc.n_values_ array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) - >>> enc.transform([[0, 1, 1]]).toarray() - array([[ 0., 1., 0., 1., 0., 0.]]) + >>> enc.transform([[0, 1, 2]]).toarray() + array([[ 0., 1., 0., 1., 1., 0.]]) See also -------- - sklearn.feature_extraction.DictVectorizer : performs a Ordinal encoding of - dictionary items (also handles string-valued features). - sklearn.feature_extraction.FeatureHasher : performs an approximate Ordinal - encoding of dictionary items or strings. sklearn.preprocessing.OneHotEncoder: encodes categorical integer features using a one-hot aka one-of-K scheme. - sklearn.preprocessing.LabelBinarizer : binarizes labels in a one-vs-all - fashion. - sklearn.preprocessing.MultiLabelBinarizer : transforms between iterable of - iterables and a multilabel format, e.g. a (samples x classes) binary - matrix indicating the presence of a class label. 
- sklearn.preprocessing.LabelEncoder : encodes labels with values between 0 - and n_classes-1. """ def __init__(self, n_values="auto", ordinal_features="all", - dtype=np.float64, sparse=True, handle_unknown='error'): + dtype=np.float64, sparse=False, handle_unknown='error'): self.n_values = n_values self.ordinal_features = ordinal_features self.dtype = dtype @@ -2983,6 +2974,8 @@ def fit(self, X, y=None): ---------- X : array-like, shape [n_samples, n_feature] Input array of type int. + All feature values should be non-negative otherwise will raise a + ValueError. """ self.fit_transform(X) return self @@ -3088,6 +3081,8 @@ def transform(self, X): ---------- X : array-like, shape [n_samples, n_features] Input array of type int. + All feature values should be non-negative otherwise will raise a + ValueError. Returns ------- From 81af018d763720470ad650de2c73dee33da5338e Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Wed, 29 Nov 2017 19:34:54 -0500 Subject: [PATCH 07/34] Updated test cases to accomodate change in default value of sparse parameter. 
Also added a new test case test_unary_encoder_n_values_array --- sklearn/preprocessing/tests/test_data.py | 36 +++++++++++++++++++----- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 34246081094fc..d7d1b8b703000 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2236,13 +2236,13 @@ def _generate_random_features_matrix(n_values=3, size=10): def test_unary_encoder(): X = np.arange(5).reshape(-1, 1) enc = UnaryEncoder(5) - Xt = enc.fit_transform(X).toarray() + Xt = enc.fit_transform(X) assert_array_equal(Xt, [[0, 0, 0, 0], # 0 [1, 0, 0, 0], # 1 [1, 1, 0, 0], # 2 [1, 1, 1, 0], # 3 [1, 1, 1, 1]]) # 4 - Xt2 = enc.transform(X).toarray() + Xt2 = enc.transform(X) assert_array_equal(Xt2, Xt) @@ -2266,8 +2266,8 @@ def test_unary_encoder_dense_sparse(): n_values = np.random.randint(1, 10) size = np.random.randint(1, 10) - sparse_encoder = UnaryEncoder(n_values) - dense_encoder = UnaryEncoder(n_values, sparse=False) + sparse_encoder = UnaryEncoder(n_values, sparse=True) + dense_encoder = UnaryEncoder(n_values) X = _generate_random_features_matrix(n_values, size) X_trans_sparse = sparse_encoder.fit_transform(X) @@ -2289,7 +2289,7 @@ def test_unary_encoder_handle_unknown(): encoder = UnaryEncoder(handle_unknown='ignore') encoder.fit(X) assert_array_equal( - encoder.transform(y).toarray(), + encoder.transform(y), np.array([[0., 1., 0., 1., 0., 0.]])) # Raise error if handle_unknown is neither ignore or error. @@ -2355,11 +2355,11 @@ def test_unary_encoder_edge_cases(): for input_matrix, expected_matrix in EDGE_CASES: transformed = UnaryEncoder().fit_transform(input_matrix) - assert_array_equal(transformed.toarray(), expected_matrix) + assert_array_equal(transformed, expected_matrix) def test_unary_encoder_n_values_int(): - # Test UnaryEncoder's fit and transform. + # Test UnaryEncoder's n_values parameter when set as an int. 
n_values = np.random.randint(2, 10) size = np.random.randint(1, 10) delta = np.random.randint(1, 10) @@ -2375,3 +2375,25 @@ def test_unary_encoder_n_values_int(): enc.feature_indices_, np.arange(0, unary_n_values * len(X[0]) + 1, unary_n_values) ) + + +def test_unary_encoder_n_values_array(): + # Test UnaryEncoder's n_values parameter when set as an array. + n_features = np.random.randint(2, 10) + size = np.random.randint(1, 10) + delta = np.random.randint(1, 10) + + n_values_array = [n_features] * n_features + enc = UnaryEncoder(n_values=n_values_array) + X = _generate_random_features_matrix(n_features, size) + X_trans = enc.fit_transform(X) + assert_equal(X_trans.shape, (size, sum(n_values_array) - n_features)) + + n_values_array = np.random.randint(2, 10, n_features + delta) + enc = UnaryEncoder(n_values=n_values_array) + X = _generate_random_features_matrix(n_features, size) + assert_raises(ValueError, enc.fit_transform, X) + + enc = UnaryEncoder(n_values=[]) + X = _generate_random_features_matrix(n_features, size) + assert_raises(ValueError, enc.fit_transform, X) From f4ba31003a8a47c0e832ff06c1611c2ca07261d4 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Thu, 30 Nov 2017 20:50:49 -0500 Subject: [PATCH 08/34] Commit to accomodate all the requested changes 1. Fixed test failures 2. Updated docs 3. UnaryEncoder: Changed handle_unknown to handle_greater Updated docs UnaryEncoder: Changed handle_unknown to handle_greater --- doc/modules/preprocessing.rst | 6 ++ sklearn/preprocessing/data.py | 80 +++++++++++++++--------- sklearn/preprocessing/tests/test_data.py | 18 +++--- 3 files changed, 64 insertions(+), 40 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index cc461afd5c0b9..9e925b6a562ac 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -559,6 +559,12 @@ implemented in :class:`UnaryEncoder`. 
This estimator transforms each ordinal feature with ``m`` possible values into ``m - 1`` binary features, where the ith feature is active if x > i (for i = 0, ... k - 1). +**NOTE**: This encoding is likely to help when used with linear models and +kernel-based models like SVMs with the standard kernels. On the other hand, this +transformation is unlikely to help when using with tree-based models, +since those already work on the basis of a particular feature value being +< or > than a threshold, unlike linear and kernel-based models. + Continuing the example above:: >>> enc = preprocessing.UnaryEncoder() diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 93a4910b75add..d24d649d4a847 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2002,7 +2002,7 @@ def _fit_transform(self, X): except (ValueError, TypeError): raise TypeError("Wrong type for parameter `n_values`. Expected" " 'auto', int or array of ints, got %r" - % self.n_values) + % type(X)) if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: raise ValueError("Shape mismatch: if n_values is an array," " it has to be of shape (n_features,).") @@ -2881,14 +2881,18 @@ def inverse_transform(self, X): class UnaryEncoder(BaseEstimator, TransformerMixin): """Encode ordinal integer features using a unary scheme. - The input to this transformer should be a matrix of integers, denoting - the values taken on by ordinal (discrete) features. The output will be - a sparse matrix where each column corresponds to one possible value of one - feature. It is assumed that input features take on values in the range + The input to this transformer should be a matrix of non-negative integers, + denoting the values taken on by ordinal (discrete) features. The output + will be a matrix where each column corresponds to one possible value of + one feature. It is assumed that input features take on values in the range 0 to (n_values - 1). 
This encoding is needed for feeding ordinal features to many scikit-learn - estimators, notably linear models and SVMs with the standard kernels. + estimators, notably linear models and kernel-based models like SVMs with + the standard kernels. + This transformation is unlikely to help when using with tree-based models, + since those already work on the basis of a particular feature value being + < or > than a threshold, unlike linear and kernel-based models. Read more in the :ref:`User Guide `. @@ -2919,9 +2923,10 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): sparse : boolean, default=False Will return sparse matrix if set True else will return an array. - handle_unknown : str, 'error' or 'ignore' - Whether to raise an error or ignore if a unknown ordinal feature is - present during transform. + handle_greater : str, 'error' or 'clip' + Whether to raise an error or clip if a greater ordinal feature value is + present during transform as compare to largest feature value seen + during fit. Attributes ---------- @@ -2942,16 +2947,16 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): >>> from sklearn.preprocessing import UnaryEncoder >>> enc = UnaryEncoder() >>> enc.fit([[0, 0, 3], - [1, 1, 0], - [0, 2, 1], - [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', + ... [1, 1, 0], + ... [0, 2, 1], + ... [1, 0, 2]]) # doctest: +ELLIPSIS + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', n_values='auto', ordinal_features='all', sparse=False) >>> enc.n_values_ array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) - >>> enc.transform([[0, 1, 2]]).toarray() + >>> enc.transform([[0, 1, 2]]) array([[ 0., 1., 0., 1., 1., 0.]]) See also @@ -2960,12 +2965,12 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): using a one-hot aka one-of-K scheme. 
""" def __init__(self, n_values="auto", ordinal_features="all", - dtype=np.float64, sparse=False, handle_unknown='error'): + dtype=np.float64, sparse=False, handle_greater='error'): self.n_values = n_values self.ordinal_features = ordinal_features self.dtype = dtype self.sparse = sparse - self.handle_unknown = handle_unknown + self.handle_greater = handle_greater def fit(self, X, y=None): """Fit UnaryEncoder to X. @@ -3028,7 +3033,20 @@ def fit_transform(self, X, y=None): """Fit UnaryEncoder to X, then transform X. Equivalent to self.fit(X).transform(X), but more convenient and more - efficient. See fit for the parameters, transform for the return value. + efficient. + + Parameters + ---------- + X : array-like, shape [n_samples, n_feature] + Input array of type int. + All feature values should be non-negative otherwise will raise a + ValueError. + + Returns + ------- + X_out : sparse matrix or a 2-d array + Transformed input. + """ return _transform_selected(X, self._fit_transform, self.ordinal_features, copy=True) @@ -3046,28 +3064,28 @@ def _transform(self, X): " Expected %d, got %d." % (indices.shape[0] - 1, n_features)) - # We use only those ordinal features of X that are known using fit. - # i.e lesser than n_values_ using mask. - # This means, if self.handle_unknown is "ignore", the row_indices and - # col_indices corresponding to the unknown ordinal feature are - # ignored. + # We clip those ordinal features of X that are greater than n_values_ + # using mask. + # This means, if self.handle_greater is "ignore", the row_indices and + # col_indices corresponding to the greater ordinal feature are all + # filled with ones. 
mask = (X < self.n_values_).ravel() if np.any(~mask): - if self.handle_unknown not in ['error', 'ignore']: - raise ValueError("handle_unknown should be either 'error' or " - "'ignore' got %s" % self.handle_unknown) - if self.handle_unknown == 'error': + if self.handle_greater not in ['error', 'clip']: + raise ValueError("handle_greater should be either 'error' or " + "'clip' got %s" % self.handle_greater) + if self.handle_greater == 'error': raise ValueError("unknown ordinal feature present %s " "during transform." % X.ravel()[~mask]) - column_start = np.tile(indices[:-1], n_samples)[mask] - column_end = (X + indices[:-1]).ravel()[mask] + X_ceil = np.where(mask.reshape(X.shape), X, self.n_values_ - 1) + column_start = np.tile(indices[:-1], n_samples) + column_end = (indices[:-1] + X_ceil).ravel() column_indices = np.hstack([np.arange(s, e) for s, e in zip(column_start, column_end)]) row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - np.where(mask.reshape(X.shape), X, - 0).sum(axis=1)) - data = np.ones(X.ravel()[mask].sum()) + X_ceil.sum(axis=1)) + data = np.ones(X_ceil.ravel().sum()) out = sparse.coo_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), dtype=self.dtype).tocsr() diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index d7d1b8b703000..b4310e090f478 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2276,24 +2276,24 @@ def test_unary_encoder_dense_sparse(): assert_array_equal(X_trans_sparse.toarray(), X_trans_dense) -def test_unary_encoder_handle_unknown(): - X = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]]) +def test_unary_encoder_handle_greater(): + X = np.array([[0, 2, 1], [1, 0, 3], [2, 0, 2]]) y = np.array([[4, 1, 1]]) - # Test that encoder raises error for unknown features. - encoder = UnaryEncoder(handle_unknown='error') + # Test that encoder raises error for greater features. 
+ encoder = UnaryEncoder(handle_greater='error') encoder.fit(X) assert_raises(ValueError, encoder.transform, y) - # Test the ignore option, ignores unknown features. - encoder = UnaryEncoder(handle_unknown='ignore') + # Test the ignore option, clips greater features. + encoder = UnaryEncoder(handle_greater='clip') encoder.fit(X) assert_array_equal( encoder.transform(y), - np.array([[0., 1., 0., 1., 0., 0.]])) + np.array([[1., 1., 1., 0., 1., 0., 0.]])) - # Raise error if handle_unknown is neither ignore or error. - encoder = UnaryEncoder(handle_unknown='42') + # Raise error if handle_greater is neither ignore or error. + encoder = UnaryEncoder(handle_greater='42') encoder.fit(X) assert_raises(ValueError, encoder.transform, y) From 0706c29bb215d59c4b41aa7afb0c02d40654c1e4 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Fri, 1 Dec 2017 01:23:26 -0500 Subject: [PATCH 09/34] Fixing DocTestFailure --- doc/modules/preprocessing.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 9e925b6a562ac..809179f3f9748 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -569,9 +569,9 @@ Continuing the example above:: >>> enc = preprocessing.UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values='auto', ordinal_features='all', sparse=True) - >>> enc.transform([[0, 1, 1]]).toarray() + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + n_values='auto', ordinal_features='all', sparse=False) + >>> enc.transform([[0, 1, 1]]) array([[ 0., 1., 0., 1., 0., 0.]]) By default, how many values each feature can take is inferred automatically @@ -590,9 +590,9 @@ categorical features, one has to explicitly set ``n_values``. 
For example,:: >>> # Note that there are missing categorical values for the 2nd and 3rd >>> # features >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values=[2, 3, 4], ordinal_features='all', sparse=True) - >>> enc.transform([[1, 1, 2]]).toarray() + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + n_values=[2, 3, 4], ordinal_features='all', sparse=False) + >>> enc.transform([[1, 1, 2]]) array([[ 1., 1., 0., 1., 1., 0.]]) .. _imputation: From b642a7e1f7db6471841457d3d8acafdee8fd1230 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Sun, 3 Dec 2017 00:10:36 -0500 Subject: [PATCH 10/34] Refactoring the code. Now fit_transform in equivalent to fit + transform. Parameters checking being done in fit method. Plus, new test cases. Updated docs Updating implementation of UnaryEncoder checking handle_greater parameter value in fit New test cases --- doc/modules/preprocessing.rst | 12 +++--- sklearn/preprocessing/data.py | 47 +++++++++------------ sklearn/preprocessing/tests/test_data.py | 54 +++++++++++++++++++++--- 3 files changed, 74 insertions(+), 39 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 809179f3f9748..e5633f7c7972b 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -559,11 +559,13 @@ implemented in :class:`UnaryEncoder`. This estimator transforms each ordinal feature with ``m`` possible values into ``m - 1`` binary features, where the ith feature is active if x > i (for i = 0, ... k - 1). -**NOTE**: This encoding is likely to help when used with linear models and -kernel-based models like SVMs with the standard kernels. On the other hand, this -transformation is unlikely to help when using with tree-based models, -since those already work on the basis of a particular feature value being -< or > than a threshold, unlike linear and kernel-based models. +.. 
note:: + + This encoding is likely to help when used with linear models and + kernel-based models like SVMs with the standard kernels. On the other hand, this + transformation is unlikely to help when using with tree-based models, + since those already work on the basis of a particular feature value being + < or > than a threshold, unlike linear and kernel-based models. Continuing the example above:: diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index d24d649d4a847..d199ba431f0b0 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2924,9 +2924,8 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Will return sparse matrix if set True else will return an array. handle_greater : str, 'error' or 'clip' - Whether to raise an error or clip if a greater ordinal feature value is - present during transform as compare to largest feature value seen - during fit. + Whether to raise an error or clip if an ordinal feature >= n_values is + passed in. Attributes ---------- @@ -2982,15 +2981,19 @@ def fit(self, X, y=None): All feature values should be non-negative otherwise will raise a ValueError. 
""" - self.fit_transform(X) + _transform_selected(X, self._fit, self.ordinal_features, copy=True) return self - def _fit_transform(self, X): + def _fit(self, X): """Assumes X contains only ordinal features.""" X = check_array(X, dtype=np.int) + if self.handle_greater not in ['error', 'clip']: + raise ValueError("handle_greater should be either 'error' or " + "'clip' got %s" % self.handle_greater) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape + if (isinstance(self.n_values, six.string_types) and self.n_values == 'auto'): n_values = np.max(X, axis=0) + 1 @@ -3016,24 +3019,18 @@ def _fit_transform(self, X): indices = np.cumsum(n_values) self.feature_indices_ = indices - column_start = np.tile(indices[:-1], n_samples) - column_end = (X + indices[:-1]).ravel() - column_indices = np.hstack([np.arange(s, e) for s, e - in zip(column_start, column_end)]) - row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), - X.sum(axis=1)) - data = np.ones(X.sum()) - out = sparse.coo_matrix((data, (row_indices, column_indices)), - shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() + mask = (X < self.n_values_).ravel() + if np.any(~mask): + if self.handle_greater == 'error': + raise ValueError("unknown ordinal feature present %s " + % X.ravel()[~mask]) - return out if self.sparse else out.toarray() + return X def fit_transform(self, X, y=None): """Fit UnaryEncoder to X, then transform X. - Equivalent to self.fit(X).transform(X), but more convenient and more - efficient. + Equivalent to self.fit(X).transform(X), but more convenient. Parameters ---------- @@ -3048,8 +3045,8 @@ def fit_transform(self, X, y=None): Transformed input. 
""" - return _transform_selected(X, self._fit_transform, - self.ordinal_features, copy=True) + + return self.fit(X).transform(X) def _transform(self, X): """Assumes X contains only ordinal features.""" @@ -3065,15 +3062,11 @@ def _transform(self, X): % (indices.shape[0] - 1, n_features)) # We clip those ordinal features of X that are greater than n_values_ - # using mask. - # This means, if self.handle_greater is "ignore", the row_indices and - # col_indices corresponding to the greater ordinal feature are all - # filled with ones. + # using mask if self.handle_greater is "clip". + # This means, the row_indices and col_indices corresponding to the + # greater ordinal feature are all filled with ones. mask = (X < self.n_values_).ravel() if np.any(~mask): - if self.handle_greater not in ['error', 'clip']: - raise ValueError("handle_greater should be either 'error' or " - "'clip' got %s" % self.handle_greater) if self.handle_greater == 'error': raise ValueError("unknown ordinal feature present %s " "during transform." % X.ravel()[~mask]) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index b4310e090f478..caae825de8830 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2280,22 +2280,47 @@ def test_unary_encoder_handle_greater(): X = np.array([[0, 2, 1], [1, 0, 3], [2, 0, 2]]) y = np.array([[4, 1, 1]]) - # Test that encoder raises error for greater features. + # Test that encoder raises error for greater features during transform. encoder = UnaryEncoder(handle_greater='error') encoder.fit(X) assert_raises(ValueError, encoder.transform, y) - # Test the ignore option, clips greater features. + encoder = UnaryEncoder(handle_greater='error') + assert_array_equal(encoder.fit_transform(y), + np.array([[1., 1., 1., 1., 1., 1.]])) + + # Test that encoder raises error for greater features during fit when + # n_values is explicitly set. 
+ encoder = UnaryEncoder(handle_greater='error', n_values=[2, 3, 4]) + assert_raises(ValueError, encoder.fit, X) + + encoder = UnaryEncoder(handle_greater='error', n_values=[2, 3, 4]) + assert_raises(ValueError, encoder.fit_transform, X) + + encoder = UnaryEncoder(handle_greater='error', n_values=[5, 2, 2]) + encoder.fit(y) + assert_array_equal(encoder.transform(y), + np.array([[1., 1., 1., 1., 1., 1.]])) + + encoder = UnaryEncoder(handle_greater='error', n_values=[5, 2, 2]) + assert_array_equal(encoder.fit_transform(y), + np.array([[1., 1., 1., 1., 1., 1.]])) + + # Test the clip option. encoder = UnaryEncoder(handle_greater='clip') encoder.fit(X) assert_array_equal( encoder.transform(y), np.array([[1., 1., 1., 0., 1., 0., 0.]])) - # Raise error if handle_greater is neither ignore or error. + encoder = UnaryEncoder(handle_greater='clip', n_values=[3, 2, 2]) + assert_array_equal( + encoder.fit_transform(y), + np.array([[1., 1., 1., 1.]])) + + # Raise error if handle_greater is neither clip nor error. 
encoder = UnaryEncoder(handle_greater='42') - encoder.fit(X) - assert_raises(ValueError, encoder.transform, y) + assert_raises(ValueError, encoder.fit, y) def test_unary_encoder_errors(): @@ -2375,6 +2400,10 @@ def test_unary_encoder_n_values_int(): enc.feature_indices_, np.arange(0, unary_n_values * len(X[0]) + 1, unary_n_values) ) + assert_array_equal( + enc.n_values_, + np.array([encoder_n_values] * len(X[0])) + ) def test_unary_encoder_n_values_array(): @@ -2383,17 +2412,28 @@ def test_unary_encoder_n_values_array(): size = np.random.randint(1, 10) delta = np.random.randint(1, 10) - n_values_array = [n_features] * n_features - enc = UnaryEncoder(n_values=n_values_array) + # Test ideal case is working fine X = _generate_random_features_matrix(n_features, size) + n_values_array = list(np.max(X, axis=0) + 1) + enc = UnaryEncoder(n_values=n_values_array) X_trans = enc.fit_transform(X) assert_equal(X_trans.shape, (size, sum(n_values_array) - n_features)) + assert_array_equal( + enc.feature_indices_, + np.cumsum(np.array([1] + n_values_array) - 1) + ) + assert_array_equal( + enc.n_values_, + np.array(n_values_array) + ) + # Test that fit_transform raises error when len(n_values) != n_features n_values_array = np.random.randint(2, 10, n_features + delta) enc = UnaryEncoder(n_values=n_values_array) X = _generate_random_features_matrix(n_features, size) assert_raises(ValueError, enc.fit_transform, X) + # Test that fit_transform raises error when len(n_values) != n_features enc = UnaryEncoder(n_values=[]) X = _generate_random_features_matrix(n_features, size) assert_raises(ValueError, enc.fit_transform, X) From 367dba4cafca50cc36efbfd49ec3ee1e2e14573a Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Sun, 3 Dec 2017 19:22:27 -0500 Subject: [PATCH 11/34] Minor change in mask calculation. 
Removing fit_transform for UnaryEncoder and relying on one defined in TransformerMixin Update handle_greater=error error message Removing fit_transform for UnaryEncoder and relying on one defined in TransformerMixin Update handle_greater=error error message --- sklearn/preprocessing/data.py | 37 ++++++++--------------------------- 1 file changed, 8 insertions(+), 29 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index d199ba431f0b0..28727d4302bf2 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -3019,35 +3019,14 @@ def _fit(self, X): indices = np.cumsum(n_values) self.feature_indices_ = indices - mask = (X < self.n_values_).ravel() - if np.any(~mask): + mask = (X >= self.n_values_).ravel() + if np.any(mask): if self.handle_greater == 'error': - raise ValueError("unknown ordinal feature present %s " - % X.ravel()[~mask]) + raise ValueError("handle_greater='error' but %d feature values" + " exceed n_values" % np.count_nonzero(mask)) return X - def fit_transform(self, X, y=None): - """Fit UnaryEncoder to X, then transform X. - - Equivalent to self.fit(X).transform(X), but more convenient. - - Parameters - ---------- - X : array-like, shape [n_samples, n_feature] - Input array of type int. - All feature values should be non-negative otherwise will raise a - ValueError. - - Returns - ------- - X_out : sparse matrix or a 2-d array - Transformed input. - - """ - - return self.fit(X).transform(X) - def _transform(self, X): """Assumes X contains only ordinal features.""" X = check_array(X, dtype=np.int) @@ -3065,13 +3044,13 @@ def _transform(self, X): # using mask if self.handle_greater is "clip". # This means, the row_indices and col_indices corresponding to the # greater ordinal feature are all filled with ones. 
- mask = (X < self.n_values_).ravel() - if np.any(~mask): + mask = (X >= self.n_values_).ravel() + if np.any(mask): if self.handle_greater == 'error': raise ValueError("unknown ordinal feature present %s " - "during transform." % X.ravel()[~mask]) + "during transform." % X.ravel()[mask]) - X_ceil = np.where(mask.reshape(X.shape), X, self.n_values_ - 1) + X_ceil = np.where(mask.reshape(X.shape), self.n_values_ - 1, X) column_start = np.tile(indices[:-1], n_samples) column_end = (indices[:-1] + X_ceil).ravel() column_indices = np.hstack([np.arange(s, e) for s, e From 9f3205dfcef9fe6ba56dc1fb982da9f1db66f057 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Mon, 4 Dec 2017 00:38:57 -0500 Subject: [PATCH 12/34] Adding warn as a new option for handle_greater parameter. Updaing warn mode Making warn as default mode --- sklearn/preprocessing/data.py | 28 ++++++++++++++---------- sklearn/preprocessing/tests/test_data.py | 2 +- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 28727d4302bf2..3296ae53bfd82 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2923,9 +2923,9 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): sparse : boolean, default=False Will return sparse matrix if set True else will return an array. - handle_greater : str, 'error' or 'clip' - Whether to raise an error or clip if an ordinal feature >= n_values is - passed in. + handle_greater : str, 'warn' or 'error' or 'clip', default='warn' + Whether to raise an error or clip or warn if an + ordinal feature >= n_values is passed in. Attributes ---------- @@ -2949,7 +2949,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): ... [1, 1, 0], ... [0, 2, 1], ... [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + UnaryEncoder(dtype=<... 
'numpy.float64'>, handle_greater='warn', n_values='auto', ordinal_features='all', sparse=False) >>> enc.n_values_ array([2, 3, 4]) @@ -2964,7 +2964,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): using a one-hot aka one-of-K scheme. """ def __init__(self, n_values="auto", ordinal_features="all", - dtype=np.float64, sparse=False, handle_greater='error'): + dtype=np.float64, sparse=False, handle_greater='warn'): self.n_values = n_values self.ordinal_features = ordinal_features self.dtype = dtype @@ -2987,9 +2987,9 @@ def fit(self, X, y=None): def _fit(self, X): """Assumes X contains only ordinal features.""" X = check_array(X, dtype=np.int) - if self.handle_greater not in ['error', 'clip']: - raise ValueError("handle_greater should be either 'error' or " - "'clip' got %s" % self.handle_greater) + if self.handle_greater not in ['warn', 'error', 'clip']: + raise ValueError("handle_greater should be either 'warn', 'error' " + "or 'clip' got %s" % self.handle_greater) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") n_samples, n_features = X.shape @@ -3023,7 +3023,7 @@ def _fit(self, X): if np.any(mask): if self.handle_greater == 'error': raise ValueError("handle_greater='error' but %d feature values" - " exceed n_values" % np.count_nonzero(mask)) + " exceed n_values." % np.count_nonzero(mask)) return X @@ -3046,9 +3046,13 @@ def _transform(self, X): # greater ordinal feature are all filled with ones. mask = (X >= self.n_values_).ravel() if np.any(mask): - if self.handle_greater == 'error': - raise ValueError("unknown ordinal feature present %s " - "during transform." % X.ravel()[mask]) + if self.handle_greater == 'warn': + warnings.warn("Found feature values which " + "exceeds n_values during transform.") + elif self.handle_greater == 'error': + raise ValueError("Found feature values %s which exceeds " + "n_values during transform." 
+ % X.ravel()[mask]) X_ceil = np.where(mask.reshape(X.shape), self.n_values_ - 1, X) column_start = np.tile(indices[:-1], n_samples) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index caae825de8830..11a7948d138ae 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2345,7 +2345,7 @@ def test_unary_encoder_errors(): X[0][0] = n_values + delta X_out_of_bounds = X assert_raises(ValueError, encoder.transform, X_out_of_bounds) - error_msg = ("unknown ordinal feature present \[{}\] " + error_msg = ("Found feature values \[{}\] which exceeds n_values " "during transform.".format(n_values + delta)) assert_raises_regex(ValueError, error_msg, encoder.transform, X_out_of_bounds) From c23ec8d9290aa6cab5d70133bace50c1543e3e4c Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Mon, 4 Dec 2017 01:02:36 -0500 Subject: [PATCH 13/34] Updating test case to take care of new handle_greater='warn' as default and updating docs Updating docs --- doc/modules/preprocessing.rst | 4 ++-- sklearn/preprocessing/tests/test_data.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e5633f7c7972b..1cf46e69695bb 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -571,7 +571,7 @@ Continuing the example above:: >>> enc = preprocessing.UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', n_values='auto', ordinal_features='all', sparse=False) >>> enc.transform([[0, 1, 1]]) array([[ 0., 1., 0., 1., 0., 0.]]) @@ -592,7 +592,7 @@ categorical features, one has to explicitly set ``n_values``. 
For example,:: >>> # Note that there are missing categorical values for the 2nd and 3rd >>> # features >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='error', + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', n_values=[2, 3, 4], ordinal_features='all', sparse=False) >>> enc.transform([[1, 1, 2]]) array([[ 1., 1., 0., 1., 1., 0.]]) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 11a7948d138ae..3627a1e0967e6 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2342,6 +2342,9 @@ def test_unary_encoder_errors(): assert_raises(ValueError, UnaryEncoder(n_values=1).fit_transform, X) # test that an error is raised when out of bounds + encoder = UnaryEncoder(n_values, handle_greater='error') + X = _generate_random_features_matrix(n_values, size) + encoder.fit(X) X[0][0] = n_values + delta X_out_of_bounds = X assert_raises(ValueError, encoder.transform, X_out_of_bounds) From 9d4753a7ddefdcd0da9a2d4fd0e5f85ea0e71b98 Mon Sep 17 00:00:00 2001 From: Arjun Jauhari Date: Mon, 4 Dec 2017 20:17:56 -0500 Subject: [PATCH 14/34] Fixing concerns. 
Explaining handle_greater options Adding test cases to test warn option of handle_greater parameter Updated warning message updated feature_matrix generation function and made test deterministic making test cases clearer by using n_features n_values and n_features cleanup --- sklearn/preprocessing/data.py | 26 ++++--- sklearn/preprocessing/tests/test_data.py | 93 +++++++++++++++--------- 2 files changed, 73 insertions(+), 46 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 3296ae53bfd82..a87d389d0d916 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -2923,10 +2923,15 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): sparse : boolean, default=False Will return sparse matrix if set True else will return an array. - handle_greater : str, 'warn' or 'error' or 'clip', default='warn' + handle_greater : str, 'warn' or 'error' or 'clip' Whether to raise an error or clip or warn if an ordinal feature >= n_values is passed in. + - 'warn' (default): same as clip but with warning. + - 'error': raise error if feature >= n_values is passed in. + - 'clip': all the feature values >= n_values are clipped to + (n_values-1) during transform. + Attributes ---------- feature_indices_ : array of shape (n_features,) @@ -2998,9 +3003,6 @@ def _fit(self, X): self.n_values == 'auto'): n_values = np.max(X, axis=0) + 1 elif isinstance(self.n_values, numbers.Integral): - if (np.max(X, axis=0) >= self.n_values).any(): - raise ValueError("Feature out of bounds for n_values=%d" - % self.n_values) n_values = np.empty(n_features, dtype=np.int) n_values.fill(self.n_values) else: @@ -3022,8 +3024,9 @@ def _fit(self, X): mask = (X >= self.n_values_).ravel() if np.any(mask): if self.handle_greater == 'error': - raise ValueError("handle_greater='error' but %d feature values" - " exceed n_values." % np.count_nonzero(mask)) + raise ValueError("handle_greater='error' but found %d feature" + " values which exceeds n_values." 
+ % np.count_nonzero(mask)) return X @@ -3047,12 +3050,13 @@ def _transform(self, X): mask = (X >= self.n_values_).ravel() if np.any(mask): if self.handle_greater == 'warn': - warnings.warn("Found feature values which " - "exceeds n_values during transform.") + warnings.warn("Found %d feature values which exceeds " + "n_values during transform, clipping them." + % np.count_nonzero(mask)) elif self.handle_greater == 'error': - raise ValueError("Found feature values %s which exceeds " - "n_values during transform." - % X.ravel()[mask]) + raise ValueError("handle_greater='error' but found %d feature" + " values which exceeds n_values during " + "transform." % np.count_nonzero(mask)) X_ceil = np.where(mask.reshape(X.shape), self.n_values_ - 1, X) column_start = np.tile(indices[:-1], n_samples) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 3627a1e0967e6..d94a761a911ad 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -2227,9 +2227,10 @@ def test_quantile_transform_valid_axis(): ". 
Got axis=2", quantile_transform, X.T, axis=2) -def _generate_random_features_matrix(n_values=3, size=10): - rng = np.random.RandomState(0) - X = rng.randint(n_values, size=(size, n_values)) +def _generate_random_features_matrix(n_samples=10, n_features=3, + n_values_max=3): + rng = np.random.RandomState(6) + X = rng.randint(n_values_max, size=(n_samples, n_features)) return X @@ -2248,13 +2249,13 @@ def test_unary_encoder(): def test_unary_encoder_stack(): # multiple input features stack to same output - n_values = np.random.randint(2, 10) - size = np.random.randint(1, 10) + n_values = rng.randint(2, 10) + size = rng.randint(1, 10) encoder = UnaryEncoder(n_values, sparse=False) - X_multi = _generate_random_features_matrix(n_values, size) + X_multi = _generate_random_features_matrix(size, n_features, n_values) X_multi_t = encoder.fit_transform(X_multi) - assert_equal(X_multi_t.shape, (size, n_values * (n_values - 1))) + assert_equal(X_multi_t.shape, (size, n_features * (n_values - 1))) expected = np.hstack([encoder.fit_transform(X_multi[:, i:(i + 1)]) for i in range(X_multi.shape[1])]) @@ -2263,13 +2264,13 @@ def test_unary_encoder_stack(): def test_unary_encoder_dense_sparse(): # test dense output in comparison to sparse results. - n_values = np.random.randint(1, 10) - size = np.random.randint(1, 10) + n_values = rng.randint(1, 10) + size = rng.randint(1, 10) sparse_encoder = UnaryEncoder(n_values, sparse=True) dense_encoder = UnaryEncoder(n_values) - X = _generate_random_features_matrix(n_values, size) + X = _generate_random_features_matrix(size, n_features, n_values) X_trans_sparse = sparse_encoder.fit_transform(X) X_trans_dense = dense_encoder.fit_transform(X) @@ -2318,38 +2319,60 @@ def test_unary_encoder_handle_greater(): encoder.fit_transform(y), np.array([[1., 1., 1., 1.]])) + # Test the warn option. 
+ encoder = UnaryEncoder() + encoder.fit(X) + w = ('Found 1 feature values which exceeds n_values during transform, ' + 'clipping them.') + y_transformed = assert_warns_message(UserWarning, w, encoder.transform, y) + assert_array_equal( + y_transformed, + np.array([[1., 1., 1., 0., 1., 0., 0.]])) + + encoder = UnaryEncoder(n_values=[3, 2, 2]) + y_transformed = assert_warns_message(UserWarning, w, + encoder.fit_transform, y) + assert_array_equal( + y_transformed, + np.array([[1., 1., 1., 1.]])) + + encoder = UnaryEncoder(n_values=[5, 2, 2]) + assert_array_equal( + encoder.fit_transform(y), + np.array([[1., 1., 1., 1., 1., 1.]])) + # Raise error if handle_greater is neither clip nor error. encoder = UnaryEncoder(handle_greater='42') assert_raises(ValueError, encoder.fit, y) def test_unary_encoder_errors(): - n_values = np.random.randint(2, 10) - size = np.random.randint(1, 10) - delta = np.random.randint(1, 10) + n_values = rng.randint(2, 10) + size = rng.randint(1, 10) + delta = rng.randint(1, 10) encoder = UnaryEncoder(n_values) - X = _generate_random_features_matrix(n_values, size) + X = _generate_random_features_matrix(size, n_features, n_values) encoder.fit(X) # test that an error is raised when different shape - larger_n_values = n_values + delta - X_too_large = _generate_random_features_matrix(larger_n_values, size) + larger_n_features = n_features + delta + X_too_large = _generate_random_features_matrix(size, larger_n_features, + n_values) assert_raises(ValueError, encoder.transform, X_too_large) error_msg = ("X has different shape than during fitting." 
- " Expected {}, got {}.".format(n_values, larger_n_values)) + " Expected {}, got {}.".format(n_features, larger_n_features)) assert_raises_regex(ValueError, error_msg, encoder.transform, X_too_large) - assert_raises(ValueError, UnaryEncoder(n_values=1).fit_transform, X) # test that an error is raised when out of bounds encoder = UnaryEncoder(n_values, handle_greater='error') - X = _generate_random_features_matrix(n_values, size) + X = _generate_random_features_matrix(size, n_features, n_values) encoder.fit(X) X[0][0] = n_values + delta X_out_of_bounds = X assert_raises(ValueError, encoder.transform, X_out_of_bounds) - error_msg = ("Found feature values \[{}\] which exceeds n_values " - "during transform.".format(n_values + delta)) + error_msg = ("handle_greater='error' but found 1 feature values which " + "exceeds n_values during transform.") assert_raises_regex(ValueError, error_msg, encoder.transform, X_out_of_bounds) @@ -2388,35 +2411,35 @@ def test_unary_encoder_edge_cases(): def test_unary_encoder_n_values_int(): # Test UnaryEncoder's n_values parameter when set as an int. 
- n_values = np.random.randint(2, 10) - size = np.random.randint(1, 10) - delta = np.random.randint(1, 10) + n_values = rng.randint(2, 10) + size = rng.randint(1, 10) + delta = rng.randint(1, 10) encoder_n_values = n_values + delta unary_n_values = encoder_n_values - 1 enc = UnaryEncoder(n_values=encoder_n_values) - X = _generate_random_features_matrix(n_values, size) + X = _generate_random_features_matrix(size, n_features, n_values) X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (size, unary_n_values * len(X[0]))) + assert_equal(X_trans.shape, (size, unary_n_values * n_features)) assert_array_equal( enc.feature_indices_, - np.arange(0, unary_n_values * len(X[0]) + 1, unary_n_values) + np.arange(0, (unary_n_values * n_features) + 1, unary_n_values) ) assert_array_equal( enc.n_values_, - np.array([encoder_n_values] * len(X[0])) + np.array([encoder_n_values] * n_features) ) def test_unary_encoder_n_values_array(): # Test UnaryEncoder's n_values parameter when set as an array. 
- n_features = np.random.randint(2, 10) - size = np.random.randint(1, 10) - delta = np.random.randint(1, 10) + n_values = rng.randint(2, 10) + size = rng.randint(1, 10) + delta = rng.randint(1, 10) # Test ideal case is working fine - X = _generate_random_features_matrix(n_features, size) + X = _generate_random_features_matrix(size, n_features, n_values) n_values_array = list(np.max(X, axis=0) + 1) enc = UnaryEncoder(n_values=n_values_array) X_trans = enc.fit_transform(X) @@ -2431,12 +2454,12 @@ def test_unary_encoder_n_values_array(): ) # Test that fit_transform raises error when len(n_values) != n_features - n_values_array = np.random.randint(2, 10, n_features + delta) + n_values_array = rng.randint(2, 10, n_features + delta) enc = UnaryEncoder(n_values=n_values_array) - X = _generate_random_features_matrix(n_features, size) + X = _generate_random_features_matrix(size, n_features, n_values) assert_raises(ValueError, enc.fit_transform, X) # Test that fit_transform raises error when len(n_values) != n_features enc = UnaryEncoder(n_values=[]) - X = _generate_random_features_matrix(n_features, size) + X = _generate_random_features_matrix(size, n_features, n_values) assert_raises(ValueError, enc.fit_transform, X) From e2a01bbeceae49f0d2f02c4d1bd1f6a552382a12 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Dec 2018 16:59:41 -0500 Subject: [PATCH 15/34] some doc modif, TBC --- doc/modules/preprocessing.rst | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 6c1ba8bfc3c45..4db285350ee48 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -547,33 +547,26 @@ represented as a dict, not as scalars. Encoding ordinal features ========================= -Often categorical features have a clear ordering. 
For example a person could -have features - -* ``["short", "tall"]`` -* ``["low income", "medium income", "high income"]`` -* ``["elementary school graduate", "high school graduate", "some college", - "college graduate"]`` - -Even though these features can be ordered, we shouldn't necessarily assign -scores to them, as the difference between categories one and two is not the -same as the difference between categories two and three. +Often, categorical features have a clear ordering. But even though some +categories can be ordered, we shouldn't necessarily assign them to numerical +values, as the difference between categories one and two may not be the same +as the difference between categories two and three (for example). One possibility to convert these ordinal features to features that can be used with scikit-learn estimators is to use a unary encoding, which is -implemented in :class:`UnaryEncoder`. This estimator transforms each +implemented in :class:`UnaryEncoder`. This estimator transforms each ordinal feature with ``m`` possible values into ``m - 1`` binary features, -where the ith feature is active if x > i (for i = 0, ... k - 1). +where the ith feature is active if x > i. .. note:: This encoding is likely to help when used with linear models and - kernel-based models like SVMs with the standard kernels. On the other hand, this - transformation is unlikely to help when using with tree-based models, - since those already work on the basis of a particular feature value being - < or > than a threshold, unlike linear and kernel-based models. + kernel-based models like SVMs with the standard kernels. On the other + hand, this transformation is unlikely to help when using tree-based + models, since those already work on the basis of a particular feature + value being less or bigger than a threshold. 
-Continuing the example above:: +For example:: >>> enc = preprocessing.UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS @@ -582,9 +575,9 @@ Continuing the example above:: >>> enc.transform([[0, 1, 1]]) array([[ 0., 1., 0., 1., 0., 0.]]) -By default, how many values each feature can take is inferred automatically -from the dataset. It is possible to specify this explicitly using the parameter -``n_values``. +By default, the number of categories in a feature is inferred automatically +from the dataset by looking for the maximum value. It is possible to specify +this explicitly using the parameter ``n_values``. * There are two genders, three possible continents and four web browsers in our dataset. From fc6a9af70dbd97fd85b58bfa4b94371ee1f0d77b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Dec 2018 17:07:28 -0500 Subject: [PATCH 16/34] removed ordinal_features --- sklearn/preprocessing/_encoders.py | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index b0d2d753fd1fa..a41ec32b149fe 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -880,15 +880,6 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): ``X[:, i]``. Each feature value should be in ``range(n_values[i])`` - ordinal_features : "all" or array of indices or mask - Specify what features are treated as ordinal. - - - 'all' (default): All features are treated as ordinal. - - array of indices: Array of ordinal feature indices. - - mask: Array of length n_features and with dtype=bool. - - Non-ordinal features are always stacked to the right of the matrix. - dtype : number type, default=np.float Desired dtype of output. @@ -940,10 +931,9 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): sklearn.preprocessing.OneHotEncoder: encodes categorical integer features using a one-hot aka one-of-K scheme. 
""" - def __init__(self, n_values="auto", ordinal_features="all", - dtype=np.float64, sparse=False, handle_greater='warn'): + def __init__(self, n_values="auto", dtype=np.float64, sparse=False, + handle_greater='warn'): self.n_values = n_values - self.ordinal_features = ordinal_features self.dtype = dtype self.sparse = sparse self.handle_greater = handle_greater @@ -958,7 +948,8 @@ def fit(self, X, y=None): All feature values should be non-negative otherwise will raise a ValueError. """ - _transform_selected(X, self._fit, self.ordinal_features, copy=True) + _transform_selected(X, self._fit, dtype=self.dtype, selected='all', + copy=True) return self def _fit(self, X): @@ -1059,5 +1050,6 @@ def transform(self, X): X_out : sparse matrix if sparse=True else a 2-d array, dtype=int Transformed input. """ - return _transform_selected(X, self._transform, - self.ordinal_features, copy=True) \ No newline at end of file + return _transform_selected(X, self._transform, self.dtype, + selected='all', + copy=True) From 645f6a3adfac51384256cd03131a453f3e1c7ec4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Dec 2018 17:10:43 -0500 Subject: [PATCH 17/34] Removed calls to _transform_selected --- sklearn/preprocessing/_encoders.py | 43 +++++++++++------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index a41ec32b149fe..9471cc20d9fb1 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -948,12 +948,6 @@ def fit(self, X, y=None): All feature values should be non-negative otherwise will raise a ValueError. 
""" - _transform_selected(X, self._fit, dtype=self.dtype, selected='all', - copy=True) - return self - - def _fit(self, X): - """Assumes X contains only ordinal features.""" X = check_array(X, dtype=np.int) if self.handle_greater not in ['warn', 'error', 'clip']: raise ValueError("handle_greater should be either 'warn', 'error' " @@ -990,11 +984,23 @@ def _fit(self, X): raise ValueError("handle_greater='error' but found %d feature" " values which exceeds n_values." % np.count_nonzero(mask)) + return self - return X + def transform(self, X): + """Transform X using Ordinal encoding. - def _transform(self, X): - """Assumes X contains only ordinal features.""" + Parameters + ---------- + X : array-like, shape [n_samples, n_features] + Input array of type int. + All feature values should be non-negative otherwise will raise a + ValueError. + + Returns + ------- + X_out : sparse matrix if sparse=True else a 2-d array, dtype=int + Transformed input. + """ X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") @@ -1034,22 +1040,3 @@ def _transform(self, X): dtype=self.dtype).tocsr() return out if self.sparse else out.toarray() - - def transform(self, X): - """Transform X using Ordinal encoding. - - Parameters - ---------- - X : array-like, shape [n_samples, n_features] - Input array of type int. - All feature values should be non-negative otherwise will raise a - ValueError. - - Returns - ------- - X_out : sparse matrix if sparse=True else a 2-d array, dtype=int - Transformed input. 
- """ - return _transform_selected(X, self._transform, self.dtype, - selected='all', - copy=True) From 4691026b277f5b09436028accbe1a7611f5b1333 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sat, 29 Dec 2018 17:42:48 -0500 Subject: [PATCH 18/34] Addressed Joris' comments --- sklearn/preprocessing/_encoders.py | 41 +++++++++++++++++------------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 9471cc20d9fb1..3cc5d3890eed7 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -265,6 +265,8 @@ class OneHotEncoder(_BaseEncoder): -------- sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) encoding of the categorical features. + sklearn.preprocessing.UnaryEncoder: performs a unary encoding of ordinal + data. sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of dictionary items (also handles string-valued features). sklearn.feature_extraction.FeatureHasher : performs an approximate one-hot @@ -768,6 +770,8 @@ class OrdinalEncoder(_BaseEncoder): -------- sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of categorical features. + sklearn.preprocessing.UnaryEncoder: performs a unary encoding of ordinal + data. sklearn.preprocessing.LabelEncoder : encodes target labels with values between 0 and n_classes-1. """ @@ -870,7 +874,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - n_values : 'auto', int or array of ints + n_values : 'auto', int or array of ints, optional (default='auto') Number of values per feature. - 'auto' : determine value range from training data. @@ -880,26 +884,25 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): ``X[:, i]``. Each feature value should be in ``range(n_values[i])`` - dtype : number type, default=np.float + dtype : number type, optional (default=np.float) Desired dtype of output. 
- sparse : boolean, default=False + sparse : boolean, optional (default=False) Will return sparse matrix if set True else will return an array. - handle_greater : str, 'warn' or 'error' or 'clip' + handle_greater : str, 'warn', 'error' or 'clip', optional (default='warn') Whether to raise an error or clip or warn if an ordinal feature >= n_values is passed in. - - 'warn' (default): same as clip but with warning. - 'error': raise error if feature >= n_values is passed in. - 'clip': all the feature values >= n_values are clipped to - (n_values-1) during transform. + (n_values-1) during transform. + - 'warn': same as clip but with warning. Attributes ---------- - feature_indices_ : array of shape (n_features,) - Indices to feature ranges. - Feature ``i`` in the original data is mapped to features + feature_indices_ : array of shape (n_features + 1,) + Feature ``i`` in the original data is mapped to columns from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` n_values_ : array of shape (n_features,) @@ -930,6 +933,8 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): -------- sklearn.preprocessing.OneHotEncoder: encodes categorical integer features using a one-hot aka one-of-K scheme. + sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) + encoding of the categorical features. """ def __init__(self, n_values="auto", dtype=np.float64, sparse=False, handle_greater='warn'): @@ -943,7 +948,7 @@ def fit(self, X, y=None): Parameters ---------- - X : array-like, shape [n_samples, n_feature] + X : array-like of shape (n_samples, n_feature) Input array of type int. All feature values should be non-negative otherwise will raise a ValueError. 
@@ -978,9 +983,9 @@ def fit(self, X, y=None): indices = np.cumsum(n_values) self.feature_indices_ = indices - mask = (X >= self.n_values_).ravel() - if np.any(mask): - if self.handle_greater == 'error': + if self.n_values != 'auto' and self.handle_greater == 'error': + mask = (X >= self.n_values_).ravel() + if np.any(mask): raise ValueError("handle_greater='error' but found %d feature" " values which exceeds n_values." % np.count_nonzero(mask)) @@ -991,10 +996,10 @@ def transform(self, X): Parameters ---------- - X : array-like, shape [n_samples, n_features] + X : array-like, of shape (n_samples, n_features) Input array of type int. - All feature values should be non-negative otherwise will raise a - ValueError. + All feature values should be non-negative otherwise ValueError + will be raised. Returns ------- @@ -1035,8 +1040,8 @@ def transform(self, X): row_indices = np.repeat(np.arange(n_samples, dtype=np.int32), X_ceil.sum(axis=1)) data = np.ones(X_ceil.ravel().sum()) - out = sparse.coo_matrix((data, (row_indices, column_indices)), + out = sparse.csr_matrix((data, (row_indices, column_indices)), shape=(n_samples, indices[-1]), - dtype=self.dtype).tocsr() + dtype=self.dtype) return out if self.sparse else out.toarray() From 35cbbe435bd3956dc38881582c454cbfd05d57b0 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 30 Dec 2018 15:51:55 -0500 Subject: [PATCH 19/34] Added inverse tranform --- sklearn/preprocessing/_encoders.py | 49 +++++++++++++++++++- sklearn/preprocessing/tests/test_encoders.py | 27 ++++++++++- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 3cc5d3890eed7..ad72f2f560606 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -959,7 +959,7 @@ def fit(self, X, y=None): "or 'clip' got %s" % self.handle_greater) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") - n_samples, n_features = 
X.shape + _, n_features = X.shape if (isinstance(self.n_values, six.string_types) and self.n_values == 'auto'): @@ -1003,9 +1003,10 @@ def transform(self, X): Returns ------- - X_out : sparse matrix if sparse=True else a 2-d array, dtype=int + X_tr: sparse matrix if sparse=True else a 2-d array Transformed input. """ + check_is_fitted(self, 'n_values_') X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") @@ -1045,3 +1046,47 @@ def transform(self, X): dtype=self.dtype) return out if self.sparse else out.toarray() + + def inverse_transform(self, X): + """Convert the data back to the original representation. + + Parameters + ---------- + X : array-like or sparse matrix of shape \ + (n_samples, n_encoded_features) + The transformed data. + + Returns + ------- + X_tr : array-like of shape (n_samples, n_features) + Inverse transformed array. + """ + + check_is_fitted(self, 'n_values_') + X = check_array(X, accept_sparse='csr', ensure_min_features=0) + + n_samples, _ = X.shape + n_features = len(self.n_values_) + n_encoded_features = self.feature_indices_[-1] + + # validate shape of passed X + msg = ("Shape of the passed X data is not correct. 
Expected {0} " + "columns, got {1}.") + if X.shape[1] != n_encoded_features: + raise ValueError(msg.format(n_encoded_features, X.shape[1])) + + # return float dtype, even though it will contain int values + X_tr = np.zeros((n_samples, n_features), dtype=np.float) + + j = 0 + for i in range(n_features): + n_columns = self.n_values_[i] - 1 + + sub = X[:, j:j + n_columns] + + categories = sub.sum(axis=1).ravel() + X_tr[:, i] = categories + + j += n_columns + + return X_tr diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index bd9455280797a..f91447b71b7fd 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -924,4 +924,29 @@ def test_unary_encoder_n_values_array(): # Test that fit_transform raises error when len(n_values) != n_features enc = UnaryEncoder(n_values=[]) X = _generate_random_features_matrix(size, n_features, n_values) - assert_raises(ValueError, enc.fit_transform, X) \ No newline at end of file + assert_raises(ValueError, enc.fit_transform, X) + + +@pytest.mark.parametrize('sparse_', (True, False)) +@pytest.mark.parametrize('X', ( + [[0], [0]], # only one category (transformed into []) + [[1], [1]], # only one category but implicitely 2 + [[1, 0], [1, 1], [0, 1], [0, 2]] # multiple categories +)) +def test_unary_encoder_inverse_transform(sparse_, X): + enc = UnaryEncoder(sparse=sparse_) + assert_array_equal(X, enc.inverse_transform(enc.fit_transform(X))) + + +def test_unary_encoder_inverse_transform_input(): + X = [[1, 0], # will be transformed into 1 + 2 = 3 columns + [1, 1], + [0, 1], + [0, 2]] + enc = UnaryEncoder().fit(X) + bad_X_tr = [[1, 1, 1, 0]] # 4 columns + assert_raises_regex( + ValueError, + "Shape of the passed X data is not correct. 
Expected 3 columns, got 4", + enc.inverse_transform, bad_X_tr + ) From b4609fe13fde0fc17a131e8af699fadeff313cd9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 30 Dec 2018 16:21:43 -0500 Subject: [PATCH 20/34] Updated user guide --- doc/modules/preprocessing.rst | 76 +++++++++++++++--------------- sklearn/preprocessing/_encoders.py | 37 ++++++++------- 2 files changed, 60 insertions(+), 53 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4db285350ee48..943ea946ad73d 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -451,6 +451,10 @@ The normalizer instance can then be used on sample vectors as any transformer:: Encoding categorical features ============================= + +Ordinal encoding +---------------- + Often features are not given as continuous values but categorical. For example a person could have features ``["male", "female"]``, ``["from Europe", "from US", "from Asia"]``, @@ -476,6 +480,9 @@ scikit-learn estimators, as these expect continuous input, and would interpret the categories as being ordered, which is often not desired (i.e. the set of browsers was ordered arbitrarily). +One-hot encoding +---------------- + Another possibility to convert categorical features to features that can be used with scikit-learn estimators is to use a one-of-K, also known as one-hot or dummy encoding. @@ -542,60 +549,55 @@ columns for this feature will be all zeros See :ref:`dict_feature_extraction` for categorical features that are represented as a dict, not as scalars. -.. _preprocessing_ordinal_features: +.. _unary_encoding: -Encoding ordinal features -========================= +Unary encoding +-------------- -Often, categorical features have a clear ordering. 
But even though some -categories can be ordered, we shouldn't necessarily assign them to numerical -values, as the difference between categories one and two may not be the same -as the difference between categories two and three (for example). - -One possibility to convert these ordinal features to features that can be used -with scikit-learn estimators is to use a unary encoding, which is -implemented in :class:`UnaryEncoder`. This estimator transforms each -ordinal feature with ``m`` possible values into ``m - 1`` binary features, -where the ith feature is active if x > i. - -.. note:: - - This encoding is likely to help when used with linear models and - kernel-based models like SVMs with the standard kernels. On the other - hand, this transformation is unlikely to help when using tree-based - models, since those already work on the basis of a particular feature - value being less or bigger than a threshold. +For some ordinal features, it does not necessarily make sense to use +:class:`OrdinalEncoder` if the difference between the ordered categories is +un-even, for example with a feature that takes values in "very short", +"short", "big". -For example:: +For such features, it is possible to use a unary encoding, which is +implemented in :class:`UnaryEncoder`. This encoder transforms each ordinal +feature with ``m`` possible values into ``m - 1`` binary features, where the +ith feature is active if x > i. For example:: >>> enc = preprocessing.UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS UnaryEncoder(dtype=<... 
'numpy.float64'>, handle_greater='warn', - n_values='auto', ordinal_features='all', sparse=False) - >>> enc.transform([[0, 1, 1]]) - array([[ 0., 1., 0., 1., 0., 0.]]) + n_values='auto', sparse=False) + >>> enc.transform([[0, 1, 3]]) + array([[0., 1., 0., 1., 1., 1.]]) + +Here the first feature with 2 categories is transformed into 1 column, the +second feature with 3 values is transformed into 2 columns, and the third +feature is transformed into 3 columns. By default, the number of categories in a feature is inferred automatically from the dataset by looking for the maximum value. It is possible to specify -this explicitly using the parameter ``n_values``. - -* There are two genders, three possible continents and four web browsers in our - dataset. -* Then we fit the estimator, and transform a data point. -* In the result, the first number encodes the height, the next two numbers the - income level, and the next set of three numbers the education level. - -Note that, if there is a possibilty that the training data might have missing -categorical features, one has to explicitly set ``n_values``. For example,:: +this explicitly using the parameter ``n_values``. In particular if the +training data might have missing categorical features, one has to explicitly +set ``n_values``. For example,:: >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4]) >>> # Note that there are missing categorical values for the 2nd and 3rd >>> # features >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', - n_values=[2, 3, 4], ordinal_features='all', sparse=False) + n_values=[2, 3, 4], sparse=False) >>> enc.transform([[1, 1, 2]]) - array([[ 1., 1., 0., 1., 1., 0.]]) + array([[1., 1., 0., 1., 1., 0.]]) + +.. note:: + + This encoding is likely to help when used with linear models and + kernel-based models like SVMs with the standard kernels. 
On the other + hand, this transformation is unlikely to help when using tree-based + models, since those already work on the basis of a particular feature + value being less or bigger than a threshold. + .. _preprocessing_discretization: diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index ad72f2f560606..0ca5e1e14bccd 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -857,20 +857,23 @@ def inverse_transform(self, X): class UnaryEncoder(BaseEstimator, TransformerMixin): """Encode ordinal integer features using a unary scheme. - The input to this transformer should be a matrix of non-negative integers, - denoting the values taken on by ordinal (discrete) features. The output - will be a matrix where each column corresponds to one possible value of - one feature. It is assumed that input features take on values in the range - 0 to (n_values - 1). - - This encoding is needed for feeding ordinal features to many scikit-learn - estimators, notably linear models and kernel-based models like SVMs with - the standard kernels. + This encoder transforms each ordinal feature with ``m`` possible values + into ``m - 1`` binary features, where the ith feature is active if ``x > + i``. The input to this transformer should be a matrix of non-negative + integers, denoting the values taken on by the ordinal features. + + This encoding may be needed for feeding ordinal features to many + scikit-learn estimators, notably linear models and kernel-based models + like SVMs with the standard kernels. This transformation is unlikely to help when using with tree-based models, since those already work on the basis of a particular feature value being - < or > than a threshold, unlike linear and kernel-based models. + less or greater than a threshold, unlike linear and kernel-based models. - Read more in the :ref:`User Guide `. + This encoder encodes all of the features. 
To only encode a subset of the + features, use :class:`ColumnTransformer + `. + + Read more in the :ref:`User Guide `. Parameters ---------- @@ -921,7 +924,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): ... [0, 2, 1], ... [1, 0, 2]]) # doctest: +ELLIPSIS UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', - n_values='auto', ordinal_features='all', sparse=False) + n_values='auto', sparse=False) >>> enc.n_values_ array([2, 3, 4]) >>> enc.feature_indices_ @@ -935,6 +938,8 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): using a one-hot aka one-of-K scheme. sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) encoding of the categorical features. + sklearn.compose.ColumnTransformer: Applies transformers to columns of an + array. """ def __init__(self, n_values="auto", dtype=np.float64, sparse=False, handle_greater='warn'): @@ -949,9 +954,8 @@ def fit(self, X, y=None): Parameters ---------- X : array-like of shape (n_samples, n_feature) - Input array of type int. - All feature values should be non-negative otherwise will raise a - ValueError. + Input array of type int. All feature values should be + non-negative otherwise will raise a ValueError. """ X = check_array(X, dtype=np.int) if self.handle_greater not in ['warn', 'error', 'clip']: @@ -1003,7 +1007,8 @@ def transform(self, X): Returns ------- - X_tr: sparse matrix if sparse=True else a 2-d array + X_tr: array-like or sparse matrix, of shape \ + (n_samples, n_encoded_features) Transformed input. 
""" check_is_fitted(self, 'n_values_') From 98c0e5acb557be5084e22af4aebc7c1975307533 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 30 Dec 2018 16:38:56 -0500 Subject: [PATCH 21/34] Added whatsnew --- doc/modules/preprocessing.rst | 2 +- doc/whats_new/v0.21.rst | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 943ea946ad73d..e5106d873690d 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -556,7 +556,7 @@ Unary encoding For some ordinal features, it does not necessarily make sense to use :class:`OrdinalEncoder` if the difference between the ordered categories is -un-even, for example with a feature that takes values in "very short", +uneven, for example with a feature that takes values in "very short", "short", "big". For such features, it is possible to use a unary encoding, which is diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 13a08b9cec4b7..6fe58d95a1c89 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -179,6 +179,11 @@ Support for Python 3.4 and below has been officially dropped. in the dense case. Also added a new parameter ``order`` which controls output order for further speed performances. :issue:`12251` by `Tom Dupre la Tour`_. +- |Feature| Added a new encoder :class:`preprocessing.UnaryEncoding`, useful + for ordinal features with uneven differences between categories. + :issue:`8652` and :issue:`12893` by :user:`Arjun Jauhari ` and + :user:`Nicolas Hug `. + :mod:`sklearn.tree` ................... 
- |Feature| Decision Trees can now be plotted with matplotlib using From 7886fb701f7c21d649f54cadc8b312f614505247 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 30 Dec 2018 17:42:37 -0500 Subject: [PATCH 22/34] Removed from __all__ in preprocesing.data --- sklearn/preprocessing/__init__.py | 2 +- sklearn/preprocessing/data.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 49f80d01e9abe..c0057ead00b68 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -54,6 +54,7 @@ 'Normalizer', 'OneHotEncoder', 'OrdinalEncoder', + 'UnaryEncoder' 'PowerTransformer', 'RobustScaler', 'StandardScaler', @@ -68,5 +69,4 @@ 'label_binarize', 'quantile_transform', 'power_transform', - 'UnaryEncoder' ] diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 58d591c56ea1f..dea87aed1b099 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -63,7 +63,6 @@ 'minmax_scale', 'quantile_transform', 'power_transform', - 'UnaryEncoder' ] From e96b438bd22b80a858c0b5e0b5f4410bba0c4f0e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 30 Dec 2018 17:44:42 -0500 Subject: [PATCH 23/34] Fixed typo --- sklearn/preprocessing/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index c0057ead00b68..44df9d4a970dc 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -54,7 +54,7 @@ 'Normalizer', 'OneHotEncoder', 'OrdinalEncoder', - 'UnaryEncoder' + 'UnaryEncoder', 'PowerTransformer', 'RobustScaler', 'StandardScaler', From 4d690e9712edcd1abbce0f63631072d271d43b74 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 31 Dec 2018 11:32:16 -0500 Subject: [PATCH 24/34] Added @ruxandraburtica as author --- doc/whats_new/v0.21.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst index 6fe58d95a1c89..07d1406195503 100644 --- a/doc/whats_new/v0.21.rst +++ b/doc/whats_new/v0.21.rst @@ -181,8 +181,8 @@ Support for Python 3.4 and below has been officially dropped. - |Feature| Added a new encoder :class:`preprocessing.UnaryEncoding`, useful for ordinal features with uneven differences between categories. - :issue:`8652` and :issue:`12893` by :user:`Arjun Jauhari ` and - :user:`Nicolas Hug `. + :issue:`12893` by :user:`Ruxandra Burtica `, :user:`Arjun + Jauhari ` and :user:`Nicolas Hug `. :mod:`sklearn.tree` ................... From 6d4cb047ffdceb79f01bbdef334c8501728f3140 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 4 Jan 2019 09:20:44 -0500 Subject: [PATCH 25/34] renamed n_values into categories --- doc/modules/preprocessing.rst | 14 +-- sklearn/preprocessing/_encoders.py | 90 +++++++-------- sklearn/preprocessing/tests/test_encoders.py | 112 +++++++++---------- 3 files changed, 109 insertions(+), 107 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index e5106d873690d..5dd77cf3a24e8 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -566,8 +566,8 @@ ith feature is active if x > i. For example:: >>> enc = preprocessing.UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', - n_values='auto', sparse=False) + UnaryEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_greater='warn', sparse=False) >>> enc.transform([[0, 1, 3]]) array([[0., 1., 0., 1., 1., 1.]]) @@ -577,16 +577,16 @@ feature is transformed into 3 columns. By default, the number of categories in a feature is inferred automatically from the dataset by looking for the maximum value. It is possible to specify -this explicitly using the parameter ``n_values``. In particular if the +this explicitly using the parameter ``categories``. 
In particular if the training data might have missing categorical features, one has to explicitly -set ``n_values``. For example,:: +set ``categories``. For example,:: - >>> enc = preprocessing.UnaryEncoder(n_values=[2, 3, 4]) + >>> enc = preprocessing.UnaryEncoder(categories=[2, 3, 4]) >>> # Note that there are missing categorical values for the 2nd and 3rd >>> # features >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', - n_values=[2, 3, 4], sparse=False) + UnaryEncoder(categories=[2, 3, 4], dtype=<... 'numpy.float64'>, + handle_greater='warn', sparse=False) >>> enc.transform([[1, 1, 2]]) array([[1., 1., 0., 1., 1., 0.]]) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 0ca5e1e14bccd..504909e6333e1 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -877,15 +877,16 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - n_values : 'auto', int or array of ints, optional (default='auto') - Number of values per feature. + categories : 'auto', int or array of ints, optional (default='auto') + Number of categories per feature. - - 'auto' : determine value range from training data. + - 'auto' : determine value range from training data by looking for + the maximum. - int : number of ordinal values per feature. - Each feature value should be in ``range(n_values)`` - - array : ``n_values[i]`` is the number of ordinal values in - ``X[:, i]``. Each feature value should be - in ``range(n_values[i])`` + Each feature value should be in ``range(categories)`` + - array : ``categories[i]`` is the number of ordinal values in + ``X[:, i]``. Each feature value should be in + ``range(categories[i])`` dtype : number type, optional (default=np.float) Desired dtype of output. 
@@ -895,11 +896,11 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): handle_greater : str, 'warn', 'error' or 'clip', optional (default='warn') Whether to raise an error or clip or warn if an - ordinal feature >= n_values is passed in. + ordinal feature >= n_categories is passed in. - - 'error': raise error if feature >= n_values is passed in. - - 'clip': all the feature values >= n_values are clipped to - (n_values-1) during transform. + - 'error': raise error if feature >= n_categories is passed in. + - 'clip': all the feature values >= n_categories are clipped to + (n_categories - 1) during transform. - 'warn': same as clip but with warning. Attributes @@ -908,7 +909,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Feature ``i`` in the original data is mapped to columns from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - n_values_ : array of shape (n_features,) + categories_ : array of shape (n_features,) Maximum number of values per feature. Examples @@ -923,9 +924,9 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): ... [1, 1, 0], ... [0, 2, 1], ... [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', - n_values='auto', sparse=False) - >>> enc.n_values_ + UnaryEncoder(categories='auto', dtype=<... 'numpy.float64'>, + handle_greater='warn', sparse=False) + >>> enc.categories_ array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) @@ -941,9 +942,9 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): sklearn.compose.ColumnTransformer: Applies transformers to columns of an array. 
""" - def __init__(self, n_values="auto", dtype=np.float64, sparse=False, + def __init__(self, categories="auto", dtype=np.float64, sparse=False, handle_greater='warn'): - self.n_values = n_values + self.categories = categories self.dtype = dtype self.sparse = sparse self.handle_greater = handle_greater @@ -965,33 +966,34 @@ def fit(self, X, y=None): raise ValueError("X needs to contain only non-negative integers.") _, n_features = X.shape - if (isinstance(self.n_values, six.string_types) and - self.n_values == 'auto'): - n_values = np.max(X, axis=0) + 1 - elif isinstance(self.n_values, numbers.Integral): - n_values = np.empty(n_features, dtype=np.int) - n_values.fill(self.n_values) + if (isinstance(self.categories, six.string_types) and + self.categories == 'auto'): + categories = np.max(X, axis=0) + 1 + elif isinstance(self.categories, numbers.Integral): + categories = np.empty(n_features, dtype=np.int) + categories.fill(self.categories) else: try: - n_values = np.asarray(self.n_values, dtype=int) + categories = np.asarray(self.categories, dtype=int) except (ValueError, TypeError): - raise TypeError("Wrong type for parameter `n_values`. Expected" - " 'auto', int or array of ints, got %r" - % self.n_values) - if n_values.ndim < 1 or n_values.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if n_values is an array," + raise TypeError( + "Wrong type for parameter `categories`. 
Expected" + " 'auto', int or array of ints, got %r" % self.categories + ) + if categories.ndim < 1 or categories.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).") - self.n_values_ = n_values - n_values = np.hstack([[0], n_values - 1]) - indices = np.cumsum(n_values) + self.categories_ = categories + categories = np.hstack([[0], categories - 1]) + indices = np.cumsum(categories) self.feature_indices_ = indices - if self.n_values != 'auto' and self.handle_greater == 'error': - mask = (X >= self.n_values_).ravel() + if self.categories != 'auto' and self.handle_greater == 'error': + mask = (X >= self.categories_).ravel() if np.any(mask): raise ValueError("handle_greater='error' but found %d feature" - " values which exceeds n_values." + " values which exceeds categories." % np.count_nonzero(mask)) return self @@ -1011,7 +1013,7 @@ def transform(self, X): (n_samples, n_encoded_features) Transformed input. """ - check_is_fitted(self, 'n_values_') + check_is_fitted(self, 'categories_') X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") @@ -1023,22 +1025,22 @@ def transform(self, X): " Expected %d, got %d." % (indices.shape[0] - 1, n_features)) - # We clip those ordinal features of X that are greater than n_values_ + # We clip those ordinal features of X that are greater than categories_ # using mask if self.handle_greater is "clip". # This means, the row_indices and col_indices corresponding to the # greater ordinal feature are all filled with ones. - mask = (X >= self.n_values_).ravel() + mask = (X >= self.categories_).ravel() if np.any(mask): if self.handle_greater == 'warn': warnings.warn("Found %d feature values which exceeds " - "n_values during transform, clipping them." + "n_categories during transform, clipping them." 
% np.count_nonzero(mask)) elif self.handle_greater == 'error': raise ValueError("handle_greater='error' but found %d feature" - " values which exceeds n_values during " + " values which exceeds n_categories during " "transform." % np.count_nonzero(mask)) - X_ceil = np.where(mask.reshape(X.shape), self.n_values_ - 1, X) + X_ceil = np.where(mask.reshape(X.shape), self.categories_ - 1, X) column_start = np.tile(indices[:-1], n_samples) column_end = (indices[:-1] + X_ceil).ravel() column_indices = np.hstack([np.arange(s, e) for s, e @@ -1067,11 +1069,11 @@ def inverse_transform(self, X): Inverse transformed array. """ - check_is_fitted(self, 'n_values_') + check_is_fitted(self, 'categories_') X = check_array(X, accept_sparse='csr', ensure_min_features=0) n_samples, _ = X.shape - n_features = len(self.n_values_) + n_features = len(self.categories_) n_encoded_features = self.feature_indices_[-1] # validate shape of passed X @@ -1085,7 +1087,7 @@ def inverse_transform(self, X): j = 0 for i in range(n_features): - n_columns = self.n_values_[i] - 1 + n_columns = self.categories_[i] - 1 sub = X[:, j:j + n_columns] diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index f91447b71b7fd..48492e72b976e 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -680,9 +680,9 @@ def test_one_hot_encoder_warning(): def _generate_random_features_matrix(n_samples=10, n_features=3, - n_values_max=3): + n_categories_max=3): rng = np.random.RandomState(6) - X = rng.randint(n_values_max, size=(n_samples, n_features)) + X = rng.randint(n_categories_max, size=(n_samples, n_features)) return X @@ -702,14 +702,14 @@ def test_unary_encoder(): def test_unary_encoder_stack(): # multiple input features stack to same output rng = np.random.RandomState(6) - n_values = rng.randint(2, 10) + categories = rng.randint(2, 10) size = rng.randint(1, 10) n_features = rng.randint(2, 10) - encoder = 
UnaryEncoder(n_values, sparse=False) - X_multi = _generate_random_features_matrix(size, n_features, n_values) + encoder = UnaryEncoder(categories, sparse=False) + X_multi = _generate_random_features_matrix(size, n_features, categories) X_multi_t = encoder.fit_transform(X_multi) - assert_equal(X_multi_t.shape, (size, n_features * (n_values - 1))) + assert_equal(X_multi_t.shape, (size, n_features * (categories - 1))) expected = np.hstack([encoder.fit_transform(X_multi[:, i:(i + 1)]) for i in range(X_multi.shape[1])]) @@ -719,14 +719,14 @@ def test_unary_encoder_stack(): def test_unary_encoder_dense_sparse(): # test dense output in comparison to sparse results. rng = np.random.RandomState(6) - n_values = rng.randint(1, 10) + categories = rng.randint(1, 10) size = rng.randint(1, 10) n_features = rng.randint(2, 10) - sparse_encoder = UnaryEncoder(n_values, sparse=True) - dense_encoder = UnaryEncoder(n_values) + sparse_encoder = UnaryEncoder(categories, sparse=True) + dense_encoder = UnaryEncoder(categories) - X = _generate_random_features_matrix(size, n_features, n_values) + X = _generate_random_features_matrix(size, n_features, categories) X_trans_sparse = sparse_encoder.fit_transform(X) X_trans_dense = dense_encoder.fit_transform(X) @@ -747,19 +747,19 @@ def test_unary_encoder_handle_greater(): np.array([[1., 1., 1., 1., 1., 1.]])) # Test that encoder raises error for greater features during fit when - # n_values is explicitly set. - encoder = UnaryEncoder(handle_greater='error', n_values=[2, 3, 4]) + # categories is explicitly set. 
+ encoder = UnaryEncoder(handle_greater='error', categories=[2, 3, 4]) assert_raises(ValueError, encoder.fit, X) - encoder = UnaryEncoder(handle_greater='error', n_values=[2, 3, 4]) + encoder = UnaryEncoder(handle_greater='error', categories=[2, 3, 4]) assert_raises(ValueError, encoder.fit_transform, X) - encoder = UnaryEncoder(handle_greater='error', n_values=[5, 2, 2]) + encoder = UnaryEncoder(handle_greater='error', categories=[5, 2, 2]) encoder.fit(y) assert_array_equal(encoder.transform(y), np.array([[1., 1., 1., 1., 1., 1.]])) - encoder = UnaryEncoder(handle_greater='error', n_values=[5, 2, 2]) + encoder = UnaryEncoder(handle_greater='error', categories=[5, 2, 2]) assert_array_equal(encoder.fit_transform(y), np.array([[1., 1., 1., 1., 1., 1.]])) @@ -770,7 +770,7 @@ def test_unary_encoder_handle_greater(): encoder.transform(y), np.array([[1., 1., 1., 0., 1., 0., 0.]])) - encoder = UnaryEncoder(handle_greater='clip', n_values=[3, 2, 2]) + encoder = UnaryEncoder(handle_greater='clip', categories=[3, 2, 2]) assert_array_equal( encoder.fit_transform(y), np.array([[1., 1., 1., 1.]])) @@ -778,21 +778,21 @@ def test_unary_encoder_handle_greater(): # Test the warn option. 
encoder = UnaryEncoder() encoder.fit(X) - w = ('Found 1 feature values which exceeds n_values during transform, ' + w = ('Found 1 feature values which exceeds n_categories during transform, ' 'clipping them.') y_transformed = assert_warns_message(UserWarning, w, encoder.transform, y) assert_array_equal( y_transformed, np.array([[1., 1., 1., 0., 1., 0., 0.]])) - encoder = UnaryEncoder(n_values=[3, 2, 2]) + encoder = UnaryEncoder(categories=[3, 2, 2]) y_transformed = assert_warns_message(UserWarning, w, encoder.fit_transform, y) assert_array_equal( y_transformed, np.array([[1., 1., 1., 1.]])) - encoder = UnaryEncoder(n_values=[5, 2, 2]) + encoder = UnaryEncoder(categories=[5, 2, 2]) assert_array_equal( encoder.fit_transform(y), np.array([[1., 1., 1., 1., 1., 1.]])) @@ -804,38 +804,38 @@ def test_unary_encoder_handle_greater(): def test_unary_encoder_errors(): rng = np.random.RandomState(6) - n_values = rng.randint(2, 10) + categories = rng.randint(2, 10) size = rng.randint(1, 10) n_features = rng.randint(2, 10) delta = rng.randint(1, 10) - encoder = UnaryEncoder(n_values) - X = _generate_random_features_matrix(size, n_features, n_values) + encoder = UnaryEncoder(categories) + X = _generate_random_features_matrix(size, n_features, categories) encoder.fit(X) # test that an error is raised when different shape larger_n_features = n_features + delta X_too_large = _generate_random_features_matrix(size, larger_n_features, - n_values) + categories) assert_raises(ValueError, encoder.transform, X_too_large) error_msg = ("X has different shape than during fitting." 
" Expected {}, got {}.".format(n_features, larger_n_features)) assert_raises_regex(ValueError, error_msg, encoder.transform, X_too_large) # test that an error is raised when out of bounds - encoder = UnaryEncoder(n_values, handle_greater='error') - X = _generate_random_features_matrix(size, n_features, n_values) + encoder = UnaryEncoder(categories, handle_greater='error') + X = _generate_random_features_matrix(size, n_features, categories) encoder.fit(X) - X[0][0] = n_values + delta + X[0][0] = categories + delta X_out_of_bounds = X assert_raises(ValueError, encoder.transform, X_out_of_bounds) error_msg = ("handle_greater='error' but found 1 feature values which " - "exceeds n_values during transform.") + "exceeds n_categories during transform.") assert_raises_regex(ValueError, error_msg, encoder.transform, X_out_of_bounds) # test exception on wrong init param - assert_raises(TypeError, UnaryEncoder(n_values=np.int).fit, X) + assert_raises(TypeError, UnaryEncoder(categories=np.int).fit, X) # test negative input to fit encoder = UnaryEncoder() @@ -867,63 +867,63 @@ def test_unary_encoder_edge_cases(): assert_array_equal(transformed, expected_matrix) -def test_unary_encoder_n_values_int(): - # Test UnaryEncoder's n_values parameter when set as an int. +def test_unary_encoder_categories_int(): + # Test UnaryEncoder's categories parameter when set as an int. 
rng = np.random.RandomState(6) - n_values = rng.randint(2, 10) + categories = rng.randint(2, 10) size = rng.randint(1, 10) n_features = rng.randint(2, 10) delta = rng.randint(1, 10) - encoder_n_values = n_values + delta - unary_n_values = encoder_n_values - 1 - enc = UnaryEncoder(n_values=encoder_n_values) + encoder_categories = categories + delta + unary_categories = encoder_categories - 1 + enc = UnaryEncoder(categories=encoder_categories) - X = _generate_random_features_matrix(size, n_features, n_values) + X = _generate_random_features_matrix(size, n_features, categories) X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (size, unary_n_values * n_features)) + assert_equal(X_trans.shape, (size, unary_categories * n_features)) assert_array_equal( enc.feature_indices_, - np.arange(0, (unary_n_values * n_features) + 1, unary_n_values) + np.arange(0, (unary_categories * n_features) + 1, unary_categories) ) assert_array_equal( - enc.n_values_, - np.array([encoder_n_values] * n_features) + enc.categories_, + np.array([encoder_categories] * n_features) ) -def test_unary_encoder_n_values_array(): - # Test UnaryEncoder's n_values parameter when set as an array. +def test_unary_encoder_categories_array(): + # Test UnaryEncoder's categories parameter when set as an array. 
rng = np.random.RandomState(6) - n_values = rng.randint(2, 10) + categories = rng.randint(2, 10) size = rng.randint(1, 10) n_features = rng.randint(2, 10) delta = rng.randint(1, 10) # Test ideal case is working fine - X = _generate_random_features_matrix(size, n_features, n_values) - n_values_array = list(np.max(X, axis=0) + 1) - enc = UnaryEncoder(n_values=n_values_array) + X = _generate_random_features_matrix(size, n_features, categories) + categories_array = list(np.max(X, axis=0) + 1) + enc = UnaryEncoder(categories=categories_array) X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (size, sum(n_values_array) - n_features)) + assert_equal(X_trans.shape, (size, sum(categories_array) - n_features)) assert_array_equal( enc.feature_indices_, - np.cumsum(np.array([1] + n_values_array) - 1) + np.cumsum(np.array([1] + categories_array) - 1) ) assert_array_equal( - enc.n_values_, - np.array(n_values_array) + enc.categories_, + np.array(categories_array) ) - # Test that fit_transform raises error when len(n_values) != n_features - n_values_array = rng.randint(2, 10, n_features + delta) - enc = UnaryEncoder(n_values=n_values_array) - X = _generate_random_features_matrix(size, n_features, n_values) + # Test that fit_transform raises error when len(categories) != n_features + categories_array = rng.randint(2, 10, n_features + delta) + enc = UnaryEncoder(categories=categories_array) + X = _generate_random_features_matrix(size, n_features, categories) assert_raises(ValueError, enc.fit_transform, X) - # Test that fit_transform raises error when len(n_values) != n_features - enc = UnaryEncoder(n_values=[]) - X = _generate_random_features_matrix(size, n_features, n_values) + # Test that fit_transform raises error when len(categories) != n_features + enc = UnaryEncoder(categories=[]) + X = _generate_random_features_matrix(size, n_features, categories) assert_raises(ValueError, enc.fit_transform, X) From d5ec0047ff9717b0147f3a26f5fa3a977bef8964 Mon Sep 17 00:00:00 2001 
From: Nicolas Hug Date: Fri, 4 Jan 2019 09:48:40 -0500 Subject: [PATCH 26/34] Added example for combining with OrdinalEncoder in user guide --- doc/modules/preprocessing.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 5dd77cf3a24e8..64285f79e85f0 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -598,6 +598,19 @@ set ``categories``. For example,:: models, since those already work on the basis of a particular feature value being less or bigger than a threshold. +It is possible to combine :class:`UnaryEncoder` and :class:`OrdinalEncoder` +into a :class:`Pipeline ` like so:: + + >>> from sklearn.pipeline import make_pipeline + >>> from sklearn.preprocessing import OrdinalEncoder, UnaryEncoder + >>> categories = [['small', 'medium', 'huge']] + >>> pipeline = make_pipeline(OrdinalEncoder(categories), UnaryEncoder()) + >>> X = [['small'], ['medium'], ['huge']] + >>> pipeline.fit_transform(X) + array([[0., 0.], + [1., 0.], + [1., 1.]]) + .. 
_preprocessing_discretization: From e1f2f3fbda64a6c14b8082357c9103897a6665d4 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Fri, 4 Jan 2019 09:51:31 -0500 Subject: [PATCH 27/34] Removed six usage --- sklearn/preprocessing/_encoders.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d80d70de58c9a..d4d0ba22db077 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -964,8 +964,7 @@ def fit(self, X, y=None): raise ValueError("X needs to contain only non-negative integers.") _, n_features = X.shape - if (isinstance(self.categories, six.string_types) and - self.categories == 'auto'): + if isinstance(self.categories, str) and self.categories == 'auto': categories = np.max(X, axis=0) + 1 elif isinstance(self.categories, numbers.Integral): categories = np.empty(n_features, dtype=np.int) From 54861d2f7703449bd59bd4013f901ffdd1d117e6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 7 Jan 2019 09:40:51 -0500 Subject: [PATCH 28/34] Addressed most comments from Joel --- sklearn/preprocessing/_encoders.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d4d0ba22db077..5bf9b1a1171cb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -263,7 +263,7 @@ class OneHotEncoder(_BaseEncoder): -------- sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) encoding of the categorical features. - sklearn.preprocessing.UnaryEncoder: performs a unary encoding of ordinal + sklearn.preprocessing.UnaryEncoder : performs a unary encoding of ordinal data. sklearn.feature_extraction.DictVectorizer : performs a one-hot encoding of dictionary items (also handles string-valued features). 
@@ -768,7 +768,7 @@ class OrdinalEncoder(_BaseEncoder): -------- sklearn.preprocessing.OneHotEncoder : performs a one-hot encoding of categorical features. - sklearn.preprocessing.UnaryEncoder: performs a unary encoding of ordinal + sklearn.preprocessing.UnaryEncoder : performs a unary encoding of ordinal data. sklearn.preprocessing.LabelEncoder : encodes target labels with values between 0 and n_classes-1. @@ -913,8 +913,8 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Examples -------- Given a dataset with three features and four samples, we let the encoder - find the maximum value per feature and transform the data to a binary - unary encoding. + find the maximum value per feature and transform the data to a unary + encoding. >>> from sklearn.preprocessing import UnaryEncoder >>> enc = UnaryEncoder() @@ -933,12 +933,10 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): See also -------- - sklearn.preprocessing.OneHotEncoder: encodes categorical integer features + sklearn.preprocessing.OneHotEncoder : encodes categorical integer features using a one-hot aka one-of-K scheme. sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) encoding of the categorical features. - sklearn.compose.ColumnTransformer: Applies transformers to columns of an - array. """ def __init__(self, categories="auto", dtype=np.float64, sparse=False, handle_greater='warn'): @@ -959,7 +957,7 @@ def fit(self, X, y=None): X = check_array(X, dtype=np.int) if self.handle_greater not in ['warn', 'error', 'clip']: raise ValueError("handle_greater should be either 'warn', 'error' " - "or 'clip' got %s" % self.handle_greater) + "or 'clip'. 
got %s" % self.handle_greater) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") _, n_features = X.shape @@ -1082,15 +1080,17 @@ def inverse_transform(self, X): # return float dtype, even though it will contain int values X_tr = np.zeros((n_samples, n_features), dtype=np.float) - j = 0 - for i in range(n_features): - n_columns = self.categories_[i] - 1 + for feature_idx, (start, stop) in enumerate(zip( + self.feature_indices_, + self.feature_indices_[1:])): - sub = X[:, j:j + n_columns] + # sub = portion of the tranformed matrix that correspond to the + # current feature + sub = X[:, start:stop] + # the original category is the sum of the (binary) columns, or + # equivalently the position of the first 0. categories = sub.sum(axis=1).ravel() - X_tr[:, i] = categories - - j += n_columns + X_tr[:, feature_idx] = categories return X_tr From 192e042236618a592568e3c6d697bfa68de60179 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 7 Jan 2019 09:47:24 -0500 Subject: [PATCH 29/34] typos --- sklearn/preprocessing/_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 5bf9b1a1171cb..9f8fc3a71a26d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1084,7 +1084,7 @@ def inverse_transform(self, X): self.feature_indices_, self.feature_indices_[1:])): - # sub = portion of the tranformed matrix that correspond to the + # sub = portion of the transformed matrix that corresponds to the # current feature sub = X[:, start:stop] From 57538f04d7899ecc1f3e2871f9126b1cc8ccd720 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 8 Jan 2019 09:25:05 -0500 Subject: [PATCH 30/34] inverse transform now accepts non-binary input --- sklearn/preprocessing/_encoders.py | 5 ++--- sklearn/preprocessing/tests/test_encoders.py | 9 +++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git 
a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 9f8fc3a71a26d..5dbdc43105faa 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1088,9 +1088,8 @@ def inverse_transform(self, X): # current feature sub = X[:, start:stop] - # the original category is the sum of the (binary) columns, or - # equivalently the position of the first 0. - categories = sub.sum(axis=1).ravel() + # the original category is the number of non-zero columns + categories = (sub != 0).sum(axis=1).ravel() X_tr[:, feature_idx] = categories return X_tr diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 48492e72b976e..39241876be3a7 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -950,3 +950,12 @@ def test_unary_encoder_inverse_transform_input(): "Shape of the passed X data is not correct. Expected 3 columns, got 4", enc.inverse_transform, bad_X_tr ) + + # Also check that inverse_transform still works on non-binary matrices. + # Non-zero values are treated as ones.
+ X_inv = enc.inverse_transform([[4, 100, 0]]) # Treated as [1, 1, 0]) + assert_array_equal(X_inv, [[1, 1]]) + X_inv = enc.inverse_transform([[4, 100, 123]]) # Treated as [1, 1, 1]) + assert_array_equal(X_inv, [[1, 2]]) + X_inv = enc.inverse_transform([[0, 1, 123]]) # Treated as [0, 1, 1]) + assert_array_equal(X_inv, [[0, 2]]) \ No newline at end of file From 7af673b442b0c34b9b1d062da0f2958f7b8c3525 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 8 Jan 2019 09:27:32 -0500 Subject: [PATCH 31/34] newline EOF --- sklearn/preprocessing/tests/test_encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 39241876be3a7..5ef26055cbcf1 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -958,4 +958,4 @@ def test_unary_encoder_inverse_transform_input(): X_inv = enc.inverse_transform([[4, 100, 123]]) # Treated as [1, 1, 1]) assert_array_equal(X_inv, [[1, 2]]) X_inv = enc.inverse_transform([[0, 1, 123]]) # Treated as [0, 1, 1]) - assert_array_equal(X_inv, [[0, 2]]) \ No newline at end of file + assert_array_equal(X_inv, [[0, 2]]) From e2c90dd340da722b14e43512d3e8bced2aa669b9 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Thu, 10 Jan 2019 09:27:33 -0500 Subject: [PATCH 32/34] Added section in OrdinalEncoder user guide for specifying categories --- doc/modules/preprocessing.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 64285f79e85f0..a51803d963fc0 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -475,6 +475,14 @@ new feature of integers (0 to n_categories - 1):: >>> enc.transform([['female', 'from US', 'uses Safari']]) array([[0., 1., 1.]]) +You can specify the order of the categories by passing the ``categories`` +attribute:: + >>> enc = 
preprocessing.OrdinalEncoder(categories=[['big', 'small'], + ... ['short', 'tall']]) + >>> X = [['big', 'tall']] + >>> enc.fit_transform(X) # doctest: +ELLIPSIS + array([[0., 1.]]) + Such integer representation can, however, not be used directly with all scikit-learn estimators, as these expect continuous input, and would interpret the categories as being ordered, which is often not desired (i.e. the set of From 5d69bf9dfed17f83a07aa2c22ab014c23878673a Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 14 Jan 2019 10:38:49 -0500 Subject: [PATCH 33/34] changed categories param to max_value --- doc/modules/preprocessing.rst | 14 +-- sklearn/preprocessing/_encoders.py | 76 ++++++++--------- sklearn/preprocessing/tests/test_encoders.py | 90 ++++++++++---------- 3 files changed, 90 insertions(+), 90 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index a51803d963fc0..820721cf12eb3 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -574,8 +574,8 @@ ith feature is active if x > i. For example:: >>> enc = preprocessing.UnaryEncoder() >>> enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(categories='auto', dtype=<... 'numpy.float64'>, - handle_greater='warn', sparse=False) + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', + max_value='auto', sparse=False) >>> enc.transform([[0, 1, 3]]) array([[0., 1., 0., 1., 1., 1.]]) @@ -585,16 +585,16 @@ feature is transformed into 3 columns. By default, the number of categories in a feature is inferred automatically from the dataset by looking for the maximum value. It is possible to specify -this explicitly using the parameter ``categories``. In particular if the +this explicitly using the parameter ``max_value``. In particular if the training data might have missing categorical features, one has to explicitly -set ``categories``. For example,:: +set ``max_value``. 
For example,:: - >>> enc = preprocessing.UnaryEncoder(categories=[2, 3, 4]) + >>> enc = preprocessing.UnaryEncoder(max_value=[2, 3, 4]) >>> # Note that there are missing categorical values for the 2nd and 3rd >>> # features >>> enc.fit([[1, 2, 3], [0, 2, 0]]) # doctest: +ELLIPSIS - UnaryEncoder(categories=[2, 3, 4], dtype=<... 'numpy.float64'>, - handle_greater='warn', sparse=False) + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', + max_value=[2, 3, 4], sparse=False) >>> enc.transform([[1, 1, 2]]) array([[1., 1., 0., 1., 1., 0.]]) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 5dbdc43105faa..e2dc69560c23d 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -875,16 +875,16 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Parameters ---------- - categories : 'auto', int or array of ints, optional (default='auto') + max_value : 'auto', int or array of ints, optional (default='auto') Number of categories per feature. - 'auto' : determine value range from training data by looking for the maximum. - int : number of ordinal values per feature. - Each feature value should be in ``range(categories)`` - - array : ``categories[i]`` is the number of ordinal values in + Each feature value should be in ``range(max_value)`` + - array : ``max_value[i]`` is the number of ordinal values in ``X[:, i]``. Each feature value should be in - ``range(categories[i])`` + ``range(max_value[i])`` dtype : number type, optional (default=np.float) Desired dtype of output. @@ -894,11 +894,11 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): handle_greater : str, 'warn', 'error' or 'clip', optional (default='warn') Whether to raise an error or clip or warn if an - ordinal feature >= n_categories is passed in. + ordinal feature >= max_value is passed in. - - 'error': raise error if feature >= n_categories is passed in. 
- - 'clip': all the feature values >= n_categories are clipped to - (n_categories - 1) during transform. + - 'error': raise error if feature >= max_value is passed in. + - 'clip': all the feature values >= max_value are clipped to + (max_value - 1) during transform. - 'warn': same as clip but with warning. Attributes @@ -907,7 +907,7 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): Feature ``i`` in the original data is mapped to columns from ``feature_indices_[i]`` to ``feature_indices_[i+1]`` - categories_ : array of shape (n_features,) + max_value_ : array of shape (n_features,) Maximum number of values per feature. Examples @@ -922,9 +922,9 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): ... [1, 1, 0], ... [0, 2, 1], ... [1, 0, 2]]) # doctest: +ELLIPSIS - UnaryEncoder(categories='auto', dtype=<... 'numpy.float64'>, - handle_greater='warn', sparse=False) - >>> enc.categories_ + UnaryEncoder(dtype=<... 'numpy.float64'>, handle_greater='warn', + max_value='auto', sparse=False) + >>> enc.max_value_ array([2, 3, 4]) >>> enc.feature_indices_ array([0, 1, 3, 6]) @@ -938,9 +938,9 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): sklearn.preprocessing.OrdinalEncoder : performs an ordinal (integer) encoding of the categorical features. 
""" - def __init__(self, categories="auto", dtype=np.float64, sparse=False, + def __init__(self, max_value="auto", dtype=np.float64, sparse=False, handle_greater='warn'): - self.categories = categories + self.max_value = max_value self.dtype = dtype self.sparse = sparse self.handle_greater = handle_greater @@ -962,33 +962,33 @@ def fit(self, X, y=None): raise ValueError("X needs to contain only non-negative integers.") _, n_features = X.shape - if isinstance(self.categories, str) and self.categories == 'auto': - categories = np.max(X, axis=0) + 1 - elif isinstance(self.categories, numbers.Integral): - categories = np.empty(n_features, dtype=np.int) - categories.fill(self.categories) + if isinstance(self.max_value, str) and self.max_value == 'auto': + max_value = np.max(X, axis=0) + 1 + elif isinstance(self.max_value, numbers.Integral): + max_value = np.empty(n_features, dtype=np.int) + max_value.fill(self.max_value) else: try: - categories = np.asarray(self.categories, dtype=int) + max_value = np.asarray(self.max_value, dtype=int) except (ValueError, TypeError): raise TypeError( - "Wrong type for parameter `categories`. Expected" - " 'auto', int or array of ints, got %r" % self.categories + "Wrong type for parameter `max_value`. 
Expected" + " 'auto', int or array of ints, got %r" % self.max_value ) - if categories.ndim < 1 or categories.shape[0] != X.shape[1]: - raise ValueError("Shape mismatch: if categories is an array," + if max_value.ndim < 1 or max_value.shape[0] != X.shape[1]: + raise ValueError("Shape mismatch: if max_value is an array," " it has to be of shape (n_features,).") - self.categories_ = categories - categories = np.hstack([[0], categories - 1]) - indices = np.cumsum(categories) + self.max_value_ = max_value + max_value = np.hstack([[0], max_value - 1]) + indices = np.cumsum(max_value) self.feature_indices_ = indices - if self.categories != 'auto' and self.handle_greater == 'error': - mask = (X >= self.categories_).ravel() + if self.max_value != 'auto' and self.handle_greater == 'error': + mask = (X >= self.max_value_).ravel() if np.any(mask): raise ValueError("handle_greater='error' but found %d feature" - " values which exceeds categories." + " values which exceeds max_value." % np.count_nonzero(mask)) return self @@ -1008,7 +1008,7 @@ def transform(self, X): (n_samples, n_encoded_features) Transformed input. """ - check_is_fitted(self, 'categories_') + check_is_fitted(self, 'max_value_') X = check_array(X, dtype=np.int) if np.any(X < 0): raise ValueError("X needs to contain only non-negative integers.") @@ -1020,22 +1020,22 @@ def transform(self, X): " Expected %d, got %d." % (indices.shape[0] - 1, n_features)) - # We clip those ordinal features of X that are greater than categories_ + # We clip those ordinal features of X that are greater than max_value_ # using mask if self.handle_greater is "clip". # This means, the row_indices and col_indices corresponding to the # greater ordinal feature are all filled with ones. - mask = (X >= self.categories_).ravel() + mask = (X >= self.max_value_).ravel() if np.any(mask): if self.handle_greater == 'warn': warnings.warn("Found %d feature values which exceeds " - "n_categories during transform, clipping them." 
+ "max_value during transform, clipping them." % np.count_nonzero(mask)) elif self.handle_greater == 'error': raise ValueError("handle_greater='error' but found %d feature" - " values which exceeds n_categories during " + " values which exceeds max_value during " "transform." % np.count_nonzero(mask)) - X_ceil = np.where(mask.reshape(X.shape), self.categories_ - 1, X) + X_ceil = np.where(mask.reshape(X.shape), self.max_value_ - 1, X) column_start = np.tile(indices[:-1], n_samples) column_end = (indices[:-1] + X_ceil).ravel() column_indices = np.hstack([np.arange(s, e) for s, e @@ -1064,11 +1064,11 @@ def inverse_transform(self, X): Inverse transformed array. """ - check_is_fitted(self, 'categories_') + check_is_fitted(self, 'max_value_') X = check_array(X, accept_sparse='csr', ensure_min_features=0) n_samples, _ = X.shape - n_features = len(self.categories_) + n_features = len(self.max_value_) n_encoded_features = self.feature_indices_[-1] # validate shape of passed X diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index 5ef26055cbcf1..59fd1e47a80e5 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -748,18 +748,18 @@ def test_unary_encoder_handle_greater(): # Test that encoder raises error for greater features during fit when # categories is explicitly set. 
- encoder = UnaryEncoder(handle_greater='error', categories=[2, 3, 4]) + encoder = UnaryEncoder(handle_greater='error', max_value=[2, 3, 4]) assert_raises(ValueError, encoder.fit, X) - encoder = UnaryEncoder(handle_greater='error', categories=[2, 3, 4]) + encoder = UnaryEncoder(handle_greater='error', max_value=[2, 3, 4]) assert_raises(ValueError, encoder.fit_transform, X) - encoder = UnaryEncoder(handle_greater='error', categories=[5, 2, 2]) + encoder = UnaryEncoder(handle_greater='error', max_value=[5, 2, 2]) encoder.fit(y) assert_array_equal(encoder.transform(y), np.array([[1., 1., 1., 1., 1., 1.]])) - encoder = UnaryEncoder(handle_greater='error', categories=[5, 2, 2]) + encoder = UnaryEncoder(handle_greater='error', max_value=[5, 2, 2]) assert_array_equal(encoder.fit_transform(y), np.array([[1., 1., 1., 1., 1., 1.]])) @@ -770,7 +770,7 @@ def test_unary_encoder_handle_greater(): encoder.transform(y), np.array([[1., 1., 1., 0., 1., 0., 0.]])) - encoder = UnaryEncoder(handle_greater='clip', categories=[3, 2, 2]) + encoder = UnaryEncoder(handle_greater='clip', max_value=[3, 2, 2]) assert_array_equal( encoder.fit_transform(y), np.array([[1., 1., 1., 1.]])) @@ -778,21 +778,21 @@ def test_unary_encoder_handle_greater(): # Test the warn option. 
encoder = UnaryEncoder() encoder.fit(X) - w = ('Found 1 feature values which exceeds n_categories during transform, ' + w = ('Found 1 feature values which exceeds max_value during transform, ' 'clipping them.') y_transformed = assert_warns_message(UserWarning, w, encoder.transform, y) assert_array_equal( y_transformed, np.array([[1., 1., 1., 0., 1., 0., 0.]])) - encoder = UnaryEncoder(categories=[3, 2, 2]) + encoder = UnaryEncoder(max_value=[3, 2, 2]) y_transformed = assert_warns_message(UserWarning, w, encoder.fit_transform, y) assert_array_equal( y_transformed, np.array([[1., 1., 1., 1.]])) - encoder = UnaryEncoder(categories=[5, 2, 2]) + encoder = UnaryEncoder(max_value=[5, 2, 2]) assert_array_equal( encoder.fit_transform(y), np.array([[1., 1., 1., 1., 1., 1.]])) @@ -804,38 +804,38 @@ def test_unary_encoder_handle_greater(): def test_unary_encoder_errors(): rng = np.random.RandomState(6) - categories = rng.randint(2, 10) + max_value = rng.randint(2, 10) size = rng.randint(1, 10) n_features = rng.randint(2, 10) delta = rng.randint(1, 10) - encoder = UnaryEncoder(categories) - X = _generate_random_features_matrix(size, n_features, categories) + encoder = UnaryEncoder(max_value) + X = _generate_random_features_matrix(size, n_features, max_value) encoder.fit(X) # test that an error is raised when different shape larger_n_features = n_features + delta X_too_large = _generate_random_features_matrix(size, larger_n_features, - categories) + max_value) assert_raises(ValueError, encoder.transform, X_too_large) error_msg = ("X has different shape than during fitting." 
" Expected {}, got {}.".format(n_features, larger_n_features)) assert_raises_regex(ValueError, error_msg, encoder.transform, X_too_large) # test that an error is raised when out of bounds - encoder = UnaryEncoder(categories, handle_greater='error') - X = _generate_random_features_matrix(size, n_features, categories) + encoder = UnaryEncoder(max_value, handle_greater='error') + X = _generate_random_features_matrix(size, n_features, max_value) encoder.fit(X) - X[0][0] = categories + delta + X[0][0] = max_value + delta X_out_of_bounds = X assert_raises(ValueError, encoder.transform, X_out_of_bounds) error_msg = ("handle_greater='error' but found 1 feature values which " - "exceeds n_categories during transform.") + "exceeds max_value during transform.") assert_raises_regex(ValueError, error_msg, encoder.transform, X_out_of_bounds) # test exception on wrong init param - assert_raises(TypeError, UnaryEncoder(categories=np.int).fit, X) + assert_raises(TypeError, UnaryEncoder(max_value=np.int).fit, X) # test negative input to fit encoder = UnaryEncoder() @@ -867,63 +867,63 @@ def test_unary_encoder_edge_cases(): assert_array_equal(transformed, expected_matrix) -def test_unary_encoder_categories_int(): - # Test UnaryEncoder's categories parameter when set as an int. +def test_unary_encoder_max_value_int(): + # Test UnaryEncoder's max_value parameter when set as an int. 
rng = np.random.RandomState(6) - categories = rng.randint(2, 10) + max_value = rng.randint(2, 10) size = rng.randint(1, 10) n_features = rng.randint(2, 10) delta = rng.randint(1, 10) - encoder_categories = categories + delta - unary_categories = encoder_categories - 1 - enc = UnaryEncoder(categories=encoder_categories) + encoder_max_value = max_value + delta + unary_max_value = encoder_max_value - 1 + enc = UnaryEncoder(max_value=encoder_max_value) - X = _generate_random_features_matrix(size, n_features, categories) + X = _generate_random_features_matrix(size, n_features, max_value) X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (size, unary_categories * n_features)) + assert_equal(X_trans.shape, (size, unary_max_value * n_features)) assert_array_equal( enc.feature_indices_, - np.arange(0, (unary_categories * n_features) + 1, unary_categories) + np.arange(0, (unary_max_value * n_features) + 1, unary_max_value) ) assert_array_equal( - enc.categories_, - np.array([encoder_categories] * n_features) + enc.max_value_, + np.array([encoder_max_value] * n_features) ) -def test_unary_encoder_categories_array(): - # Test UnaryEncoder's categories parameter when set as an array. +def test_unary_encoder_max_value_array(): + # Test UnaryEncoder's max_value parameter when set as an array. 
rng = np.random.RandomState(6) - categories = rng.randint(2, 10) + max_value = rng.randint(2, 10) size = rng.randint(1, 10) n_features = rng.randint(2, 10) delta = rng.randint(1, 10) # Test ideal case is working fine - X = _generate_random_features_matrix(size, n_features, categories) - categories_array = list(np.max(X, axis=0) + 1) - enc = UnaryEncoder(categories=categories_array) + X = _generate_random_features_matrix(size, n_features, max_value) + max_value_array = list(np.max(X, axis=0) + 1) + enc = UnaryEncoder(max_value=max_value_array) X_trans = enc.fit_transform(X) - assert_equal(X_trans.shape, (size, sum(categories_array) - n_features)) + assert_equal(X_trans.shape, (size, sum(max_value_array) - n_features)) assert_array_equal( enc.feature_indices_, - np.cumsum(np.array([1] + categories_array) - 1) + np.cumsum(np.array([1] + max_value_array) - 1) ) assert_array_equal( - enc.categories_, - np.array(categories_array) + enc.max_value_, + np.array(max_value_array) ) - # Test that fit_transform raises error when len(categories) != n_features - categories_array = rng.randint(2, 10, n_features + delta) - enc = UnaryEncoder(categories=categories_array) - X = _generate_random_features_matrix(size, n_features, categories) + # Test that fit_transform raises error when len(max_value) != n_features + max_value_array = rng.randint(2, 10, n_features + delta) + enc = UnaryEncoder(max_value=max_value_array) + X = _generate_random_features_matrix(size, n_features, max_value) assert_raises(ValueError, enc.fit_transform, X) - # Test that fit_transform raises error when len(categories) != n_features - enc = UnaryEncoder(categories=[]) - X = _generate_random_features_matrix(size, n_features, categories) + # Test that fit_transform raises error when len(max_value) != n_features + enc = UnaryEncoder(max_value=[]) + X = _generate_random_features_matrix(size, n_features, max_value) assert_raises(ValueError, enc.fit_transform, X) From 5350ba8b08131699d1be2040d4f07048abea795a Mon Sep 
17 00:00:00 2001 From: Nicolas Hug Date: Tue, 15 Jan 2019 08:27:58 -0500 Subject: [PATCH 34/34] Addressed comments --- doc/modules/preprocessing.rst | 6 ++++-- sklearn/preprocessing/_encoders.py | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 820721cf12eb3..d228ddb1cf58d 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -606,8 +606,10 @@ set ``max_value``. For example,:: models, since those already work on the basis of a particular feature value being less or bigger than a threshold. -It is possible to combine :class:`UnaryEncoder` and :class:`OrdinalEncoder` -into a :class:`Pipeline ` like so:: +In case the input variable is not represented as a number from 0 to +``max_value``, it is possible to combine :class:`UnaryEncoder` and +:class:`OrdinalEncoder` into a :class:`Pipeline <sklearn.pipeline.Pipeline>` +like so:: >>> from sklearn.pipeline import make_pipeline >>> from sklearn.preprocessing import OrdinalEncoder, UnaryEncoder diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index e2dc69560c23d..694fa7b0d6e8b 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -881,10 +881,9 @@ class UnaryEncoder(BaseEstimator, TransformerMixin): - 'auto' : determine value range from training data by looking for the maximum. - int : number of ordinal values per feature. - Each feature value should be in ``range(max_value)`` + Each feature value should be in [0, max_value). - array : ``max_value[i]`` is the number of ordinal values in - ``X[:, i]``. Each feature value should be in - ``range(max_value[i])`` + ``X[:, i]``. Each feature value should be in [0, max_value[i]). dtype : number type, optional (default=np.float) Desired dtype of output.