From 9dda91997aced54a069d3f0936f7d09433ba7dd7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 6 May 2019 12:03:00 -0400 Subject: [PATCH 1/9] WIP --- sklearn/preprocessing/_encoders.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 6d11e7907984a..474e2ab4c2e01 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -85,6 +85,7 @@ def _fit(self, X, handle_unknown='error'): " it has to be of shape (n_features,).") self.categories_ = [] + self.unfrequent_= [] for i in range(n_features): Xi = X_list[i] @@ -103,6 +104,9 @@ def _fit(self, X, handle_unknown='error'): " during fit".format(diff, i)) raise ValueError(msg) self.categories_.append(cats) + self.unfrequent_.append(self._find_unfrequent_categories(Xi)) + + def _find_unfrequent_categories(self): def _transform(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) @@ -317,7 +321,7 @@ class OneHotEncoder(_BaseEncoder): def __init__(self, n_values=None, categorical_features=None, categories=None, drop=None, sparse=True, dtype=np.float64, - handle_unknown='error'): + handle_unknown='error', max_levels=None): self.categories = categories self.sparse = sparse self.dtype = dtype @@ -325,6 +329,7 @@ def __init__(self, n_values=None, categorical_features=None, self.n_values = n_values self.categorical_features = categorical_features self.drop = drop + self.max_levels = max_levels # Deprecated attributes From 758191fe2d8d59c6febfd6c487fab995c7769e12 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 6 May 2019 14:25:47 -0400 Subject: [PATCH 2/9] WIP --- sklearn/preprocessing/_encoders.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 474e2ab4c2e01..212e7b8cf8ceb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -106,7 +106,10 @@ def _fit(self, X, handle_unknown='error'): self.categories_.append(cats) self.unfrequent_.append(self._find_unfrequent_categories(Xi)) - def _find_unfrequent_categories(self): + def _find_unfrequent_categories(self, Xi): + unique, counts= np.unique(Xi, return_counts=True) + indices = np.argsort(counts)[-self.max_levels:] + return unique[indices] def _transform(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) From 4cff102668d67c6094b6e537d104f59aeb947b62 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 7 May 2019 16:59:21 -0400 Subject: [PATCH 3/9] WIP --- sklearn/preprocessing/_encoders.py | 36 +++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 212e7b8cf8ceb..568097b43b6ba 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -85,7 +85,9 @@ def _fit(self, X, handle_unknown='error'): " it has to be of shape (n_features,).") self.categories_ = [] - self.unfrequent_= [] + self.infrequent_= [] + self._is_infrequent = [] + self.infrequent_indices_ = [] for i in range(n_features): Xi = X_list[i] @@ -104,12 +106,14 @@ def _fit(self, X, handle_unknown='error'): " during fit".format(diff, i)) raise ValueError(msg) self.categories_.append(cats) - self.unfrequent_.append(self._find_unfrequent_categories(Xi)) + inf, indices = self._find_infrequent_categories(Xi) + self.infrequent_.append(inf) + 
self.infrequent_indices_.append(indices) - def _find_unfrequent_categories(self, Xi): - unique, counts= np.unique(Xi, return_counts=True) - indices = np.argsort(counts)[-self.max_levels:] - return unique[indices] + def _find_infrequent_categories(self, Xi): + unique, counts = np.unique(Xi, return_counts=True) + indices = np.argsort(counts)[:-self.max_levels] + return unique[indices], indices def _transform(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) @@ -685,6 +689,18 @@ def _transform_new(self, X): """New implementation assuming categorical input""" # validation of X happens in _check_X called by _transform X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + hello = [] + for feature_idx in range(X_int.shape[1]): + col = X_int[:, feature_idx] + if self.infrequent_[feature_idx].size > 0: + mapping = np.arange(len(self.categories_[feature_idx])) + for i in self.infrequent_indices_[feature_idx]: + mapping[i] = np.iinfo(col.dtype).max + + from .label import _encode_numpy + _, encoded_mapping = _encode_numpy(mapping, encode=True) + col[:] = encoded_mapping[col] + hello.append(encoded_mapping) n_samples, n_features = X_int.shape @@ -694,12 +710,16 @@ def _transform_new(self, X): # We remove all the dropped categories from mask, and decrement all # categories that occur after them to avoid an empty column. + if not isinstance(self.drop, str): # drop is not 'first' + for i in range(to_drop.shape[1]): + to_drop[0][i] = hello[i][to_drop[0][i]] + keep_cells = X_int != to_drop X_mask &= keep_cells X_int[X_int > to_drop] -= 1 - n_values = [len(cats) - 1 for cats in self.categories_] + n_values = [len(cats) - len(inf) for (cats, inf) in zip(self.categories_, self.infrequent_)] else: - n_values = [len(cats) for cats in self.categories_] + n_values = [len(cats) - len(inf) + 1 for (cats, inf) in zip(self.categories_, self.infrequent_)] mask = X_mask.ravel() n_values = np.array([0] + n_values) From d2a1a06bae3f09be489efc73f0e5008be4b0c47b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 8 May 2019 11:45:59 -0400 Subject: [PATCH 4/9] some tests --- sklearn/preprocessing/_encoders.py | 108 ++++++++++++------- sklearn/preprocessing/tests/test_encoders.py | 92 ++++++++++++++++ 2 files changed, 164 insertions(+), 36 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 568097b43b6ba..17232c8707cc6 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -16,7 +16,7 @@ from ..utils.validation import check_is_fitted from .base import _transform_selected -from .label import _encode, _encode_check_unknown +from .label import _encode, _encode_check_unknown, _encode_numpy __all__ = [ @@ -85,8 +85,6 @@ def _fit(self, X, handle_unknown='error'): " it has to be of shape (n_features,).") self.categories_ = [] - self.infrequent_= [] - self._is_infrequent = [] self.infrequent_indices_ = [] for i in range(n_features): @@ -106,14 +104,18 @@ def _fit(self, X, handle_unknown='error'): " during fit".format(diff, i)) raise ValueError(msg) self.categories_.append(cats) - inf, indices = self._find_infrequent_categories(Xi) - self.infrequent_.append(inf) - self.infrequent_indices_.append(indices) - def _find_infrequent_categories(self, Xi): - unique, counts = np.unique(Xi, return_counts=True) - indices = np.argsort(counts)[:-self.max_levels] - return unique[indices], indices + if self.max_levels is not None: + infrequent_indices = self._find_infrequent_category_indices(Xi) + else: + 
infrequent_indices = np.array([])
+            self.infrequent_indices_.append(infrequent_indices)
+
+    def _find_infrequent_category_indices(self, Xi):
+        # TODO: this is using unique on X again. Ideally we should integrate
+        # this into _encode()
+        _, counts = np.unique(Xi, return_counts=True)
+        return np.argsort(counts)[:-self.max_levels]
 
     def _transform(self, X, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)
@@ -148,6 +150,29 @@ def _transform(self, X, handle_unknown='error'):
             _, encoded = _encode(Xi, self.categories_[i], encode=True)
             X_int[:, i] = encoded
 
+        # We need to take care of infrequent categories here. We want all the
+        # infrequent categories to end up in a specific column, after all the
+        # frequent ones. Let's say we have 4 categories with 2 infrequent
+        # categories (and 2 frequent categories): we want the value in X_int
+        # for the infrequent categories to be 2 (third column), and the values
+        # for the frequent ones to be 0 and 1. The piece of code below
+        # performs this mapping.
+        # TODO: maybe integrate this part with the one above
+        self._infrequent_mappings = {}
+        huge_int = np.iinfo(X_int.dtype).max
+        for feature_idx in range(n_features):
+            if self.infrequent_indices_[feature_idx].size > 0:
+                mapping = np.arange(len(self.categories_[feature_idx]))
+                # Trick: set the infrequent categories to a very big int and
+                # encode again.
+                for ordinal_cat in self.infrequent_indices_[feature_idx]:
+                    mapping[ordinal_cat] = huge_int
+                _, mapping = _encode_numpy(mapping, encode=True)
+
+                # update X_int and save mapping for later (for dropping logic)
+                X_int[:, feature_idx] = mapping[X_int[:, feature_idx]]
+                self._infrequent_mappings[feature_idx] = mapping
+
         return X_int, X_mask
@@ -254,6 +279,10 @@ class OneHotEncoder(_BaseEncoder):
         be dropped for each feature. None if all the transformed features
         will be retained.
 
+    infrequent_indices_ : list of arrays of shape (n_infrequent_categories,)
+        ``infrequent_indices_[i]`` contains a list of indices in
+        ``categories_[i]`` corresponding to the infrequent categories.
+
     active_features_ : array
         Indices for active features, meaning values that actually occur
         in the training set. Only available when n_values is ``'auto'``.
@@ -292,7 +321,7 @@ class OneHotEncoder(_BaseEncoder):
     ... # doctest: +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features=None, categories=None, drop=None,
            dtype=<... 'numpy.float64'>, handle_unknown='ignore',
-           n_values=None, sparse=True)
+           max_levels=None, n_values=None, sparse=True)
 
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
@@ -689,40 +718,40 @@ def _transform_new(self, X):
         """New implementation assuming categorical input"""
         # validation of X happens in _check_X called by _transform
         X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
-        hello = []
-        for feature_idx in range(X_int.shape[1]):
-            col = X_int[:, feature_idx]
-            if self.infrequent_[feature_idx].size > 0:
-                mapping = np.arange(len(self.categories_[feature_idx]))
-                for i in self.infrequent_indices_[feature_idx]:
-                    mapping[i] = np.iinfo(col.dtype).max
-
-                from .label import _encode_numpy
-                _, encoded_mapping = _encode_numpy(mapping, encode=True)
-                col[:] = encoded_mapping[col]
-                hello.append(encoded_mapping)
-
         n_samples, n_features = X_int.shape
 
         if self.drop is not None:
-            to_drop = self.drop_idx_.reshape(1, -1)
+            to_drop = self.drop_idx_.copy()
 
-            # We remove all the dropped categories from mask, and decrement all
-            # categories that occur after them to avoid an empty column.
+            if not isinstance(self.drop, str):
+                # if drop is not 'first', we need to remap the dropped indexes
+                # if some of the categories are infrequent.
+                for feature_idx in range(n_features):
+                    if self.infrequent_indices_[feature_idx].size > 0:
+                        mapping = self._infrequent_mappings[feature_idx]
+                        to_drop[feature_idx] = mapping[to_drop[feature_idx]]
 
-            if not isinstance(self.drop, str):  # drop is not 'first'
-                for i in range(to_drop.shape[1]):
-                    to_drop[0][i] = hello[i][to_drop[0][i]]
+            # We remove all the dropped categories from mask, and decrement
+            # all categories that occur after them to avoid an empty column.
+            to_drop = to_drop.reshape(1, -1)
             keep_cells = X_int != to_drop
             X_mask &= keep_cells
             X_int[X_int > to_drop] -= 1
-            n_values = [len(cats) - len(inf) for (cats, inf) in zip(self.categories_, self.infrequent_)]
-        else:
-            n_values = [len(cats) - len(inf) + 1 for (cats, inf) in zip(self.categories_, self.infrequent_)]
+
+        n_columns = [len(cats) for cats in self.categories_]
+        # update n_columns if there are infrequent categories, and if some of
+        # them have been dropped
+        for feature_idx, infrequent_idx in enumerate(self.infrequent_indices_):
+            if self.drop is not None:
+                n_columns[feature_idx] -= 1
+            n_infrequent = infrequent_idx.size
+            if n_infrequent > 0:
+                # still add 1 for the infrequent column
+                n_columns[feature_idx] += 1 - n_infrequent
 
         mask = X_mask.ravel()
-        n_values = np.array([0] + n_values)
+        n_values = np.array([0] + n_columns)
         feature_indices = np.cumsum(n_values)
         indices = (X_int + feature_indices[:-1]).ravel()[mask]
         indptr = X_mask.sum(axis=1).cumsum()
@@ -912,6 +941,10 @@ class OrdinalEncoder(_BaseEncoder):
         (in order of the features in X and corresponding with the output
         of ``transform``).
 
+    infrequent_indices_ : list of arrays of shape (n_infrequent_categories,)
+        ``infrequent_indices_[i]`` contains a list of indices in
+        ``categories_[i]`` corresponding to the infrequent categories.
+
     Examples
     --------
     Given a dataset with two features, we let the encoder find the unique
     >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
     >>> enc.fit(X)
     ... # doctest: +ELLIPSIS
-    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
+    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
+                   max_levels=None)
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
     >>> enc.transform([['Female', 3], ['Male', 1]])
@@ -941,9 +975,10 @@ class OrdinalEncoder(_BaseEncoder):
        between 0 and n_classes-1.
     """
 
-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64, max_levels=None):
         self.categories = categories
         self.dtype = dtype
+        self.max_levels=max_levels
 
     def fit(self, X, y=None):
         """Fit the OrdinalEncoder to X.
@@ -960,6 +995,7 @@ def fit(self, X, y=None):
         """
         # base classes uses _categories to deal with deprecations in
         # OneHoteEncoder: can be removed once deprecations are removed
+        # XXX tag 0.22
         self._categories = self.categories
         self._fit(X)
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index db201dcd58c15..eaa243437ad08 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -839,3 +839,95 @@ def test_categories(density, drop):
     assert cat_list[drop_idx] == drop_cat
     assert isinstance(ohe_test.drop_idx_, np.ndarray)
     assert ohe_test.drop_idx_.dtype == np.int_
+
+
+def test_infrequent_categories_sanity():
+    # Not a super legit test for now.
+    # Mostly aimed at explaining how the infrequent categories are handled.
+
+    X = [[0, 0, 1],
+         [1, 0, 0],
+         [3, 5, 1],
+         [3, 1, 0],
+         [3, 1, 1],
+         [3, 2, 0],
+         [1, 5, 1],
+         [0, 5, 0],
+         [3, 0, 1]]
+    X = np.array(X)
+
+    # Check infrequent_indices_ attribute
+    oe = OrdinalEncoder(max_levels=2)
+    X_trans = oe.fit_transform(X)
+    # first feature: category 0 is infrequent
+    # note that 1 is also infrequent but we want to keep 2 categories
+    assert len(oe.infrequent_indices_[0]) == 1
+    assert oe.categories_[0][oe.infrequent_indices_[0][0]] == 0
+    # second feature: categories 2 and 1 are infrequent
+    # 2 comes first because it has fewer occurrences than 1
+    assert len(oe.infrequent_indices_[1]) == 2
+    assert oe.categories_[1][oe.infrequent_indices_[1][0]] == 2
+    assert oe.categories_[1][oe.infrequent_indices_[1][1]] == 1
+    # third feature: no infrequent category
+    assert len(oe.infrequent_indices_[2]) == 0
+
+    # For ordinal encoder, the infrequent categories are assigned the highest
+    # integer.
+    expected_X_trans = [[2, 0, 1],
+                        [0, 0, 0],
+                        [1, 1, 1],
+                        [1, 2, 0],
+                        [1, 2, 1],
+                        [1, 2, 0],
+                        [0, 1, 1],
+                        [2, 1, 0],
+                        [1, 0, 1]]
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    ohe = OneHotEncoder(categories='auto', max_levels=2)
+    X_trans = ohe.fit_transform(X).toarray()
+    # first feature: 0 is treated as infrequent and ends up in
+    # X_trans[:, 2]
+    # second feature: 1 and 2 are treated as infrequent and end up in
+    # X_trans[:, 5]
+    # third feature: no infrequent category.
Represented by the 2 last
+    # columns
+    expected_X_trans = [[0, 0, 1, 1, 0, 0, 0, 1],
+                        [1, 0, 0, 1, 0, 0, 1, 0],
+                        [0, 1, 0, 0, 1, 0, 0, 1],
+                        [0, 1, 0, 0, 0, 1, 1, 0],
+                        [0, 1, 0, 0, 0, 1, 0, 1],
+                        [0, 1, 0, 0, 0, 1, 1, 0],
+                        [1, 0, 0, 0, 1, 0, 0, 1],
+                        [0, 0, 1, 0, 1, 0, 1, 0],
+                        [0, 1, 0, 1, 0, 0, 0, 1]]
+
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    # Dropping the first category works as expected
+    ohe = OneHotEncoder(categories='auto', max_levels=2, drop='first')
+    X_trans = ohe.fit_transform(X).toarray()
+    expected_X_trans = [[0, 1, 0, 0, 1],
+                        [0, 0, 0, 0, 0],
+                        [1, 0, 1, 0, 1],
+                        [1, 0, 0, 1, 0],
+                        [1, 0, 0, 1, 1],
+                        [1, 0, 0, 1, 0],
+                        [0, 0, 1, 0, 1],
+                        [0, 1, 1, 0, 0],
+                        [1, 0, 0, 0, 1]]
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    # Dropping explicit categories works as expected
+    ohe = OneHotEncoder(categories='auto', max_levels=2, drop=[3, 5, 1])
+    X_trans = ohe.fit_transform(X).toarray()
+    expected_X_trans = [[0, 1, 1, 0, 0],
+                        [1, 0, 1, 0, 1],
+                        [0, 0, 0, 0, 0],
+                        [0, 0, 0, 1, 1],
+                        [0, 0, 0, 1, 0],
+                        [0, 0, 0, 1, 1],
+                        [1, 0, 0, 0, 0],
+                        [0, 1, 0, 0, 1],
+                        [0, 0, 1, 0, 0]]
+    assert np.array_equal(X_trans, expected_X_trans)

From 0533761f17bb52d5f19e27152ca9f5e95305f9da Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 8 May 2019 13:20:37 -0400
Subject: [PATCH 5/9] added support for drop='infrequent'

---
 sklearn/preprocessing/_encoders.py           | 77 ++++++++++++++------
 sklearn/preprocessing/tests/test_encoders.py | 22 ++++++
 2 files changed, 78 insertions(+), 21 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 17232c8707cc6..524c6fff2c323 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -154,9 +154,9 @@ def _transform(self, X, handle_unknown='error'):
         # infrequent categories to end up in a specific column, after all the
         # frequent ones. Let's say we have 4 categories with 2 infrequent
         # categories (and 2 frequent categories): we want the value in X_int
-        # for the infrequent categories to be 2 (third column), and the values
-        # for the frequent ones to be 0 and 1. The piece of code below
-        # performs this mapping.
+        # for the infrequent categories to be 2 (third and last column), and
+        # the values for the frequent ones to be 0 and 1. The piece of code
+        # below performs this mapping.
         # TODO: maybe integrate this part with the one above
@@ -532,12 +532,28 @@ def fit(self, X, y=None):
         else:
             self._fit(X, handle_unknown=self.handle_unknown)
             self.drop_idx_ = self._compute_drop_idx()
+
+            # check if user wants to manually drop a category that is
+            # infrequent: this is not allowed
+            if self.drop is not None and not isinstance(self.drop, str):
+                for feature_idx, (infrequent_indices, drop_idx) in enumerate(
+                        zip(self.infrequent_indices_, self.drop_idx_)):
+                    if drop_idx in infrequent_indices:
+                        raise ValueError(
+                            "Category {} of feature {} is infrequent and thus "
+                            "cannot be dropped. 
Use drop='infrequent' " + "instead.".format( + self.categories_[feature_idx][drop_idx], + feature_idx + ) + ) return self def _compute_drop_idx(self): if self.drop is None: return None - elif (isinstance(self.drop, str) and self.drop == 'first'): + elif (isinstance(self.drop, str) and + self.drop in ('first', 'infrequent')): return np.zeros(len(self.categories_), dtype=np.int_) elif not isinstance(self.drop, str): try: @@ -720,12 +736,43 @@ def _transform_new(self, X): X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) n_samples, n_features = X_int.shape + # n_columns indicates, for each feature, how many columns are used in + # X_trans. By default this corresponds to the number of categories, but + # will differ if we drop some of them, or if there are infrequent + # categories (all mapped to the same column) + n_columns = [len(cats) for cats in self.categories_] + for feature_idx in range(n_features): + n_infrequent = self.infrequent_indices_[feature_idx].size + if n_infrequent > 0: + # still add 1 for the infrequent column + n_columns[feature_idx] += 1 - n_infrequent + if self.drop is not None: + # if drop is not None we always drop one column in general, + # except when drop is 'infrequent' and there is no infrequent + # category. + n_columns[feature_idx] -= 1 + if (isinstance(self.drop, str) and self.drop == 'infrequent' + and n_infrequent == 0): + n_columns[feature_idx] += 1 # revert decrement from above + if self.drop is not None: to_drop = self.drop_idx_.copy() - if not isinstance(self.drop, str): - # if drop is not 'first', we need to remap the dropped indexes - # if some of the categories are infrequent. + if isinstance(self.drop, str): + if self.drop == 'infrequent': + for feature_idx in range(n_features): + if self.infrequent_indices_[feature_idx].size > 0: + # drop the infrequent column (i.e. the last one) + to_drop[feature_idx] = n_columns[feature_idx] + else: + # no infrequent category, use special marker -1 + # so that no dropping happens for this feature + to_drop[feature_idx] = -1 + else: + # self.drop is an array of categories + # we need to remap the dropped indexes if some of the + # categories are infrequent. see _transform() for details + # about the mapping. for feature_idx in range(n_features): if self.infrequent_indices_[feature_idx].size > 0: mapping = self._infrequent_mappings[feature_idx] @@ -733,22 +780,10 @@ def _transform_new(self, X): # We remove all the dropped categories from mask, and decrement # all categories that occur after them to avoid an empty column. 
- to_drop = to_drop.reshape(1, -1) - keep_cells = X_int != to_drop + keep_cells = (X_int != to_drop) | (to_drop == -1) X_mask &= keep_cells - X_int[X_int > to_drop] -= 1 - - n_columns = [len(cats) for cats in self.categories_] - # update n_columns if there are infrequent categories, and if some of - # them have been dropped - for feature_idx, infrequent_idx in enumerate(self.infrequent_indices_): - if self.drop is not None: - n_columns[feature_idx] -= 1 - n_infrequent = infrequent_idx.size - if n_infrequent > 0: - # still add 1 for the infrequent column - n_columns[feature_idx] += 1 - n_infrequent + X_int[(X_int > to_drop) & (to_drop != -1)] -= 1 mask = X_mask.ravel() n_values = np.array([0] + n_columns) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index eaa243437ad08..d1f8a78659a8d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -931,3 +931,25 @@ def test_infrequent_categories_sanity(): [0, 1, 0, 0, 1], [0, 0, 1, 0, 0]] assert np.array_equal(X_trans, expected_X_trans) + + # Dropping the infrequent categories works as expected + ohe = OneHotEncoder(categories='auto', max_levels=2, drop='infrequent') + X_trans = ohe.fit_transform(X).toarray() + expected_X_trans = [[0, 0, 1, 0, 0, 1], + [1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [1, 0, 0, 1, 0, 1], + [0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1]] + + assert np.array_equal(X_trans, expected_X_trans) + + # Manually dropping a category that is infrequent is not allowed + ohe = OneHotEncoder(categories='auto', max_levels=2, drop=[3, 1, 1]) + err_msg = ("Category 1 of feature 1 is infrequent and thus cannot be " + "dropped") + with pytest.raises(ValueError, match=err_msg): + X_trans = ohe.fit(X) From 99352b6bfba4e37154252a7fc4aeb2ea85eb252b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 8 May 2019 13:23:38 -0400 Subject: [PATCH 6/9] comment --- sklearn/preprocessing/_encoders.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 524c6fff2c323..1bb7e7d8a8ddc 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -752,7 +752,7 @@ def _transform_new(self, X): # category. n_columns[feature_idx] -= 1 if (isinstance(self.drop, str) and self.drop == 'infrequent' - and n_infrequent == 0): + and n_infrequent == 0): n_columns[feature_idx] += 1 # revert decrement from above if self.drop is not None: @@ -769,10 +769,9 @@ def _transform_new(self, X): # so that no dropping happens for this feature to_drop[feature_idx] = -1 else: - # self.drop is an array of categories - # we need to remap the dropped indexes if some of the - # categories are infrequent. see _transform() for details - # about the mapping. + # self.drop is an array of categories. we need to remap the + # dropped indexes if some of the categories are infrequent. + # see _transform() for details about the mapping. 
for feature_idx in range(n_features): if self.infrequent_indices_[feature_idx].size > 0: mapping = self._infrequent_mappings[feature_idx] From 8a3b8273878bb4a7adf73392aa505b6dc85a33a6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 8 May 2019 14:22:19 -0400 Subject: [PATCH 7/9] pep8 --- doc/modules/preprocessing.rst | 9 +++++---- sklearn/preprocessing/_encoders.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4c68f9e635498..346ece76d42b7 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -482,7 +482,8 @@ new feature of integers (0 to n_categories - 1):: >>> enc = preprocessing.OrdinalEncoder() >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] >>> enc.fit(X) # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, + max_levels=None) >>> enc.transform([['female', 'from US', 'uses Safari']]) array([[0., 1., 1.]]) @@ -506,7 +507,7 @@ Continuing the example above:: >>> enc.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE OneHotEncoder(categorical_features=None, categories=None, drop=None, dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values=None, sparse=True) + max_levels=None, n_values=None, sparse=True) >>> enc.transform([['female', 'from US', 'uses Safari'], ... ['male', 'from Europe', 'uses Safari']]).toarray() array([[1., 0., 0., 1., 0., 1.], @@ -533,7 +534,7 @@ dataset:: OneHotEncoder(categorical_features=None, categories=[...], drop=None, dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values=None, sparse=True) + max_levels=None, n_values=None, sparse=True) >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray() array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]]) @@ -550,7 +551,7 @@ columns for this feature will be all zeros >>> enc.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE OneHotEncoder(categorical_features=None, categories=None, drop=None, dtype=<... 'numpy.float64'>, handle_unknown='ignore', - n_values=None, sparse=True) + max_levels=None, n_values=None, sparse=True) >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray() array([[1., 0., 0., 0., 0., 0.]]) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1bb7e7d8a8ddc..49771512eeadc 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1012,7 +1012,7 @@ class OrdinalEncoder(_BaseEncoder): def __init__(self, categories='auto', dtype=np.float64, max_levels=None): self.categories = categories self.dtype = dtype - self.max_levels=max_levels + self.max_levels = max_levels def fit(self, X, y=None): """Fit the OrdinalEncoder to X. 
From e110419f89cab558c93b75879c1ef34eab6f2826 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 4 Aug 2019 10:48:23 -0400 Subject: [PATCH 8/9] pep8 --- sklearn/preprocessing/_encoders.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d6e46740661b4..fdd8ec1012c77 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,7 +10,6 @@ from ..utils.fixes import _argmax from ..utils.validation import check_is_fitted -from .base import _transform_selected from .label import _encode, _encode_check_unknown, _encode_numpy From 69b738f24ba949a878c912d8cd8a7b9b23bec07e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 6 Aug 2019 09:08:13 -0400 Subject: [PATCH 9/9] Added docstring for max_levels --- sklearn/preprocessing/_encoders.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index fdd8ec1012c77..1036e7b67d4a9 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -2,6 +2,8 @@ # Joris Van den Bossche # License: BSD 3 clause +from numbers import Integral + import numpy as np from scipy import sparse @@ -78,6 +80,13 @@ def _fit(self, X, handle_unknown='error'): raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).") + if self.max_levels is not None: + if (not isinstance(self.max_levels, Integral) or + self.max_levels <= 0): + raise ValueError("max_levels must be None or a strictly " + "positive int, got {}.".format( + self.max_levels)) + self.categories_ = [] self.infrequent_indices_ = [] @@ -229,7 +238,11 @@ class OneHotEncoder(_BaseEncoder): - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that - should be dropped. + should be dropped. If ``drop[i]`` is an infrequent category, an + error is raised: it is only possible to drop all of the infrequent + categories, not just one of them. + - 'infrequent' : drop the infrequent categories column (see + ``max_levels`` parameter). sparse : boolean, default=True Will return sparse matrix if set True else will return an array. @@ -245,6 +258,10 @@ class OneHotEncoder(_BaseEncoder): will be all zeros. In the inverse transform, an unknown category will be denoted as None. + max_levels : int, default=None + Maximum number of categories to keep. Infrequent categories are + grouped together and mapped into a single column. + Attributes ---------- categories_ : list of arrays @@ -642,6 +659,10 @@ class OrdinalEncoder(_BaseEncoder): dtype : number type, default np.float64 Desired dtype of output. + max_levels : int, default=None + Maximum number of categories to keep. Infrequent categories are + grouped together and mapped to the highest int. + Attributes ---------- categories_ : list of arrays
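
The counting logic behind `_find_infrequent_category_indices` (patch 4) can be sketched in isolation. The snippet below is an illustration, not part of the series; it runs on the first feature of the test data above and needs only numpy. Variable names are illustrative.

    import numpy as np

    Xi = np.array([0, 1, 3, 3, 3, 3, 1, 0, 3])  # first column of the test X
    max_levels = 2

    categories, counts = np.unique(Xi, return_counts=True)
    # argsort orders the counts ascending, so dropping the last `max_levels`
    # entries leaves the indices of the infrequent categories
    infrequent_indices = np.argsort(counts)[:-max_levels]

    print(categories)          # [0 1 3]
    print(counts)              # [2 2 5]
    print(infrequent_indices)  # [0]: categories 0 and 1 tie, argsort breaks it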
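The remapping trick in `_transform` (patch 4) also works standalone. This sketch substitutes `np.unique(..., return_inverse=True)` for the private `_encode_numpy` helper, which computes the same thing on integer input: with 4 categories of which indices 1 and 3 are infrequent, the frequent categories get the compact codes 0 and 1 while both infrequent ones collapse onto the trailing code 2, as in the docstring example.

    import numpy as np

    n_categories = 4
    infrequent = np.array([1, 3])        # infrequent category indices

    mapping = np.arange(n_categories)    # [0 1 2 3]
    # push the infrequent positions past every frequent one, then re-encode
    mapping[infrequent] = np.iinfo(mapping.dtype).max
    _, mapping = np.unique(mapping, return_inverse=True)
    print(mapping)                       # [0 2 1 2]

    # applied to ordinally encoded data, all infrequent codes collapse to 2
    X_int = np.array([0, 1, 2, 3, 1])
    print(mapping[X_int])                # [0 2 1 2 2]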
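The column bookkeeping that patch 5 spreads through `_transform_new` reduces to one rule per feature: one column per frequent category, plus one shared column when the feature has infrequent categories, minus one when a column is dropped, except for drop='infrequent' on a feature with nothing to drop. A hypothetical helper (not in the patch) checked against the output widths asserted in the tests above:

    def n_output_columns(n_categories, n_infrequent, drop):
        n = n_categories
        if n_infrequent > 0:
            n += 1 - n_infrequent  # all infrequent categories share a column
        if drop is None:
            return n
        if drop == 'infrequent' and n_infrequent == 0:
            return n               # nothing to drop for this feature
        return n - 1               # 'first', 'infrequent' or explicit category

    # test X: features have (3, 4, 2) categories and (1, 2, 0) infrequent ones
    sizes = [(3, 1), (4, 2), (2, 0)]
    assert sum(n_output_columns(c, i, None) for c, i in sizes) == 8
    assert sum(n_output_columns(c, i, 'first') for c, i in sizes) == 5
    assert sum(n_output_columns(c, i, 'infrequent') for c, i in sizes) == 6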
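End to end, the new parameter is meant to be used as in the tests. This sketch only runs against this branch, since `max_levels` does not exist in any released scikit-learn.

    import numpy as np
    from sklearn.preprocessing import OrdinalEncoder

    X = np.array([[0], [1], [3], [3], [3], [3], [1], [0], [3]])

    oe = OrdinalEncoder(max_levels=2)
    X_trans = oe.fit_transform(X)

    # categories 1 and 3 keep the codes 0 and 1; the infrequent category 0
    # is mapped to the highest integer, here 2
    print(oe.infrequent_indices_)  # [array([0])]
    print(X_trans.ravel())         # [2. 0. 1. 1. 1. 1. 0. 2. 1.]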