From 9dda91997aced54a069d3f0936f7d09433ba7dd7 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 6 May 2019 12:03:00 -0400 Subject: [PATCH 1/9] WIP --- sklearn/preprocessing/_encoders.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 6d11e7907984a..474e2ab4c2e01 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -85,6 +85,7 @@ def _fit(self, X, handle_unknown='error'): " it has to be of shape (n_features,).") self.categories_ = [] + self.unfrequent_= [] for i in range(n_features): Xi = X_list[i] @@ -103,6 +104,9 @@ def _fit(self, X, handle_unknown='error'): " during fit".format(diff, i)) raise ValueError(msg) self.categories_.append(cats) + self.unfrequent_.append(self._find_unfrequent_categories(Xi)) + + def _find_unfrequent_categories(self): def _transform(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) @@ -317,7 +321,7 @@ class OneHotEncoder(_BaseEncoder): def __init__(self, n_values=None, categorical_features=None, categories=None, drop=None, sparse=True, dtype=np.float64, - handle_unknown='error'): + handle_unknown='error', max_levels=None): self.categories = categories self.sparse = sparse self.dtype = dtype @@ -325,6 +329,7 @@ def __init__(self, n_values=None, categorical_features=None, self.n_values = n_values self.categorical_features = categorical_features self.drop = drop + self.max_levels = max_levels # Deprecated attributes From 758191fe2d8d59c6febfd6c487fab995c7769e12 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Mon, 6 May 2019 14:25:47 -0400 Subject: [PATCH 2/9] WIP --- sklearn/preprocessing/_encoders.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 474e2ab4c2e01..212e7b8cf8ceb 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -106,7 +106,10 @@ def _fit(self, X, handle_unknown='error'): self.categories_.append(cats) self.unfrequent_.append(self._find_unfrequent_categories(Xi)) - def _find_unfrequent_categories(self): + def _find_unfrequent_categories(self, Xi): + unique, counts= np.unique(Xi, return_counts=True) + indices = np.argsort(counts)[-self.max_levels:] + return unique[indices] def _transform(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) From 4cff102668d67c6094b6e537d104f59aeb947b62 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 7 May 2019 16:59:21 -0400 Subject: [PATCH 3/9] WIP --- sklearn/preprocessing/_encoders.py | 36 +++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 212e7b8cf8ceb..568097b43b6ba 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -85,7 +85,9 @@ def _fit(self, X, handle_unknown='error'): " it has to be of shape (n_features,).") self.categories_ = [] - self.unfrequent_= [] + self.infrequent_= [] + self._is_infrequent = [] + self.infrequent_indices_ = [] for i in range(n_features): Xi = X_list[i] @@ -104,12 +106,14 @@ def _fit(self, X, handle_unknown='error'): " during fit".format(diff, i)) raise ValueError(msg) self.categories_.append(cats) - self.unfrequent_.append(self._find_unfrequent_categories(Xi)) + inf, indices = self._find_infrequent_categories(Xi) + self.infrequent_.append(inf) + 
self.infrequent_indices_.append(indices) - def _find_unfrequent_categories(self, Xi): - unique, counts= np.unique(Xi, return_counts=True) - indices = np.argsort(counts)[-self.max_levels:] - return unique[indices] + def _find_infrequent_categories(self, Xi): + unique, counts = np.unique(Xi, return_counts=True) + indices = np.argsort(counts)[:-self.max_levels] + return unique[indices], indices def _transform(self, X, handle_unknown='error'): X_list, n_samples, n_features = self._check_X(X) @@ -685,6 +689,18 @@ def _transform_new(self, X): """New implementation assuming categorical input""" # validation of X happens in _check_X called by _transform X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) + hello = [] + for feature_idx in range(X_int.shape[1]): + col = X_int[:, feature_idx] + if self.infrequent_[feature_idx].size > 0: + mapping = np.arange(len(self.categories_[feature_idx])) + for i in self.infrequent_indices_[feature_idx]: + mapping[i] = np.iinfo(col.dtype).max + + from .label import _encode_numpy + _, encoded_mapping = _encode_numpy(mapping, encode=True) + col[:] = encoded_mapping[col] + hello.append(encoded_mapping) n_samples, n_features = X_int.shape @@ -694,12 +710,16 @@ def _transform_new(self, X): # We remove all the dropped categories from mask, and decrement all # categories that occur after them to avoid an empty column. + if not isinstance(self.drop, str): # drop is not 'first' + for i in range(to_drop.shape[1]): + to_drop[0][i] = hello[i][to_drop[0][i]] + keep_cells = X_int != to_drop X_mask &= keep_cells X_int[X_int > to_drop] -= 1 - n_values = [len(cats) - 1 for cats in self.categories_] + n_values = [len(cats) - len(inf) for (cats, inf) in zip(self.categories_, self.infrequent_)] else: - n_values = [len(cats) for cats in self.categories_] + n_values = [len(cats) - len(inf) + 1 for (cats, inf) in zip(self.categories_, self.infrequent_)] mask = X_mask.ravel() n_values = np.array([0] + n_values) From d2a1a06bae3f09be489efc73f0e5008be4b0c47b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 8 May 2019 11:45:59 -0400 Subject: [PATCH 4/9] some tests --- sklearn/preprocessing/_encoders.py | 108 ++++++++++++------- sklearn/preprocessing/tests/test_encoders.py | 92 ++++++++++++++++ 2 files changed, 164 insertions(+), 36 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 568097b43b6ba..17232c8707cc6 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -16,7 +16,7 @@ from ..utils.validation import check_is_fitted from .base import _transform_selected -from .label import _encode, _encode_check_unknown +from .label import _encode, _encode_check_unknown, _encode_numpy __all__ = [ @@ -85,8 +85,6 @@ def _fit(self, X, handle_unknown='error'): " it has to be of shape (n_features,).") self.categories_ = [] - self.infrequent_= [] - self._is_infrequent = [] self.infrequent_indices_ = [] for i in range(n_features): @@ -106,14 +104,18 @@ def _fit(self, X, handle_unknown='error'): " during fit".format(diff, i)) raise ValueError(msg) self.categories_.append(cats) - inf, indices = self._find_infrequent_categories(Xi) - self.infrequent_.append(inf) - self.infrequent_indices_.append(indices) - def _find_infrequent_categories(self, Xi): - unique, counts = np.unique(Xi, return_counts=True) - indices = np.argsort(counts)[:-self.max_levels] - return unique[indices], indices + if self.max_levels is not None: + infrequent_indices = self._find_infrequent_category_indices(Xi) + else: + 
infrequent_indices = np.array([])
+            self.infrequent_indices_.append(infrequent_indices)
+
+    def _find_infrequent_category_indices(self, Xi):
+        # TODO: this is using unique on X again. Ideally we should integrate
+        # this into _encode()
+        _, counts = np.unique(Xi, return_counts=True)
+        return np.argsort(counts)[:-self.max_levels]
 
     def _transform(self, X, handle_unknown='error'):
         X_list, n_samples, n_features = self._check_X(X)
@@ -148,6 +150,29 @@ def _transform(self, X, handle_unknown='error'):
             _, encoded = _encode(Xi, self.categories_[i], encode=True)
             X_int[:, i] = encoded
 
+        # We need to take care of infrequent categories here. We want all the
+        # infrequent categories to end up in a specific column, after all the
+        # frequent ones. Let's say we have 4 categories with 2 infrequent
+        # categories (and 2 frequent categories): we want the value in X_int
+        # for the infrequent categories to be 2 (third column), and the values
+        # for the frequent ones to be 0 and 1. The piece of code below
+        # performs this mapping.
+        # TODO: maybe integrate this part with the one above
+        self._infrequent_mappings = {}
+        huge_int = np.iinfo(X_int.dtype).max
+        for feature_idx in range(n_features):
+            if self.infrequent_indices_[feature_idx].size > 0:
+                mapping = np.arange(len(self.categories_[feature_idx]))
+                # Trick: set the infrequent categories to a very big int and
+                # encode again.
+                for ordinal_cat in self.infrequent_indices_[feature_idx]:
+                    mapping[ordinal_cat] = huge_int
+                _, mapping = _encode_numpy(mapping, encode=True)
+
+                # update X_int and save mapping for later (for dropping logic)
+                X_int[:, feature_idx] = mapping[X_int[:, feature_idx]]
+                self._infrequent_mappings[feature_idx] = mapping
+
         return X_int, X_mask
@@ -254,6 +279,10 @@ class OneHotEncoder(_BaseEncoder):
         be dropped for each feature. None if all the transformed features
         will be retained.
 
+    infrequent_indices_ : list of arrays of shape (n_infrequent_categories,)
+        ``infrequent_indices_[i]`` contains a list of indices in
+        ``categories_[i]`` corresponding to the infrequent categories.
+
     active_features_ : array
         Indices for active features, meaning values that actually occur
         in the training set. Only available when n_values is ``'auto'``.
@@ -292,7 +321,7 @@ class OneHotEncoder(_BaseEncoder):
     ... # doctest: +NORMALIZE_WHITESPACE
     OneHotEncoder(categorical_features=None, categories=None, drop=None,
            dtype=<... 'numpy.float64'>, handle_unknown='ignore',
-           n_values=None, sparse=True)
+           max_levels=None, n_values=None, sparse=True)
 
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
@@ -689,40 +718,40 @@ def _transform_new(self, X):
         """New implementation assuming categorical input"""
         # validation of X happens in _check_X called by _transform
         X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
-        hello = []
-        for feature_idx in range(X_int.shape[1]):
-            col = X_int[:, feature_idx]
-            if self.infrequent_[feature_idx].size > 0:
-                mapping = np.arange(len(self.categories_[feature_idx]))
-                for i in self.infrequent_indices_[feature_idx]:
-                    mapping[i] = np.iinfo(col.dtype).max
-
-                from .label import _encode_numpy
-                _, encoded_mapping = _encode_numpy(mapping, encode=True)
-                col[:] = encoded_mapping[col]
-                hello.append(encoded_mapping)
-
         n_samples, n_features = X_int.shape
 
         if self.drop is not None:
-            to_drop = self.drop_idx_.reshape(1, -1)
+            to_drop = self.drop_idx_.copy()
 
-            # We remove all the dropped categories from mask, and decrement all
-            # categories that occur after them to avoid an empty column.
+            if not isinstance(self.drop, str):
+                # if drop is not 'first', we need to remap the dropped indexes
+                # if some of the categories are infrequent.
+                for feature_idx in range(n_features):
+                    if self.infrequent_indices_[feature_idx].size > 0:
+                        mapping = self._infrequent_mappings[feature_idx]
+                        to_drop[feature_idx] = mapping[to_drop[feature_idx]]
 
-            if not isinstance(self.drop, str):  # drop is not 'first'
-                for i in range(to_drop.shape[1]):
-                    to_drop[0][i] = hello[i][to_drop[0][i]]
+            # We remove all the dropped categories from mask, and decrement
+            # all categories that occur after them to avoid an empty column.
+            to_drop = to_drop.reshape(1, -1)
             keep_cells = X_int != to_drop
             X_mask &= keep_cells
             X_int[X_int > to_drop] -= 1
-            n_values = [len(cats) - len(inf) for (cats, inf) in zip(self.categories_, self.infrequent_)]
-        else:
-            n_values = [len(cats) - len(inf) + 1 for (cats, inf) in zip(self.categories_, self.infrequent_)]
+
+        n_columns = [len(cats) for cats in self.categories_]
+        # update n_columns if there are infrequent categories, and if some of
+        # them have been dropped
+        for feature_idx, infrequent_idx in enumerate(self.infrequent_indices_):
+            if self.drop is not None:
+                n_columns[feature_idx] -= 1
+            n_infrequent = infrequent_idx.size
+            if n_infrequent > 0:
+                # still add 1 for the infrequent column
+                n_columns[feature_idx] += 1 - n_infrequent
 
         mask = X_mask.ravel()
-        n_values = np.array([0] + n_values)
+        n_values = np.array([0] + n_columns)
         feature_indices = np.cumsum(n_values)
         indices = (X_int + feature_indices[:-1]).ravel()[mask]
         indptr = X_mask.sum(axis=1).cumsum()
@@ -912,6 +941,10 @@ class OrdinalEncoder(_BaseEncoder):
         (in order of the features in X and corresponding with the output
         of ``transform``).
 
+    infrequent_indices_ : list of arrays of shape (n_infrequent_categories,)
+        ``infrequent_indices_[i]`` contains a list of indices in
+        ``categories_[i]`` corresponding to the infrequent categories.
+
     Examples
     --------
     Given a dataset with two features, we let the encoder find the unique
     >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
     >>> enc.fit(X)
     ... # doctest: +ELLIPSIS
-    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>)
+    OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>,
+                   max_levels=None)
     >>> enc.categories_
     [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
     >>> enc.transform([['Female', 3], ['Male', 1]])
@@ -941,9 +975,10 @@ class OrdinalEncoder(_BaseEncoder):
        between 0 and n_classes-1.
     """
 
-    def __init__(self, categories='auto', dtype=np.float64):
+    def __init__(self, categories='auto', dtype=np.float64, max_levels=None):
         self.categories = categories
         self.dtype = dtype
+        self.max_levels=max_levels
 
     def fit(self, X, y=None):
         """Fit the OrdinalEncoder to X.
@@ -960,6 +995,7 @@ def fit(self, X, y=None):
         """
         # base classes uses _categories to deal with deprecations in
         # OneHoteEncoder: can be removed once deprecations are removed
+        # XXX tag 0.22
         self._categories = self.categories
         self._fit(X)
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
index db201dcd58c15..eaa243437ad08 100644
--- a/sklearn/preprocessing/tests/test_encoders.py
+++ b/sklearn/preprocessing/tests/test_encoders.py
@@ -839,3 +839,95 @@ def test_categories(density, drop):
     assert cat_list[drop_idx] == drop_cat
     assert isinstance(ohe_test.drop_idx_, np.ndarray)
     assert ohe_test.drop_idx_.dtype == np.int_
+
+
+def test_infrequent_categories_sanity():
+    # Not a super legit test for now.
+    # Mostly aimed at explaining how the infrequent categories are handled.
+
+    X = [[0, 0, 1],
+         [1, 0, 0],
+         [3, 5, 1],
+         [3, 1, 0],
+         [3, 1, 1],
+         [3, 2, 0],
+         [1, 5, 1],
+         [0, 5, 0],
+         [3, 0, 1]]
+    X = np.array(X)
+
+    # Check infrequent_indices_ attribute
+    oe = OrdinalEncoder(max_levels=2)
+    X_trans = oe.fit_transform(X)
+    # first feature: category 0 is infrequent
+    # note that 1 is also infrequent but we want to keep 2 categories
+    assert len(oe.infrequent_indices_[0]) == 1
+    assert oe.categories_[0][oe.infrequent_indices_[0][0]] == 0
+    # second feature: categories 2 and 1 are infrequent
+    # 2 comes first because it has fewer occurrences than 1
+    assert len(oe.infrequent_indices_[1]) == 2
+    assert oe.categories_[1][oe.infrequent_indices_[1][0]] == 2
+    assert oe.categories_[1][oe.infrequent_indices_[1][1]] == 1
+    # third feature: no infrequent category
+    assert len(oe.infrequent_indices_[2]) == 0
+
+    # For ordinal encoder, the infrequent categories are assigned the highest
+    # integer.
+    expected_X_trans = [[2, 0, 1],
+                        [0, 0, 0],
+                        [1, 1, 1],
+                        [1, 2, 0],
+                        [1, 2, 1],
+                        [1, 2, 0],
+                        [0, 1, 1],
+                        [2, 1, 0],
+                        [1, 0, 1]]
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    ohe = OneHotEncoder(categories='auto', max_levels=2)
+    X_trans = ohe.fit_transform(X).toarray()
+    # first feature: 0 is treated as infrequent and ends up in
+    # X_trans[:, 2]
+    # second feature: 1 and 2 are treated as infrequent and end up in
+    # X_trans[:, 5]
+    # third feature: no infrequent category.
Represented by the 2 last
+    # columns
+    expected_X_trans = [[0, 0, 1, 1, 0, 0, 0, 1],
+                        [1, 0, 0, 1, 0, 0, 1, 0],
+                        [0, 1, 0, 0, 1, 0, 0, 1],
+                        [0, 1, 0, 0, 0, 1, 1, 0],
+                        [0, 1, 0, 0, 0, 1, 0, 1],
+                        [0, 1, 0, 0, 0, 1, 1, 0],
+                        [1, 0, 0, 0, 1, 0, 0, 1],
+                        [0, 0, 1, 0, 1, 0, 1, 0],
+                        [0, 1, 0, 1, 0, 0, 0, 1]]
+
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    # Dropping the first category works as expected
+    ohe = OneHotEncoder(categories='auto', max_levels=2, drop='first')
+    X_trans = ohe.fit_transform(X).toarray()
+    expected_X_trans = [[0, 1, 0, 0, 1],
+                        [0, 0, 0, 0, 0],
+                        [1, 0, 1, 0, 1],
+                        [1, 0, 0, 1, 0],
+                        [1, 0, 0, 1, 1],
+                        [1, 0, 0, 1, 0],
+                        [0, 0, 1, 0, 1],
+                        [0, 1, 1, 0, 0],
+                        [1, 0, 0, 0, 1]]
+    assert np.array_equal(X_trans, expected_X_trans)
+
+    # Dropping explicit categories works as expected
+    ohe = OneHotEncoder(categories='auto', max_levels=2, drop=[3, 5, 1])
+    X_trans = ohe.fit_transform(X).toarray()
+    expected_X_trans = [[0, 1, 1, 0, 0],
+                        [1, 0, 1, 0, 1],
+                        [0, 0, 0, 0, 0],
+                        [0, 0, 0, 1, 1],
+                        [0, 0, 0, 1, 0],
+                        [0, 0, 0, 1, 1],
+                        [1, 0, 0, 0, 0],
+                        [0, 1, 0, 0, 1],
+                        [0, 0, 1, 0, 0]]
+    assert np.array_equal(X_trans, expected_X_trans)

From 0533761f17bb52d5f19e27152ca9f5e95305f9da Mon Sep 17 00:00:00 2001
From: Nicolas Hug
Date: Wed, 8 May 2019 13:20:37 -0400
Subject: [PATCH 5/9] added support for drop='infrequent'

---
 sklearn/preprocessing/_encoders.py           | 77 ++++++++++++++------
 sklearn/preprocessing/tests/test_encoders.py | 22 ++++++
 2 files changed, 78 insertions(+), 21 deletions(-)

diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
index 17232c8707cc6..524c6fff2c323 100644
--- a/sklearn/preprocessing/_encoders.py
+++ b/sklearn/preprocessing/_encoders.py
@@ -154,9 +154,9 @@ def _transform(self, X, handle_unknown='error'):
         # infrequent categories to end up in a specific column, after all the
         # frequent ones. Let's say we have 4 categories with 2 infrequent
         # categories (and 2 frequent categories): we want the value in X_int
-        # for the infrequent categories to be 2 (third column), and the values
-        # for the frequent ones to be 0 and 1. The piece of code below
-        # performs this mapping.
+        # for the infrequent categories to be 2 (third and last column), and
+        # the values for the frequent ones to be 0 and 1. The piece of code
+        # below performs this mapping.
         # TODO: maybe integrate this part with the one above
@@ -532,12 +532,28 @@ def fit(self, X, y=None):
         else:
             self._fit(X, handle_unknown=self.handle_unknown)
             self.drop_idx_ = self._compute_drop_idx()
+
+            # check if user wants to manually drop a category that is
+            # infrequent: this is not allowed
+            if self.drop is not None and not isinstance(self.drop, str):
+                for feature_idx, (infrequent_indices, drop_idx) in enumerate(
+                        zip(self.infrequent_indices_, self.drop_idx_)):
+                    if drop_idx in infrequent_indices:
+                        raise ValueError(
+                            "Category {} of feature {} is infrequent and thus "
+                            "cannot be dropped. 
Use drop='infrequent' " + "instead.".format( + self.categories_[feature_idx][drop_idx], + feature_idx + ) + ) return self def _compute_drop_idx(self): if self.drop is None: return None - elif (isinstance(self.drop, str) and self.drop == 'first'): + elif (isinstance(self.drop, str) and + self.drop in ('first', 'infrequent')): return np.zeros(len(self.categories_), dtype=np.int_) elif not isinstance(self.drop, str): try: @@ -720,12 +736,43 @@ def _transform_new(self, X): X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown) n_samples, n_features = X_int.shape + # n_columns indicates, for each feature, how many columns are used in + # X_trans. By default this corresponds to the number of categories, but + # will differ if we drop some of them, or if there are infrequent + # categories (all mapped to the same column) + n_columns = [len(cats) for cats in self.categories_] + for feature_idx in range(n_features): + n_infrequent = self.infrequent_indices_[feature_idx].size + if n_infrequent > 0: + # still add 1 for the infrequent column + n_columns[feature_idx] += 1 - n_infrequent + if self.drop is not None: + # if drop is not None we always drop one column in general, + # except when drop is 'infrequent' and there is no infrequent + # category. + n_columns[feature_idx] -= 1 + if (isinstance(self.drop, str) and self.drop == 'infrequent' + and n_infrequent == 0): + n_columns[feature_idx] += 1 # revert decrement from above + if self.drop is not None: to_drop = self.drop_idx_.copy() - if not isinstance(self.drop, str): - # if drop is not 'first', we need to remap the dropped indexes - # if some of the categories are infrequent. + if isinstance(self.drop, str): + if self.drop == 'infrequent': + for feature_idx in range(n_features): + if self.infrequent_indices_[feature_idx].size > 0: + # drop the infrequent column (i.e. the last one) + to_drop[feature_idx] = n_columns[feature_idx] + else: + # no infrequent category, use special marker -1 + # so that no dropping happens for this feature + to_drop[feature_idx] = -1 + else: + # self.drop is an array of categories + # we need to remap the dropped indexes if some of the + # categories are infrequent. see _transform() for details + # about the mapping. for feature_idx in range(n_features): if self.infrequent_indices_[feature_idx].size > 0: mapping = self._infrequent_mappings[feature_idx] @@ -733,22 +780,10 @@ def _transform_new(self, X): # We remove all the dropped categories from mask, and decrement # all categories that occur after them to avoid an empty column. 
- to_drop = to_drop.reshape(1, -1) - keep_cells = X_int != to_drop + keep_cells = (X_int != to_drop) | (to_drop == -1) X_mask &= keep_cells - X_int[X_int > to_drop] -= 1 - - n_columns = [len(cats) for cats in self.categories_] - # update n_columns if there are infrequent categories, and if some of - # them have been dropped - for feature_idx, infrequent_idx in enumerate(self.infrequent_indices_): - if self.drop is not None: - n_columns[feature_idx] -= 1 - n_infrequent = infrequent_idx.size - if n_infrequent > 0: - # still add 1 for the infrequent column - n_columns[feature_idx] += 1 - n_infrequent + X_int[(X_int > to_drop) & (to_drop != -1)] -= 1 mask = X_mask.ravel() n_values = np.array([0] + n_columns) diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py index eaa243437ad08..d1f8a78659a8d 100644 --- a/sklearn/preprocessing/tests/test_encoders.py +++ b/sklearn/preprocessing/tests/test_encoders.py @@ -931,3 +931,25 @@ def test_infrequent_categories_sanity(): [0, 1, 0, 0, 1], [0, 0, 1, 0, 0]] assert np.array_equal(X_trans, expected_X_trans) + + # Dropping the infrequent categories works as expected + ohe = OneHotEncoder(categories='auto', max_levels=2, drop='infrequent') + X_trans = ohe.fit_transform(X).toarray() + expected_X_trans = [[0, 0, 1, 0, 0, 1], + [1, 0, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 1], + [0, 1, 0, 0, 1, 0], + [0, 1, 0, 0, 0, 1], + [0, 1, 0, 0, 1, 0], + [1, 0, 0, 1, 0, 1], + [0, 0, 0, 1, 1, 0], + [0, 1, 1, 0, 0, 1]] + + assert np.array_equal(X_trans, expected_X_trans) + + # Manually dropping a category that is infrequent is not allowed + ohe = OneHotEncoder(categories='auto', max_levels=2, drop=[3, 1, 1]) + err_msg = ("Category 1 of feature 1 is infrequent and thus cannot be " + "dropped") + with pytest.raises(ValueError, match=err_msg): + X_trans = ohe.fit(X) From 99352b6bfba4e37154252a7fc4aeb2ea85eb252b Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 8 May 2019 13:23:38 -0400 Subject: [PATCH 6/9] comment --- sklearn/preprocessing/_encoders.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 524c6fff2c323..1bb7e7d8a8ddc 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -752,7 +752,7 @@ def _transform_new(self, X): # category. n_columns[feature_idx] -= 1 if (isinstance(self.drop, str) and self.drop == 'infrequent' - and n_infrequent == 0): + and n_infrequent == 0): n_columns[feature_idx] += 1 # revert decrement from above if self.drop is not None: @@ -769,10 +769,9 @@ def _transform_new(self, X): # so that no dropping happens for this feature to_drop[feature_idx] = -1 else: - # self.drop is an array of categories - # we need to remap the dropped indexes if some of the - # categories are infrequent. see _transform() for details - # about the mapping. + # self.drop is an array of categories. we need to remap the + # dropped indexes if some of the categories are infrequent. + # see _transform() for details about the mapping. 
for feature_idx in range(n_features): if self.infrequent_indices_[feature_idx].size > 0: mapping = self._infrequent_mappings[feature_idx] From 8a3b8273878bb4a7adf73392aa505b6dc85a33a6 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Wed, 8 May 2019 14:22:19 -0400 Subject: [PATCH 7/9] pep8 --- doc/modules/preprocessing.rst | 9 +++++---- sklearn/preprocessing/_encoders.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/modules/preprocessing.rst b/doc/modules/preprocessing.rst index 4c68f9e635498..346ece76d42b7 100644 --- a/doc/modules/preprocessing.rst +++ b/doc/modules/preprocessing.rst @@ -482,7 +482,8 @@ new feature of integers (0 to n_categories - 1):: >>> enc = preprocessing.OrdinalEncoder() >>> X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']] >>> enc.fit(X) # doctest: +ELLIPSIS - OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>) + OrdinalEncoder(categories='auto', dtype=<... 'numpy.float64'>, + max_levels=None) >>> enc.transform([['female', 'from US', 'uses Safari']]) array([[0., 1., 1.]]) @@ -506,7 +507,7 @@ Continuing the example above:: >>> enc.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE OneHotEncoder(categorical_features=None, categories=None, drop=None, dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values=None, sparse=True) + max_levels=None, n_values=None, sparse=True) >>> enc.transform([['female', 'from US', 'uses Safari'], ... ['male', 'from Europe', 'uses Safari']]).toarray() array([[1., 0., 0., 1., 0., 1.], @@ -533,7 +534,7 @@ dataset:: OneHotEncoder(categorical_features=None, categories=[...], drop=None, dtype=<... 'numpy.float64'>, handle_unknown='error', - n_values=None, sparse=True) + max_levels=None, n_values=None, sparse=True) >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray() array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 0.]]) @@ -550,7 +551,7 @@ columns for this feature will be all zeros >>> enc.fit(X) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE OneHotEncoder(categorical_features=None, categories=None, drop=None, dtype=<... 'numpy.float64'>, handle_unknown='ignore', - n_values=None, sparse=True) + max_levels=None, n_values=None, sparse=True) >>> enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray() array([[1., 0., 0., 0., 0., 0.]]) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index 1bb7e7d8a8ddc..49771512eeadc 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -1012,7 +1012,7 @@ class OrdinalEncoder(_BaseEncoder): def __init__(self, categories='auto', dtype=np.float64, max_levels=None): self.categories = categories self.dtype = dtype - self.max_levels=max_levels + self.max_levels = max_levels def fit(self, X, y=None): """Fit the OrdinalEncoder to X. 
From e110419f89cab558c93b75879c1ef34eab6f2826 Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Sun, 4 Aug 2019 10:48:23 -0400 Subject: [PATCH 8/9] pep8 --- sklearn/preprocessing/_encoders.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index d6e46740661b4..fdd8ec1012c77 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -10,7 +10,6 @@ from ..utils.fixes import _argmax from ..utils.validation import check_is_fitted -from .base import _transform_selected from .label import _encode, _encode_check_unknown, _encode_numpy From 69b738f24ba949a878c912d8cd8a7b9b23bec07e Mon Sep 17 00:00:00 2001 From: Nicolas Hug Date: Tue, 6 Aug 2019 09:08:13 -0400 Subject: [PATCH 9/9] Added docstring for max_levels --- sklearn/preprocessing/_encoders.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py index fdd8ec1012c77..1036e7b67d4a9 100644 --- a/sklearn/preprocessing/_encoders.py +++ b/sklearn/preprocessing/_encoders.py @@ -2,6 +2,8 @@ # Joris Van den Bossche # License: BSD 3 clause +from numbers import Integral + import numpy as np from scipy import sparse @@ -78,6 +80,13 @@ def _fit(self, X, handle_unknown='error'): raise ValueError("Shape mismatch: if categories is an array," " it has to be of shape (n_features,).") + if self.max_levels is not None: + if (not isinstance(self.max_levels, Integral) or + self.max_levels <= 0): + raise ValueError("max_levels must be None or a strictly " + "positive int, got {}.".format( + self.max_levels)) + self.categories_ = [] self.infrequent_indices_ = [] @@ -229,7 +238,11 @@ class OneHotEncoder(_BaseEncoder): - 'first' : drop the first category in each feature. If only one category is present, the feature will be dropped entirely. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that - should be dropped. + should be dropped. If ``drop[i]`` is an infrequent category, an + error is raised: it is only possible to drop all of the infrequent + categories, not just one of them. + - 'infrequent' : drop the infrequent categories column (see + ``max_levels`` parameter). sparse : boolean, default=True Will return sparse matrix if set True else will return an array. @@ -245,6 +258,10 @@ class OneHotEncoder(_BaseEncoder): will be all zeros. In the inverse transform, an unknown category will be denoted as None. + max_levels : int, default=None + Maximum number of categories to keep. Infrequent categories are + grouped together and mapped into a single column. + Attributes ---------- categories_ : list of arrays @@ -642,6 +659,10 @@ class OrdinalEncoder(_BaseEncoder): dtype : number type, default np.float64 Desired dtype of output. + max_levels : int, default=None + Maximum number of categories to keep. Infrequent categories are + grouped together and mapped to the highest int. + Attributes ---------- categories_ : list of arrays
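
The counting logic behind `_find_infrequent_category_indices` (patch 4) can be sketched in isolation. The snippet below is an illustration, not part of the series; it runs on the first feature of the test data above and needs only numpy. Variable names are illustrative.

    import numpy as np

    Xi = np.array([0, 1, 3, 3, 3, 3, 1, 0, 3])  # first column of the test X
    max_levels = 2

    categories, counts = np.unique(Xi, return_counts=True)
    # argsort orders the counts ascending, so dropping the last `max_levels`
    # entries leaves the indices of the infrequent categories
    infrequent_indices = np.argsort(counts)[:-max_levels]

    print(categories)          # [0 1 3]
    print(counts)              # [2 2 5]
    print(infrequent_indices)  # [0]: categories 0 and 1 tie, argsort breaks it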
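The remapping trick in `_transform` (patch 4) also works standalone. This sketch substitutes `np.unique(..., return_inverse=True)` for the private `_encode_numpy` helper, which computes the same thing on integer input: with 4 categories of which indices 1 and 3 are infrequent, the frequent categories get the compact codes 0 and 1 while both infrequent ones collapse onto the trailing code 2, as in the docstring example.

    import numpy as np

    n_categories = 4
    infrequent = np.array([1, 3])        # infrequent category indices

    mapping = np.arange(n_categories)    # [0 1 2 3]
    # push the infrequent positions past every frequent one, then re-encode
    mapping[infrequent] = np.iinfo(mapping.dtype).max
    _, mapping = np.unique(mapping, return_inverse=True)
    print(mapping)                       # [0 2 1 2]

    # applied to ordinally encoded data, all infrequent codes collapse to 2
    X_int = np.array([0, 1, 2, 3, 1])
    print(mapping[X_int])                # [0 2 1 2 2]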
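The column bookkeeping that patch 5 spreads through `_transform_new` reduces to one rule per feature: one column per frequent category, plus one shared column when the feature has infrequent categories, minus one when a column is dropped, except for drop='infrequent' on a feature with nothing to drop. A hypothetical helper (not in the patch) checked against the output widths asserted in the tests above:

    def n_output_columns(n_categories, n_infrequent, drop):
        n = n_categories
        if n_infrequent > 0:
            n += 1 - n_infrequent  # all infrequent categories share a column
        if drop is None:
            return n
        if drop == 'infrequent' and n_infrequent == 0:
            return n               # nothing to drop for this feature
        return n - 1               # 'first', 'infrequent' or explicit category

    # test X: features have (3, 4, 2) categories and (1, 2, 0) infrequent ones
    sizes = [(3, 1), (4, 2), (2, 0)]
    assert sum(n_output_columns(c, i, None) for c, i in sizes) == 8
    assert sum(n_output_columns(c, i, 'first') for c, i in sizes) == 5
    assert sum(n_output_columns(c, i, 'infrequent') for c, i in sizes) == 6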
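End to end, the new parameter is meant to be used as in the tests. This sketch only runs against this branch, since `max_levels` does not exist in any released scikit-learn.

    import numpy as np
    from sklearn.preprocessing import OrdinalEncoder

    X = np.array([[0], [1], [3], [3], [3], [3], [1], [0], [3]])

    oe = OrdinalEncoder(max_levels=2)
    X_trans = oe.fit_transform(X)

    # categories 1 and 3 keep the codes 0 and 1; the infrequent category 0
    # is mapped to the highest integer, here 2
    print(oe.infrequent_indices_)  # [array([0])]
    print(X_trans.ravel())         # [2. 0. 1. 1. 1. 1. 0. 2. 1.]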