[MRG] Fixes get_feature_names results when using drop functionality (scikit-learn#13894)

jamesmyatt · jnothman · commit 45ab4c677203 · 2019-05-23T23:12:01.000+10:00
diff --git a/doc/whats_new/v0.21.rst b/doc/whats_new/v0.21.rst
@@ -21,10 +21,17 @@ Changelog
 :mod:`sklearn.metrics`
 ......................
 
-- |Fix| Fixed a bug in :func:`euclidean_distances` where a part of the distance
-  matrix was left un-instanciated for suffiently large float32 datasets
-  (regression introduced in 0.21) :issue:`13910`
-  by :user:`Jérémie du Boisberranger <jeremiedbb>`.
+- |Fix| Fixed a bug in :func:`metrics.euclidean_distances` where a part of the
+  distance matrix was left uninstantiated for sufficiently large float32
+  datasets (regression introduced in 0.21). :pr:`13910` by :user:`Jérémie du
+  Boisberranger <jeremiedbb>`.
+
+:mod:`sklearn.preprocessing`
+............................
+
+- |Fix| Fixed a bug in :class:`preprocessing.OneHotEncoder` where the new
+  `drop` parameter was not reflected in `get_feature_names`. :pr:`13894`
+  by :user:`James Myatt <jamesmyatt>`.
 
 .. _changes_0_21_1:
 
diff --git a/sklearn/preprocessing/_encoders.py b/sklearn/preprocessing/_encoders.py
@@ -848,6 +848,8 @@ def get_feature_names(self, input_features=None):
         for i in range(len(cats)):
             names = [
                 input_features[i] + '_' + str(t) for t in cats[i]]
+            if self.drop is not None:
+                names.pop(self.drop_idx_[i])
             feature_names.extend(names)
 
         return np.array(feature_names, dtype=object)
diff --git a/sklearn/preprocessing/tests/test_encoders.py b/sklearn/preprocessing/tests/test_encoders.py
@@ -590,6 +590,21 @@ def test_one_hot_encoder_feature_names_unicode():
     assert_array_equal(['n👍me_c❤t1', 'n👍me_dat2'], feature_names)
 
 
+@pytest.mark.parametrize("drop, expected_names",
+                         [('first', ['x0_c', 'x2_b']),  # drops 'b', 2, 'a'
+                          (['c', 2, 'b'], ['x0_b', 'x2_a'])],
+                         ids=['first', 'manual'])
+def test_one_hot_encoder_feature_names_drop(drop, expected_names):
+    X = [['c', 2, 'a'],  # x1 has a single category, so dropping it
+         ['b', 2, 'b']]  # leaves no x1_* name in either case
+
+    ohe = OneHotEncoder(drop=drop)
+    ohe.fit(X)
+    feature_names = ohe.get_feature_names()  # default x0, x1, ... prefixes
+    assert isinstance(feature_names, np.ndarray)
+    assert_array_equal(expected_names, feature_names)
+
+
 @pytest.mark.parametrize("X", [np.array([[1, np.nan]]).T,
                                np.array([['a', np.nan]], dtype=object).T],
                          ids=['numeric', 'object'])