commit 11

Nirvan101 · Nirvan101 · commit 68e3f92964b7 · 2017-12-31T16:19:31.000+05:30
rebased commit 8,9

commit 10
diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
@@ -16,6 +16,7 @@
 import numpy as np
 from scipy import sparse
 from scipy import stats
+import sys
 
 from ..base import BaseEstimator, TransformerMixin
 from ..externals import six
@@ -2655,6 +2656,8 @@ class CategoricalEncoder(BaseEstimator, TransformerMixin):
     >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
     array([['Male', 1],
            [None, 2]], dtype=object)
+    >>> enc.get_feature_names()
+    ['x0_Female', 'x0_Male', 'x1_1', 'x1_2', 'x1_3']
 
     See also
     --------
@@ -2873,3 +2876,37 @@ def inverse_transform(self, X):
                     X_tr[mask, idx] = None
 
         return X_tr
+
+    def get_feature_names(self, input_features=None):
+        """Return feature names for output features.
+
+        Parameters
+        ----------
+        input_features : list of string, length n_features, optional
+            String names for input features if available. By default,
+            "x0", "x1", ... "x(n_features - 1)" is used.
+
+        Returns
+        -------
+        output_feature_names : list of string, length n_output_features
+            One "<input_feature>_<category>" name per category.
+
+        Raises
+        ------
+        ValueError
+            If ``input_features`` and ``categories_`` differ in length.
+        """
+        cats = self.categories_
+        if input_features is None:
+            input_features = ['x%d' % i for i in range(len(cats))]
+        elif len(input_features) != len(cats):
+            raise ValueError('input_features should have length equal to '
+                             'number of features')
+
+        # six.text_type is unicode on Python 2 and str on Python 3, so names
+        # are text on both without conditionally rebinding ``unicode``.
+        feature_names = []
+        for name, feature_cats in zip(input_features, cats):
+            feature_names.extend(
+                name + '_' + six.text_type(t) for t in feature_cats)
+        return feature_names
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
@@ -2224,3 +2224,30 @@ def test_quantile_transform_valid_axis():
 
     assert_raises_regex(ValueError, "axis should be either equal to 0 or 1"
                         ". Got axis=2", quantile_transform, X.T, axis=2)
+
+
+def test_categorical_encoder_feature_names():
+    # Check names built from the default and from user-supplied prefixes.
+    X = [['Male', 1, 'girl', 2, 3],
+         ['Female', 41, 'girl', 1, 10],
+         ['Male', 51, 'boy', 12, 3],
+         ['Male', 91, 'girl', 21, 30]]
+    enc = CategoricalEncoder()
+    enc.fit(X)
+
+    # Expected category suffixes, one inner list per input column.
+    suffixes = [['Female', 'Male'], ['1', '41', '51', '91'],
+                ['boy', 'girl'], ['1', '2', '12', '21'],
+                ['3', '10', '30']]
+
+    def expected(prefixes):
+        return ['%s_%s' % (prefix, suffix)
+                for prefix, column in zip(prefixes, suffixes)
+                for suffix in column]
+
+    default = ['x0', 'x1', 'x2', 'x3', 'x4']
+    assert_array_equal(expected(default), enc.get_feature_names())
+
+    custom = ['one', 'two', 'three', 'four', 'five']
+    assert_array_equal(expected(custom),
+                       enc.get_feature_names(custom))