diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 094100cd7024c..2a6c66446c3a2 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -20,6 +20,7 @@
 from ..utils.extmath import row_norms
 from ..utils.extmath import _incremental_mean_and_var
 from ..utils.fixes import combinations_with_replacement as combinations_w_r
+from ..utils.fixes import bincount
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale,
@@ -1140,7 +1141,7 @@ class PolynomialFeatures(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    powers_ : array, shape (n_input_features, n_output_features)
+    powers_ : array, shape (n_output_features, n_input_features)
         powers_[i, j] is the exponent of the jth input in the ith output.
 
     n_input_features_ : int
@@ -1179,9 +1180,39 @@ def powers_(self):
         combinations = self._combinations(self.n_input_features_, self.degree,
                                           self.interaction_only,
                                           self.include_bias)
-        return np.vstack(np.bincount(c, minlength=self.n_input_features_)
+        return np.vstack(bincount(c, minlength=self.n_input_features_)
                          for c in combinations)
 
+    def get_feature_names(self, input_features=None):
+        """
+        Return feature names for output features
+
+        Parameters
+        ----------
+        input_features : list of string, length n_features, optional
+            String names for input features if available. By default,
+            "x0", "x1", ... "xn_features" is used.
+
+        Returns
+        -------
+        output_feature_names : list of string, length n_output_features
+
+        """
+        powers = self.powers_
+        if input_features is None:
+            input_features = ['x%d' % i for i in range(powers.shape[1])]
+        feature_names = []
+        for row in powers:
+            inds = np.where(row)[0]
+            if len(inds):
+                name = " ".join("%s^%d" % (input_features[ind], exp)
+                                if exp != 1 else input_features[ind]
+                                for ind, exp in zip(inds, row[inds]))
+            else:
+                name = "1"
+            feature_names.append(name)
+        return feature_names
+
     def fit(self, X, y=None):
         """
         Compute number of output features.
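Reviewer note: below is a minimal usage sketch of the `get_feature_names` method added above; it is not part of the patch. It assumes a scikit-learn build that includes this change, and the example data and the `height`/`weight` names are purely illustrative.

```python
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Two input columns; fit() only records the number of input features,
# so the actual values do not influence the generated names.
X = np.arange(6).reshape(3, 2)
poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)

# Default names follow the "x0", "x1", ... convention.
print(poly.get_feature_names())
# ['1', 'x0', 'x1', 'x0^2', 'x0 x1', 'x1^2']

# Custom input names are substituted into the same combinations.
print(poly.get_feature_names(["height", "weight"]))
# ['1', 'height', 'weight', 'height^2', 'height weight', 'weight^2']

# Per the corrected docstring, powers_ has one row per output feature,
# so each row lines up with the corresponding output name.
assert poly.powers_.shape == (poly.n_output_features_, poly.n_input_features_)
```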
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
index baa62cbbaffe2..c741526de03ad 100644
--- a/sklearn/preprocessing/tests/test_data.py
+++ b/sklearn/preprocessing/tests/test_data.py
@@ -10,6 +10,7 @@
 import numpy.linalg as la
 from scipy import sparse
 from distutils.version import LooseVersion
+from sklearn.externals.six import u
 
 from sklearn.utils import gen_batches
 
@@ -118,6 +119,30 @@ def test_polynomial_features():
     X_poly = interact.fit_transform(X)
     assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]])
+    assert_equal(interact.powers_.shape, (interact.n_output_features_,
+                                          interact.n_input_features_))
+
+
+def test_polynomial_feature_names():
+    X = np.arange(30).reshape(10, 3)
+    poly = PolynomialFeatures(degree=2, include_bias=True).fit(X)
+    feature_names = poly.get_feature_names()
+    assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1',
+                        'x0 x2', 'x1^2', 'x1 x2', 'x2^2'],
+                       feature_names)
+
+    poly = PolynomialFeatures(degree=3, include_bias=False).fit(X)
+    feature_names = poly.get_feature_names(["a", "b", "c"])
+    assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2',
+                        'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c',
+                        'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c',
+                        'b c^2', 'c^3'], feature_names)
+    # test some unicode
+    poly = PolynomialFeatures(degree=1, include_bias=True).fit(X)
+    feature_names = poly.get_feature_names([u"\u0001F40D", u"\u262E",
+                                            u"\u05D0"])
+    assert_array_equal([u"1", u"\u0001F40D", u"\u262E", u"\u05D0"],
+                       feature_names)
 
 
 def test_standard_scaler_1d():
     # Test scaling of dataset along single axis
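Follow-up usage note (not part of the patch): the main motivation for `get_feature_names` is labelling downstream model coefficients. A sketch, assuming the illustrative dataset and `LinearRegression` pipeline below:

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Illustrative data: y depends on x0 and on the interaction x0*x1.
rng = np.random.RandomState(0)
X = rng.uniform(size=(100, 2))
y = 1.0 + 2.0 * X[:, 0] - 3.0 * X[:, 0] * X[:, 1]

# include_bias=False so the constant column does not duplicate the intercept.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
model = LinearRegression().fit(X_poly, y)

# Pair each learned coefficient with its expanded feature name.
for name, coef in zip(poly.get_feature_names(), model.coef_):
    print("%8s: % .3f" % (name, coef))
```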