From 3fa684db0e44c5d3213566875f4011dd8083cfba Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 19 Feb 2016 15:52:55 -0500 Subject: [PATCH 1/4] add get_feature_names to PolynomialFeatures --- sklearn/preprocessing/data.py | 29 ++++++++++++++++++++++++ sklearn/preprocessing/tests/test_data.py | 22 ++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 094100cd7024c..5494ab1dd7a13 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1182,6 +1182,35 @@ def powers_(self): return np.vstack(np.bincount(c, minlength=self.n_input_features_) for c in combinations) + def get_feature_names(self, input_features=None): + """ + Return feature names for output features + + Parameters + ---------- + input_features : list of string, length n_features, optional + String names for input features if available. By default, + "x0", "x1", ... "xn_features" is used. + + Returns + ------- + output_feature_names : list of string, length n_output_features + + """ + powers = self.powers_ + if input_features is None: + input_features = ['x%d' % i for i in range(len(powers))] + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join("%s^%d" % (input_features[ind], exp) + for ind, exp in zip(inds, row[inds])) + else: + name = "1" + feature_names.append(name) + return feature_names + def fit(self, X, y=None): """ Compute number of output features. diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index baa62cbbaffe2..e3631564810d4 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -10,6 +10,7 @@ import numpy.linalg as la from scipy import sparse from distutils.version import LooseVersion +from sklearn.externals.six import u from sklearn.utils import gen_batches @@ -119,6 +120,27 @@ def test_polynomial_features(): assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) +def test_polynomial_feature_names(): + X = np.arange(30).reshape(10, 3) + poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) + feature_names = poly.get_feature_names() + assert_array_equal(['1', 'x0^1', 'x1^1', 'x2^1', 'x0^2', 'x0^1 x1^1', + 'x0^1 x2^1', 'x1^2', 'x1^1 x2^1', 'x2^2'], + feature_names) + + poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) + feature_names = poly.get_feature_names(["a", "b", "c"]) + assert_array_equal(['a^1', 'b^1', 'c^1', 'a^2', 'a^1 b^1', 'a^1 c^1', 'b^2', + 'b^1 c^1', 'c^2', 'a^3', 'a^2 b^1', 'a^2 c^1', + 'a^1 b^2', 'a^1 b^1 c^1', 'a^1 c^2', 'b^3', 'b^2 c^1', + 'b^1 c^2', 'c^3'], feature_names) + # test some unicode + poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) + feature_names = poly.get_feature_names([u"\u0001F40D", u"\u262E", u"\u05D0"]) + assert_array_equal([u"1", u"\u0001F40D^1", u"\u262E^1", u"\u05D0^1"], + feature_names) + + def test_standard_scaler_1d(): # Test scaling of dataset along single axis for X in [X_1row, X_1col, X_list_1row, X_list_1row]: From ddc1740207acbcd989352dd3f9a5c94e5ece9634 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Fri, 19 Feb 2016 15:53:37 -0500 Subject: [PATCH 2/4] fix PolynomialFeatures.powers_ in python 0.16.1 --- sklearn/preprocessing/data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 5494ab1dd7a13..0c950bcf66610 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -20,6 +20,7 @@ from ..utils.extmath import row_norms from ..utils.extmath import _incremental_mean_and_var from ..utils.fixes import combinations_with_replacement as combinations_w_r +from ..utils.fixes import bincount from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1, inplace_csr_row_normalize_l2) from ..utils.sparsefuncs import (inplace_column_scale, @@ -1179,7 +1180,7 @@ def powers_(self): combinations = self._combinations(self.n_input_features_, self.degree, self.interaction_only, self.include_bias) - return np.vstack(np.bincount(c, minlength=self.n_input_features_) + return np.vstack(bincount(c, minlength=self.n_input_features_) for c in combinations) def get_feature_names(self, input_features=None): From 897f8b6cc6b347eff252185769fb7e9562fa9ace Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Mon, 22 Feb 2016 15:09:47 -0500 Subject: [PATCH 3/4] don't do ^1 --- sklearn/preprocessing/data.py | 1 + sklearn/preprocessing/tests/test_data.py | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 0c950bcf66610..0d0c4167f55cb 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1206,6 +1206,7 @@ def get_feature_names(self, input_features=None): inds = np.where(row)[0] if len(inds): name = " ".join("%s^%d" % (input_features[ind], exp) + if exp != 1 else input_features[ind] for ind, exp in zip(inds, row[inds])) else: name = "1" diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index e3631564810d4..195e80c30d428 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -124,20 +124,20 @@ def test_polynomial_feature_names(): X = np.arange(30).reshape(10, 3) poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) feature_names = poly.get_feature_names() - assert_array_equal(['1', 'x0^1', 'x1^1', 'x2^1', 'x0^2', 'x0^1 x1^1', - 'x0^1 x2^1', 'x1^2', 'x1^1 x2^1', 'x2^2'], + assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', + 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], feature_names) poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) feature_names = poly.get_feature_names(["a", "b", "c"]) - assert_array_equal(['a^1', 'b^1', 'c^1', 'a^2', 'a^1 b^1', 'a^1 c^1', 'b^2', - 'b^1 c^1', 'c^2', 'a^3', 'a^2 b^1', 'a^2 c^1', - 'a^1 b^2', 'a^1 b^1 c^1', 'a^1 c^2', 'b^3', 'b^2 c^1', - 'b^1 c^2', 'c^3'], feature_names) + assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', + 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', + 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', + 'b c^2', 'c^3'], feature_names) # test some unicode poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) feature_names = poly.get_feature_names([u"\u0001F40D", u"\u262E", u"\u05D0"]) - assert_array_equal([u"1", u"\u0001F40D^1", u"\u262E^1", u"\u05D0^1"], + assert_array_equal([u"1", u"\u0001F40D", u"\u262E", u"\u05D0"], feature_names) From 8fb928d6442fa92161db9b2a97a616e2625384b6 Mon Sep 17 00:00:00 2001 From: Andreas Mueller Date: Wed, 24 Feb 2016 17:08:15 -0500 Subject: [PATCH 4/4] fixed doc for powers, added test --- sklearn/preprocessing/data.py | 4 ++-- sklearn/preprocessing/tests/test_data.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 0d0c4167f55cb..2a6c66446c3a2 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -1141,7 +1141,7 @@ class PolynomialFeatures(BaseEstimator, TransformerMixin): Attributes ---------- - powers_ : array, shape (n_input_features, n_output_features) + powers_ : array, shape (n_output_features, n_input_features) powers_[i, j] is the exponent of the jth input in the ith output. n_input_features_ : int @@ -1200,7 +1200,7 @@ def get_feature_names(self, input_features=None): """ powers = self.powers_ if input_features is None: - input_features = ['x%d' % i for i in range(len(powers))] + input_features = ['x%d' % i for i in range(powers.shape[1])] feature_names = [] for row in powers: inds = np.where(row)[0] diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 195e80c30d428..c741526de03ad 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -119,6 +119,9 @@ def test_polynomial_features(): X_poly = interact.fit_transform(X) assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) + assert_equal(interact.powers_.shape, (interact.n_output_features_, + interact.n_input_features_)) + def test_polynomial_feature_names(): X = np.arange(30).reshape(10, 3)