diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 076b9e85e1150..6653088ba85a7 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -23,7 +23,6 @@ from ._data import quantile_transform from ._data import power_transform from ._data import PowerTransformer -from ._data import PolynomialFeatures from ._encoders import OneHotEncoder from ._encoders import OrdinalEncoder @@ -35,6 +34,7 @@ from ._discretization import KBinsDiscretizer +from ._polynomial import PolynomialFeatures from ._polynomial import SplineTransformer diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 29190dd6e2b67..5e85b932a1e39 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -8,9 +8,7 @@ # License: BSD 3 clause -from itertools import chain, combinations import warnings -from itertools import combinations_with_replacement as combinations_w_r import numpy as np from scipy import sparse @@ -31,7 +29,6 @@ from ..utils.validation import (check_is_fitted, check_random_state, _check_sample_weight, FLOAT_DTYPES, _deprecate_positional_args) -from ._csr_polynomial_expansion import _csr_polynomial_expansion from ._encoders import OneHotEncoder @@ -1570,293 +1567,6 @@ def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, return X -class PolynomialFeatures(TransformerMixin, BaseEstimator): - """Generate polynomial and interaction features. - - Generate a new feature matrix consisting of all polynomial combinations - of the features with degree less than or equal to the specified degree. - For example, if an input sample is two dimensional and of the form - [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. - - Parameters - ---------- - degree : int, default=2 - The degree of the polynomial features. - - interaction_only : bool, default=False - If true, only interaction features are produced: features that are - products of at most ``degree`` *distinct* input features (so not - ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). - - include_bias : bool, default=True - If True (default), then include a bias column, the feature in which - all polynomial powers are zero (i.e. a column of ones - acts as an - intercept term in a linear model). - - order : {'C', 'F'}, default='C' - Order of output array in the dense case. 'F' order is faster to - compute, but may slow down subsequent estimators. - - .. versionadded:: 0.21 - - Examples - -------- - >>> import numpy as np - >>> from sklearn.preprocessing import PolynomialFeatures - >>> X = np.arange(6).reshape(3, 2) - >>> X - array([[0, 1], - [2, 3], - [4, 5]]) - >>> poly = PolynomialFeatures(2) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0., 0., 1.], - [ 1., 2., 3., 4., 6., 9.], - [ 1., 4., 5., 16., 20., 25.]]) - >>> poly = PolynomialFeatures(interaction_only=True) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0.], - [ 1., 2., 3., 6.], - [ 1., 4., 5., 20.]]) - - Attributes - ---------- - powers_ : ndarray of shape (n_output_features, n_input_features) - powers_[i, j] is the exponent of the jth input in the ith output. - - n_input_features_ : int - The total number of input features. - - n_output_features_ : int - The total number of polynomial output features. The number of output - features is computed by iterating over all suitably sized combinations - of input features. 
- - See Also - -------- - SplineTransformer : Transformer that generates univariate B-spline bases - for features - - Notes - ----- - Be aware that the number of features in the output array scales - polynomially in the number of features of the input array, and - exponentially in the degree. High degrees can cause overfitting. - - See :ref:`examples/linear_model/plot_polynomial_interpolation.py - ` - """ - @_deprecate_positional_args - def __init__(self, degree=2, *, interaction_only=False, include_bias=True, - order='C'): - self.degree = degree - self.interaction_only = interaction_only - self.include_bias = include_bias - self.order = order - - @staticmethod - def _combinations(n_features, degree, interaction_only, include_bias): - comb = (combinations if interaction_only else combinations_w_r) - start = int(not include_bias) - return chain.from_iterable(comb(range(n_features), i) - for i in range(start, degree + 1)) - - @property - def powers_(self): - check_is_fitted(self) - - combinations = self._combinations(self.n_input_features_, self.degree, - self.interaction_only, - self.include_bias) - return np.vstack([np.bincount(c, minlength=self.n_input_features_) - for c in combinations]) - - def get_feature_names(self, input_features=None): - """ - Return feature names for output features - - Parameters - ---------- - input_features : list of str of shape (n_features,), default=None - String names for input features if available. By default, - "x0", "x1", ... "xn_features" is used. - - Returns - ------- - output_feature_names : list of str of shape (n_output_features,) - """ - powers = self.powers_ - if input_features is None: - input_features = ['x%d' % i for i in range(powers.shape[1])] - feature_names = [] - for row in powers: - inds = np.where(row)[0] - if len(inds): - name = " ".join("%s^%d" % (input_features[ind], exp) - if exp != 1 else input_features[ind] - for ind, exp in zip(inds, row[inds])) - else: - name = "1" - feature_names.append(name) - return feature_names - - def fit(self, X, y=None): - """ - Compute number of output features. - - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data. - - y : None - Ignored. - - Returns - ------- - self : object - Fitted transformer. - """ - n_samples, n_features = self._validate_data( - X, accept_sparse=True).shape - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - self.n_input_features_ = n_features - self.n_output_features_ = sum(1 for _ in combinations) - return self - - def transform(self, X): - """Transform data to polynomial features. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data to transform, row by row. - - Prefer CSR over CSC for sparse input (for speed), but CSC is - required if the degree is 4 or higher. If the degree is less than - 4 and the input format is CSC, it will be converted to CSR, have - its polynomial features generated, then converted back to CSC. - - If the degree is 2 or 3, the method described in "Leveraging - Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices - Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is - used, which is much faster than the method used on CSC input. For - this reason, a CSC input will be converted to CSR, and the output - will be converted back to CSC prior to being returned, hence the - preference of CSR. 
- - Returns - ------- - XP : {ndarray, sparse matrix} of shape (n_samples, NP) - The matrix of features, where NP is the number of polynomial - features generated from the combination of inputs. If a sparse - matrix is provided, it will be converted into a sparse - ``csr_matrix``. - """ - check_is_fitted(self) - - X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, - accept_sparse=('csr', 'csc')) - - n_samples, n_features = X.shape - - if n_features != self.n_input_features_: - raise ValueError("X shape does not match training shape") - - if sparse.isspmatrix_csr(X): - if self.degree > 3: - return self.transform(X.tocsc()).tocsr() - to_stack = [] - if self.include_bias: - to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) - to_stack.append(X) - for deg in range(2, self.degree+1): - Xp_next = _csr_polynomial_expansion(X.data, X.indices, - X.indptr, X.shape[1], - self.interaction_only, - deg) - if Xp_next is None: - break - to_stack.append(Xp_next) - XP = sparse.hstack(to_stack, format='csr') - elif sparse.isspmatrix_csc(X) and self.degree < 4: - return self.transform(X.tocsr()).tocsc() - else: - if sparse.isspmatrix(X): - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - columns = [] - for comb in combinations: - if comb: - out_col = 1 - for col_idx in comb: - out_col = X[:, col_idx].multiply(out_col) - columns.append(out_col) - else: - bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) - columns.append(bias) - XP = sparse.hstack(columns, dtype=X.dtype).tocsc() - else: - XP = np.empty((n_samples, self.n_output_features_), - dtype=X.dtype, order=self.order) - - # What follows is a faster implementation of: - # for i, comb in enumerate(combinations): - # XP[:, i] = X[:, comb].prod(1) - # This implementation uses two optimisations. - # First one is broadcasting, - # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] - # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] - # ... - # multiply ([X[:, start:end], X[:, start]) -> ... - # Second optimisation happens for degrees >= 3. - # Xi^3 is computed reusing previous computation: - # Xi^3 = Xi^2 * Xi. - - if self.include_bias: - XP[:, 0] = 1 - current_col = 1 - else: - current_col = 0 - - # d = 0 - XP[:, current_col:current_col + n_features] = X - index = list(range(current_col, - current_col + n_features)) - current_col += n_features - index.append(current_col) - - # d >= 1 - for _ in range(1, self.degree): - new_index = [] - end = index[-1] - for feature_idx in range(n_features): - start = index[feature_idx] - new_index.append(current_col) - if self.interaction_only: - start += (index[feature_idx + 1] - - index[feature_idx]) - next_col = current_col + end - start - if next_col <= current_col: - break - # XP[:, start:end] are terms of degree d - 1 - # that exclude feature #feature_idx. - np.multiply(XP[:, start:end], - X[:, feature_idx:feature_idx + 1], - out=XP[:, current_col:next_col], - casting='no') - current_col = next_col - - new_index.append(current_col) - index = new_index - - return XP - - @_deprecate_positional_args def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index ad358e50c4681..3f4ccc2fa05d4 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -2,14 +2,19 @@ This file contains preprocessing tools based on polynomials. 
""" import numbers +from itertools import chain, combinations +from itertools import combinations_with_replacement as combinations_w_r import numpy as np +from scipy import sparse from scipy.interpolate import BSpline from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.fixes import linspace -from ..utils.validation import check_is_fitted, FLOAT_DTYPES +from ..utils.validation import (check_is_fitted, FLOAT_DTYPES, + _deprecate_positional_args) +from ._csr_polynomial_expansion import _csr_polynomial_expansion __all__ = [ @@ -17,6 +22,293 @@ ] +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features. + + Generate a new feature matrix consisting of all polynomial combinations + of the features with degree less than or equal to the specified degree. + For example, if an input sample is two dimensional and of the form + [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + + Parameters + ---------- + degree : int, default=2 + The degree of the polynomial features. + + interaction_only : bool, default=False + If true, only interaction features are produced: features that are + products of at most ``degree`` *distinct* input features (so not + ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). + + include_bias : bool, default=True + If True (default), then include a bias column, the feature in which + all polynomial powers are zero (i.e. a column of ones - acts as an + intercept term in a linear model). + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. 'F' order is faster to + compute, but may slow down subsequent estimators. + + .. versionadded:: 0.21 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) + + Attributes + ---------- + powers_ : ndarray of shape (n_output_features, n_input_features) + powers_[i, j] is the exponent of the jth input in the ith output. + + n_input_features_ : int + The total number of input features. + + n_output_features_ : int + The total number of polynomial output features. The number of output + features is computed by iterating over all suitably sized combinations + of input features. + + See Also + -------- + SplineTransformer : Transformer that generates univariate B-spline bases + for features + + Notes + ----- + Be aware that the number of features in the output array scales + polynomially in the number of features of the input array, and + exponentially in the degree. High degrees can cause overfitting. 
+
+    See :ref:`examples/linear_model/plot_polynomial_interpolation.py
+    <sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py>`
+    """
+    @_deprecate_positional_args
+    def __init__(self, degree=2, *, interaction_only=False, include_bias=True,
+                 order='C'):
+        self.degree = degree
+        self.interaction_only = interaction_only
+        self.include_bias = include_bias
+        self.order = order
+
+    @staticmethod
+    def _combinations(n_features, degree, interaction_only, include_bias):
+        comb = (combinations if interaction_only else combinations_w_r)
+        start = int(not include_bias)
+        return chain.from_iterable(comb(range(n_features), i)
+                                   for i in range(start, degree + 1))
+
+    @property
+    def powers_(self):
+        check_is_fitted(self)
+
+        combinations = self._combinations(self.n_input_features_, self.degree,
+                                          self.interaction_only,
+                                          self.include_bias)
+        return np.vstack([np.bincount(c, minlength=self.n_input_features_)
+                          for c in combinations])
+
+    def get_feature_names(self, input_features=None):
+        """
+        Return feature names for output features.
+
+        Parameters
+        ----------
+        input_features : list of str of shape (n_features,), default=None
+            String names for input features if available. By default,
+            "x0", "x1", ... "xn_features" is used.
+
+        Returns
+        -------
+        output_feature_names : list of str of shape (n_output_features,)
+        """
+        powers = self.powers_
+        if input_features is None:
+            input_features = ['x%d' % i for i in range(powers.shape[1])]
+        feature_names = []
+        for row in powers:
+            inds = np.where(row)[0]
+            if len(inds):
+                name = " ".join("%s^%d" % (input_features[ind], exp)
+                                if exp != 1 else input_features[ind]
+                                for ind, exp in zip(inds, row[inds]))
+            else:
+                name = "1"
+            feature_names.append(name)
+        return feature_names
+
+    def fit(self, X, y=None):
+        """
+        Compute number of output features.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data.
+
+        y : None
+            Ignored.
+
+        Returns
+        -------
+        self : object
+            Fitted transformer.
+        """
+        n_samples, n_features = self._validate_data(
+            X, accept_sparse=True).shape
+        combinations = self._combinations(n_features, self.degree,
+                                          self.interaction_only,
+                                          self.include_bias)
+        self.n_input_features_ = n_features
+        self.n_output_features_ = sum(1 for _ in combinations)
+        return self
+
+    def transform(self, X):
+        """Transform data to polynomial features.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix} of shape (n_samples, n_features)
+            The data to transform, row by row.
+
+            Prefer CSR over CSC for sparse input (for speed), but CSC is
+            required if the degree is 4 or higher. If the degree is less than
+            4 and the input format is CSC, it will be converted to CSR, have
+            its polynomial features generated, then converted back to CSC.
+
+            If the degree is 2 or 3, the method described in "Leveraging
+            Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices
+            Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is
+            used, which is much faster than the method used on CSC input. For
+            this reason, a CSC input will be converted to CSR, and the output
+            will be converted back to CSC prior to being returned, hence the
+            preference of CSR.
+
+        Returns
+        -------
+        XP : {ndarray, sparse matrix} of shape (n_samples, NP)
+            The matrix of features, where NP is the number of polynomial
+            features generated from the combination of inputs. If a sparse
+            matrix is provided, it will be converted into a sparse
+            ``csr_matrix``.
+ """ + check_is_fitted(self) + + X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, + accept_sparse=('csr', 'csc')) + + n_samples, n_features = X.shape + + if n_features != self.n_input_features_: + raise ValueError("X shape does not match training shape") + + if sparse.isspmatrix_csr(X): + if self.degree > 3: + return self.transform(X.tocsc()).tocsr() + to_stack = [] + if self.include_bias: + to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + to_stack.append(X) + for deg in range(2, self.degree+1): + Xp_next = _csr_polynomial_expansion(X.data, X.indices, + X.indptr, X.shape[1], + self.interaction_only, + deg) + if Xp_next is None: + break + to_stack.append(Xp_next) + XP = sparse.hstack(to_stack, format='csr') + elif sparse.isspmatrix_csc(X) and self.degree < 4: + return self.transform(X.tocsr()).tocsc() + else: + if sparse.isspmatrix(X): + combinations = self._combinations(n_features, self.degree, + self.interaction_only, + self.include_bias) + columns = [] + for comb in combinations: + if comb: + out_col = 1 + for col_idx in comb: + out_col = X[:, col_idx].multiply(out_col) + columns.append(out_col) + else: + bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + columns.append(bias) + XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + else: + XP = np.empty((n_samples, self.n_output_features_), + dtype=X.dtype, order=self.order) + + # What follows is a faster implementation of: + # for i, comb in enumerate(combinations): + # XP[:, i] = X[:, comb].prod(1) + # This implementation uses two optimisations. + # First one is broadcasting, + # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] + # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] + # ... + # multiply ([X[:, start:end], X[:, start]) -> ... + # Second optimisation happens for degrees >= 3. + # Xi^3 is computed reusing previous computation: + # Xi^3 = Xi^2 * Xi. + + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + # d = 0 + XP[:, current_col:current_col + n_features] = X + index = list(range(current_col, + current_col + n_features)) + current_col += n_features + index.append(current_col) + + # d >= 1 + for _ in range(1, self.degree): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + if self.interaction_only: + start += (index[feature_idx + 1] - + index[feature_idx]) + next_col = current_col + end - start + if next_col <= current_col: + break + # XP[:, start:end] are terms of degree d - 1 + # that exclude feature #feature_idx. + np.multiply(XP[:, start:end], + X[:, feature_idx:feature_idx + 1], + out=XP[:, current_col:next_col], + casting='no') + current_col = next_col + + new_index.append(current_col) + index = new_index + + return XP + + # TODO: # - sparse support (either scipy or own cython solution)? 
class SplineTransformer(TransformerMixin, BaseEstimator): diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index fdd88be0ccff4..196060388ddd2 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -10,7 +10,6 @@ import numpy as np import numpy.linalg as la from scipy import sparse, stats -from scipy.sparse import random as sparse_random import pytest @@ -43,7 +42,6 @@ from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import robust_scale from sklearn.preprocessing import add_dummy_feature -from sklearn.preprocessing import PolynomialFeatures from sklearn.preprocessing import PowerTransformer from sklearn.preprocessing import power_transform from sklearn.preprocessing._data import _handle_zeros_in_scale @@ -94,203 +92,6 @@ def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen) -def test_polynomial_features(): - # Test Polynomial Features - X1 = np.arange(6)[:, np.newaxis] - P1 = np.hstack([np.ones_like(X1), - X1, X1 ** 2, X1 ** 3]) - deg1 = 3 - - X2 = np.arange(6).reshape((3, 2)) - x1 = X2[:, :1] - x2 = X2[:, 1:] - P2 = np.hstack([x1 ** 0 * x2 ** 0, - x1 ** 1 * x2 ** 0, - x1 ** 0 * x2 ** 1, - x1 ** 2 * x2 ** 0, - x1 ** 1 * x2 ** 1, - x1 ** 0 * x2 ** 2]) - deg2 = 2 - - for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: - P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) - assert_array_almost_equal(P_test, P) - - P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) - assert_array_almost_equal(P_test, P[:, 1:]) - - interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) - X_poly = interact.fit_transform(X) - assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) - - assert interact.powers_.shape == (interact.n_output_features_, - interact.n_input_features_) - - -def test_polynomial_feature_names(): - X = np.arange(30).reshape(10, 3) - poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) - feature_names = poly.get_feature_names() - assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', - 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], - feature_names) - - poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) - feature_names = poly.get_feature_names(["a", "b", "c"]) - assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', - 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', - 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', - 'b c^2', 'c^3'], feature_names) - # test some unicode - poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) - feature_names = poly.get_feature_names( - ["\u0001F40D", "\u262E", "\u05D0"]) - assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], - feature_names) - - -def test_polynomial_feature_array_order(): - """Test that output array has the given order.""" - X = np.arange(10).reshape(5, 2) - - def is_c_contiguous(a): - return np.isfortran(a.T) - - assert is_c_contiguous(PolynomialFeatures().fit_transform(X)) - assert is_c_contiguous(PolynomialFeatures(order='C').fit_transform(X)) - assert np.isfortran(PolynomialFeatures(order='F').fit_transform(X)) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(1, True, False, int), - (2, True, False, int), - (2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64), - (4, False, False, np.float64), - (4, False, True, np.float64)]) -def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype): - 
rng = np.random.RandomState(0) - X = rng.randint(0, 2, (100, 2)) - X_csc = sparse.csc_matrix(X) - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csc = est.fit_transform(X_csc.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype)) - - assert isinstance(Xt_csc, sparse.csc_matrix) - assert Xt_csc.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csc.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(1, True, False, int), - (2, True, False, int), - (2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64)]) -def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): - rng = np.random.RandomState(0) - X = rng.randint(0, 2, (100, 2)) - X_csr = sparse.csr_matrix(X) - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64)]) -def test_polynomial_features_csr_X_floats(deg, include_bias, - interaction_only, dtype): - X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype)) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['zero_row_index', 'deg', 'interaction_only'], - [(0, 2, True), (1, 2, True), (2, 2, True), - (0, 3, True), (1, 3, True), (2, 3, True), - (0, 2, False), (1, 2, False), (2, 2, False), - (0, 3, False), (1, 3, False), (2, 3, False)]) -def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, - interaction_only): - X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr() - X_csr[zero_row_index, :] = 0.0 - X = X_csr.toarray() - - est = PolynomialFeatures(deg, include_bias=False, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -# This degree should always be one more than the highest degree supported by -# _csr_expansion. 
-@pytest.mark.parametrize(['include_bias', 'interaction_only'], - [(True, True), (True, False), - (False, True), (False, False)]) -def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only): - X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(4, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'dim', 'interaction_only'], - [(2, 1, True), - (2, 2, True), - (3, 1, True), - (3, 2, True), - (3, 3, True), - (2, 1, False), - (2, 2, False), - (3, 1, False), - (3, 2, False), - (3, 3, False)]) -def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only): - X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(deg, interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - def test_raises_value_error_if_sample_weights_greater_than_1d(): # Sample weights must be either scalar or 1D diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index b1908bf9fe12a..5068a8c7d8bdd 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -1,17 +1,22 @@ import numpy as np import pytest +from scipy import sparse +from scipy.sparse import random as sparse_random +from sklearn.utils._testing import assert_array_almost_equal + from numpy.testing import assert_allclose, assert_array_equal from scipy.interpolate import BSpline from sklearn.linear_model import LinearRegression from sklearn.pipeline import Pipeline -from sklearn.preprocessing import KBinsDiscretizer, SplineTransformer +from sklearn.preprocessing import ( + KBinsDiscretizer, PolynomialFeatures, SplineTransformer +) from sklearn.utils.fixes import linspace, sp_version from pkg_resources import parse_version -# TODO: add PolynomialFeatures if it moves to _polynomial.py -@pytest.mark.parametrize("est", (SplineTransformer,)) +@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer)) def test_polynomial_and_spline_array_order(est): """Test that output array has the given order.""" X = np.arange(10).reshape(5, 2) @@ -444,3 +449,188 @@ def test_spline_transformer_n_features_out(n_knots, include_bias, degree): splt.fit(X) assert splt.transform(X).shape[1] == splt.n_features_out_ + + +def test_polynomial_features(): + # Test Polynomial Features + X1 = np.arange(6)[:, np.newaxis] + P1 = np.hstack([np.ones_like(X1), + X1, X1 ** 2, X1 ** 3]) + deg1 = 3 + + X2 = np.arange(6).reshape((3, 2)) + x1 = X2[:, :1] + x2 = X2[:, 1:] + P2 = np.hstack([x1 ** 0 * x2 ** 0, + x1 ** 1 * x2 ** 0, + x1 ** 0 * x2 ** 1, + x1 ** 2 * x2 ** 0, + x1 ** 1 * x2 ** 1, + x1 ** 0 * x2 ** 2]) + deg2 = 2 + + for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: + P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) + assert_array_almost_equal(P_test, P) + + P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) + assert_array_almost_equal(P_test, P[:, 1:]) + + interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) + X_poly = 
interact.fit_transform(X) + assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) + + assert interact.powers_.shape == (interact.n_output_features_, + interact.n_input_features_) + + +def test_polynomial_feature_names(): + X = np.arange(30).reshape(10, 3) + poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) + feature_names = poly.get_feature_names() + assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', + 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], + feature_names) + + poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) + feature_names = poly.get_feature_names(["a", "b", "c"]) + assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', + 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', + 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', + 'b c^2', 'c^3'], feature_names) + # test some unicode + poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) + feature_names = poly.get_feature_names( + ["\u0001F40D", "\u262E", "\u05D0"]) + assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], + feature_names) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + (4, False, False, np.float64), + (4, False, True, np.float64)]) +def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csc = sparse.csc_matrix(X) + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csc = est.fit_transform(X_csc.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert isinstance(Xt_csc, sparse.csc_matrix) + assert Xt_csc.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csc.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64)]) +def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csr = sparse.csr_matrix(X) + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64)]) +def test_polynomial_features_csr_X_floats(deg, include_bias, + interaction_only, dtype): + X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['zero_row_index', 'deg', 'interaction_only'], + [(0, 2, True), (1, 2, True), (2, 2, True), + (0, 3, True), (1, 3, True), (2, 3, True), 
+ (0, 2, False), (1, 2, False), (2, 2, False), + (0, 3, False), (1, 3, False), (2, 3, False)]) +def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, + interaction_only): + X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr() + X_csr[zero_row_index, :] = 0.0 + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=False, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +# This degree should always be one more than the highest degree supported by +# _csr_expansion. +@pytest.mark.parametrize(['include_bias', 'interaction_only'], + [(True, True), (True, False), + (False, True), (False, False)]) +def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only): + X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(4, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'dim', 'interaction_only'], + [(2, 1, True), + (2, 2, True), + (3, 1, True), + (3, 2, True), + (3, 3, True), + (2, 1, False), + (2, 2, False), + (3, 1, False), + (3, 2, False), + (3, 3, False)]) +def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only): + X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(deg, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense)
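
As a quick end-to-end illustration of the equivalence these tests assert (a sketch using only the public API exercised above, with made-up shapes): the CSR fast path and the dense path should agree exactly for degrees handled by `_csr_polynomial_expansion`.

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import PolynomialFeatures

    rng = np.random.RandomState(0)
    X = rng.randint(0, 2, (10, 3)).astype(np.float64)

    est = PolynomialFeatures(degree=3, include_bias=True)
    Xt_dense = est.fit_transform(X)
    # CSR input with degree <= 3 takes the fast expansion path and
    # stays sparse on output.
    Xt_csr = est.fit_transform(sparse.csr_matrix(X))

    assert isinstance(Xt_csr, sparse.csr_matrix)
    np.testing.assert_allclose(Xt_csr.toarray(), Xt_dense)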