From 5645b2487cdaf7eefb530c2898a93d2248c99c1f Mon Sep 17 00:00:00 2001 From: avigupta2612 Date: Thu, 4 Mar 2021 04:40:09 +0530 Subject: [PATCH 1/6] moved Polynomialfeatures to polynomial --- sklearn/preprocessing/__init__.py | 2 +- sklearn/preprocessing/_data.py | 289 ----------------- sklearn/preprocessing/_polynomial.py | 294 +++++++++++++++++- sklearn/preprocessing/tests/test_data.py | 199 ------------ .../preprocessing/tests/test_polynomial.py | 202 ++++++++++++ 5 files changed, 496 insertions(+), 490 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index 076b9e85e1150..a24346dc8d045 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -23,7 +23,6 @@ from ._data import quantile_transform from ._data import power_transform from ._data import PowerTransformer -from ._data import PolynomialFeatures from ._encoders import OneHotEncoder from ._encoders import OrdinalEncoder @@ -36,6 +35,7 @@ from ._discretization import KBinsDiscretizer from ._polynomial import SplineTransformer +from ._polynomial import PolynomialFeatures __all__ = [ diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index 29190dd6e2b67..e45f6e869f45a 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -8,9 +8,7 @@ # License: BSD 3 clause -from itertools import chain, combinations import warnings -from itertools import combinations_with_replacement as combinations_w_r import numpy as np from scipy import sparse @@ -1570,293 +1568,6 @@ def robust_scale(X, *, axis=0, with_centering=True, with_scaling=True, return X -class PolynomialFeatures(TransformerMixin, BaseEstimator): - """Generate polynomial and interaction features. - - Generate a new feature matrix consisting of all polynomial combinations - of the features with degree less than or equal to the specified degree. - For example, if an input sample is two dimensional and of the form - [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. - - Parameters - ---------- - degree : int, default=2 - The degree of the polynomial features. - - interaction_only : bool, default=False - If true, only interaction features are produced: features that are - products of at most ``degree`` *distinct* input features (so not - ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). - - include_bias : bool, default=True - If True (default), then include a bias column, the feature in which - all polynomial powers are zero (i.e. a column of ones - acts as an - intercept term in a linear model). - - order : {'C', 'F'}, default='C' - Order of output array in the dense case. 'F' order is faster to - compute, but may slow down subsequent estimators. - - .. versionadded:: 0.21 - - Examples - -------- - >>> import numpy as np - >>> from sklearn.preprocessing import PolynomialFeatures - >>> X = np.arange(6).reshape(3, 2) - >>> X - array([[0, 1], - [2, 3], - [4, 5]]) - >>> poly = PolynomialFeatures(2) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0., 0., 1.], - [ 1., 2., 3., 4., 6., 9.], - [ 1., 4., 5., 16., 20., 25.]]) - >>> poly = PolynomialFeatures(interaction_only=True) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0.], - [ 1., 2., 3., 6.], - [ 1., 4., 5., 20.]]) - - Attributes - ---------- - powers_ : ndarray of shape (n_output_features, n_input_features) - powers_[i, j] is the exponent of the jth input in the ith output. - - n_input_features_ : int - The total number of input features. - - n_output_features_ : int - The total number of polynomial output features. The number of output - features is computed by iterating over all suitably sized combinations - of input features. - - See Also - -------- - SplineTransformer : Transformer that generates univariate B-spline bases - for features - - Notes - ----- - Be aware that the number of features in the output array scales - polynomially in the number of features of the input array, and - exponentially in the degree. High degrees can cause overfitting. - - See :ref:`examples/linear_model/plot_polynomial_interpolation.py - ` - """ - @_deprecate_positional_args - def __init__(self, degree=2, *, interaction_only=False, include_bias=True, - order='C'): - self.degree = degree - self.interaction_only = interaction_only - self.include_bias = include_bias - self.order = order - - @staticmethod - def _combinations(n_features, degree, interaction_only, include_bias): - comb = (combinations if interaction_only else combinations_w_r) - start = int(not include_bias) - return chain.from_iterable(comb(range(n_features), i) - for i in range(start, degree + 1)) - - @property - def powers_(self): - check_is_fitted(self) - - combinations = self._combinations(self.n_input_features_, self.degree, - self.interaction_only, - self.include_bias) - return np.vstack([np.bincount(c, minlength=self.n_input_features_) - for c in combinations]) - - def get_feature_names(self, input_features=None): - """ - Return feature names for output features - - Parameters - ---------- - input_features : list of str of shape (n_features,), default=None - String names for input features if available. By default, - "x0", "x1", ... "xn_features" is used. - - Returns - ------- - output_feature_names : list of str of shape (n_output_features,) - """ - powers = self.powers_ - if input_features is None: - input_features = ['x%d' % i for i in range(powers.shape[1])] - feature_names = [] - for row in powers: - inds = np.where(row)[0] - if len(inds): - name = " ".join("%s^%d" % (input_features[ind], exp) - if exp != 1 else input_features[ind] - for ind, exp in zip(inds, row[inds])) - else: - name = "1" - feature_names.append(name) - return feature_names - - def fit(self, X, y=None): - """ - Compute number of output features. - - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data. - - y : None - Ignored. - - Returns - ------- - self : object - Fitted transformer. - """ - n_samples, n_features = self._validate_data( - X, accept_sparse=True).shape - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - self.n_input_features_ = n_features - self.n_output_features_ = sum(1 for _ in combinations) - return self - - def transform(self, X): - """Transform data to polynomial features. - - Parameters - ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data to transform, row by row. - - Prefer CSR over CSC for sparse input (for speed), but CSC is - required if the degree is 4 or higher. If the degree is less than - 4 and the input format is CSC, it will be converted to CSR, have - its polynomial features generated, then converted back to CSC. - - If the degree is 2 or 3, the method described in "Leveraging - Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices - Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is - used, which is much faster than the method used on CSC input. For - this reason, a CSC input will be converted to CSR, and the output - will be converted back to CSC prior to being returned, hence the - preference of CSR. - - Returns - ------- - XP : {ndarray, sparse matrix} of shape (n_samples, NP) - The matrix of features, where NP is the number of polynomial - features generated from the combination of inputs. If a sparse - matrix is provided, it will be converted into a sparse - ``csr_matrix``. - """ - check_is_fitted(self) - - X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, - accept_sparse=('csr', 'csc')) - - n_samples, n_features = X.shape - - if n_features != self.n_input_features_: - raise ValueError("X shape does not match training shape") - - if sparse.isspmatrix_csr(X): - if self.degree > 3: - return self.transform(X.tocsc()).tocsr() - to_stack = [] - if self.include_bias: - to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) - to_stack.append(X) - for deg in range(2, self.degree+1): - Xp_next = _csr_polynomial_expansion(X.data, X.indices, - X.indptr, X.shape[1], - self.interaction_only, - deg) - if Xp_next is None: - break - to_stack.append(Xp_next) - XP = sparse.hstack(to_stack, format='csr') - elif sparse.isspmatrix_csc(X) and self.degree < 4: - return self.transform(X.tocsr()).tocsc() - else: - if sparse.isspmatrix(X): - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - columns = [] - for comb in combinations: - if comb: - out_col = 1 - for col_idx in comb: - out_col = X[:, col_idx].multiply(out_col) - columns.append(out_col) - else: - bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) - columns.append(bias) - XP = sparse.hstack(columns, dtype=X.dtype).tocsc() - else: - XP = np.empty((n_samples, self.n_output_features_), - dtype=X.dtype, order=self.order) - - # What follows is a faster implementation of: - # for i, comb in enumerate(combinations): - # XP[:, i] = X[:, comb].prod(1) - # This implementation uses two optimisations. - # First one is broadcasting, - # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] - # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] - # ... - # multiply ([X[:, start:end], X[:, start]) -> ... - # Second optimisation happens for degrees >= 3. - # Xi^3 is computed reusing previous computation: - # Xi^3 = Xi^2 * Xi. - - if self.include_bias: - XP[:, 0] = 1 - current_col = 1 - else: - current_col = 0 - - # d = 0 - XP[:, current_col:current_col + n_features] = X - index = list(range(current_col, - current_col + n_features)) - current_col += n_features - index.append(current_col) - - # d >= 1 - for _ in range(1, self.degree): - new_index = [] - end = index[-1] - for feature_idx in range(n_features): - start = index[feature_idx] - new_index.append(current_col) - if self.interaction_only: - start += (index[feature_idx + 1] - - index[feature_idx]) - next_col = current_col + end - start - if next_col <= current_col: - break - # XP[:, start:end] are terms of degree d - 1 - # that exclude feature #feature_idx. - np.multiply(XP[:, start:end], - X[:, feature_idx:feature_idx + 1], - out=XP[:, current_col:next_col], - casting='no') - current_col = next_col - - new_index.append(current_col) - index = new_index - - return XP - - @_deprecate_positional_args def normalize(X, norm='l2', *, axis=1, copy=True, return_norm=False): """Scale input vectors individually to unit norm (vector length). diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 26587e7f05823..73a0af963ad33 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -2,14 +2,19 @@ This file contains preprocessing tools based on polynomials. """ import numbers +from itertools import chain, combinations +from itertools import combinations_with_replacement as combinations_w_r import numpy as np +from scipy import sparse from scipy.interpolate import BSpline from ..base import BaseEstimator, TransformerMixin from ..utils import check_array from ..utils.fixes import linspace -from ..utils.validation import check_is_fitted, FLOAT_DTYPES +from ..utils.validation import (check_is_fitted, FLOAT_DTYPES, + _deprecate_positional_args) +from ._csr_polynomial_expansion import _csr_polynomial_expansion __all__ = [ @@ -427,3 +432,290 @@ def transform(self, X): j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0 ] return XBS[:, indices] + + +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features. + + Generate a new feature matrix consisting of all polynomial combinations + of the features with degree less than or equal to the specified degree. + For example, if an input sample is two dimensional and of the form + [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + + Parameters + ---------- + degree : int, default=2 + The degree of the polynomial features. + + interaction_only : bool, default=False + If true, only interaction features are produced: features that are + products of at most ``degree`` *distinct* input features (so not + ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). + + include_bias : bool, default=True + If True (default), then include a bias column, the feature in which + all polynomial powers are zero (i.e. a column of ones - acts as an + intercept term in a linear model). + + order : {'C', 'F'}, default='C' + Order of output array in the dense case. 'F' order is faster to + compute, but may slow down subsequent estimators. + + .. versionadded:: 0.21 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) + + Attributes + ---------- + powers_ : ndarray of shape (n_output_features, n_input_features) + powers_[i, j] is the exponent of the jth input in the ith output. + + n_input_features_ : int + The total number of input features. + + n_output_features_ : int + The total number of polynomial output features. The number of output + features is computed by iterating over all suitably sized combinations + of input features. + + See Also + -------- + SplineTransformer : Transformer that generates univariate B-spline bases + for features + + Notes + ----- + Be aware that the number of features in the output array scales + polynomially in the number of features of the input array, and + exponentially in the degree. High degrees can cause overfitting. + + See :ref:`examples/linear_model/plot_polynomial_interpolation.py + ` + """ + @_deprecate_positional_args + def __init__(self, degree=2, *, interaction_only=False, include_bias=True, + order='C'): + self.degree = degree + self.interaction_only = interaction_only + self.include_bias = include_bias + self.order = order + + @staticmethod + def _combinations(n_features, degree, interaction_only, include_bias): + comb = (combinations if interaction_only else combinations_w_r) + start = int(not include_bias) + return chain.from_iterable(comb(range(n_features), i) + for i in range(start, degree + 1)) + + @property + def powers_(self): + check_is_fitted(self) + + combinations = self._combinations(self.n_input_features_, self.degree, + self.interaction_only, + self.include_bias) + return np.vstack([np.bincount(c, minlength=self.n_input_features_) + for c in combinations]) + + def get_feature_names(self, input_features=None): + """ + Return feature names for output features + + Parameters + ---------- + input_features : list of str of shape (n_features,), default=None + String names for input features if available. By default, + "x0", "x1", ... "xn_features" is used. + + Returns + ------- + output_feature_names : list of str of shape (n_output_features,) + """ + powers = self.powers_ + if input_features is None: + input_features = ['x%d' % i for i in range(powers.shape[1])] + feature_names = [] + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join("%s^%d" % (input_features[ind], exp) + if exp != 1 else input_features[ind] + for ind, exp in zip(inds, row[inds])) + else: + name = "1" + feature_names.append(name) + return feature_names + + def fit(self, X, y=None): + """ + Compute number of output features. + + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data. + + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + n_samples, n_features = self._validate_data( + X, accept_sparse=True).shape + combinations = self._combinations(n_features, self.degree, + self.interaction_only, + self.include_bias) + self.n_input_features_ = n_features + self.n_output_features_ = sum(1 for _ in combinations) + return self + + def transform(self, X): + """Transform data to polynomial features. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform, row by row. + + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. + + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. + + Returns + ------- + XP : {ndarray, sparse matrix} of shape (n_samples, NP) + The matrix of features, where NP is the number of polynomial + features generated from the combination of inputs. If a sparse + matrix is provided, it will be converted into a sparse + ``csr_matrix``. + """ + check_is_fitted(self) + + X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, + accept_sparse=('csr', 'csc')) + + n_samples, n_features = X.shape + + if n_features != self.n_input_features_: + raise ValueError("X shape does not match training shape") + + if sparse.isspmatrix_csr(X): + if self.degree > 3: + return self.transform(X.tocsc()).tocsr() + to_stack = [] + if self.include_bias: + to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + to_stack.append(X) + for deg in range(2, self.degree+1): + Xp_next = _csr_polynomial_expansion(X.data, X.indices, + X.indptr, X.shape[1], + self.interaction_only, + deg) + if Xp_next is None: + break + to_stack.append(Xp_next) + XP = sparse.hstack(to_stack, format='csr') + elif sparse.isspmatrix_csc(X) and self.degree < 4: + return self.transform(X.tocsr()).tocsc() + else: + if sparse.isspmatrix(X): + combinations = self._combinations(n_features, self.degree, + self.interaction_only, + self.include_bias) + columns = [] + for comb in combinations: + if comb: + out_col = 1 + for col_idx in comb: + out_col = X[:, col_idx].multiply(out_col) + columns.append(out_col) + else: + bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + columns.append(bias) + XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + else: + XP = np.empty((n_samples, self.n_output_features_), + dtype=X.dtype, order=self.order) + + # What follows is a faster implementation of: + # for i, comb in enumerate(combinations): + # XP[:, i] = X[:, comb].prod(1) + # This implementation uses two optimisations. + # First one is broadcasting, + # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] + # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] + # ... + # multiply ([X[:, start:end], X[:, start]) -> ... + # Second optimisation happens for degrees >= 3. + # Xi^3 is computed reusing previous computation: + # Xi^3 = Xi^2 * Xi. + + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 + + # d = 0 + XP[:, current_col:current_col + n_features] = X + index = list(range(current_col, + current_col + n_features)) + current_col += n_features + index.append(current_col) + + # d >= 1 + for _ in range(1, self.degree): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + if self.interaction_only: + start += (index[feature_idx + 1] - + index[feature_idx]) + next_col = current_col + end - start + if next_col <= current_col: + break + # XP[:, start:end] are terms of degree d - 1 + # that exclude feature #feature_idx. + np.multiply(XP[:, start:end], + X[:, feature_idx:feature_idx + 1], + out=XP[:, current_col:next_col], + casting='no') + current_col = next_col + + new_index.append(current_col) + index = new_index + + return XP diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index fdd88be0ccff4..196060388ddd2 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -10,7 +10,6 @@ import numpy as np import numpy.linalg as la from scipy import sparse, stats -from scipy.sparse import random as sparse_random import pytest @@ -43,7 +42,6 @@ from sklearn.preprocessing import RobustScaler from sklearn.preprocessing import robust_scale from sklearn.preprocessing import add_dummy_feature -from sklearn.preprocessing import PolynomialFeatures from sklearn.preprocessing import PowerTransformer from sklearn.preprocessing import power_transform from sklearn.preprocessing._data import _handle_zeros_in_scale @@ -94,203 +92,6 @@ def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen) -def test_polynomial_features(): - # Test Polynomial Features - X1 = np.arange(6)[:, np.newaxis] - P1 = np.hstack([np.ones_like(X1), - X1, X1 ** 2, X1 ** 3]) - deg1 = 3 - - X2 = np.arange(6).reshape((3, 2)) - x1 = X2[:, :1] - x2 = X2[:, 1:] - P2 = np.hstack([x1 ** 0 * x2 ** 0, - x1 ** 1 * x2 ** 0, - x1 ** 0 * x2 ** 1, - x1 ** 2 * x2 ** 0, - x1 ** 1 * x2 ** 1, - x1 ** 0 * x2 ** 2]) - deg2 = 2 - - for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: - P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) - assert_array_almost_equal(P_test, P) - - P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) - assert_array_almost_equal(P_test, P[:, 1:]) - - interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) - X_poly = interact.fit_transform(X) - assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) - - assert interact.powers_.shape == (interact.n_output_features_, - interact.n_input_features_) - - -def test_polynomial_feature_names(): - X = np.arange(30).reshape(10, 3) - poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) - feature_names = poly.get_feature_names() - assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', - 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], - feature_names) - - poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) - feature_names = poly.get_feature_names(["a", "b", "c"]) - assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', - 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', - 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', - 'b c^2', 'c^3'], feature_names) - # test some unicode - poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) - feature_names = poly.get_feature_names( - ["\u0001F40D", "\u262E", "\u05D0"]) - assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], - feature_names) - - -def test_polynomial_feature_array_order(): - """Test that output array has the given order.""" - X = np.arange(10).reshape(5, 2) - - def is_c_contiguous(a): - return np.isfortran(a.T) - - assert is_c_contiguous(PolynomialFeatures().fit_transform(X)) - assert is_c_contiguous(PolynomialFeatures(order='C').fit_transform(X)) - assert np.isfortran(PolynomialFeatures(order='F').fit_transform(X)) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(1, True, False, int), - (2, True, False, int), - (2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64), - (4, False, False, np.float64), - (4, False, True, np.float64)]) -def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype): - rng = np.random.RandomState(0) - X = rng.randint(0, 2, (100, 2)) - X_csc = sparse.csc_matrix(X) - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csc = est.fit_transform(X_csc.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype)) - - assert isinstance(Xt_csc, sparse.csc_matrix) - assert Xt_csc.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csc.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(1, True, False, int), - (2, True, False, int), - (2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64)]) -def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): - rng = np.random.RandomState(0) - X = rng.randint(0, 2, (100, 2)) - X_csr = sparse.csr_matrix(X) - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], - [(2, True, False, np.float32), - (2, True, False, np.float64), - (3, False, False, np.float64), - (3, False, True, np.float64)]) -def test_polynomial_features_csr_X_floats(deg, include_bias, - interaction_only, dtype): - X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(deg, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr.astype(dtype)) - Xt_dense = est.fit_transform(X.astype(dtype)) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['zero_row_index', 'deg', 'interaction_only'], - [(0, 2, True), (1, 2, True), (2, 2, True), - (0, 3, True), (1, 3, True), (2, 3, True), - (0, 2, False), (1, 2, False), (2, 2, False), - (0, 3, False), (1, 3, False), (2, 3, False)]) -def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, - interaction_only): - X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr() - X_csr[zero_row_index, :] = 0.0 - X = X_csr.toarray() - - est = PolynomialFeatures(deg, include_bias=False, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -# This degree should always be one more than the highest degree supported by -# _csr_expansion. -@pytest.mark.parametrize(['include_bias', 'interaction_only'], - [(True, True), (True, False), - (False, True), (False, False)]) -def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only): - X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(4, include_bias=include_bias, - interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - -@pytest.mark.parametrize(['deg', 'dim', 'interaction_only'], - [(2, 1, True), - (2, 2, True), - (3, 1, True), - (3, 2, True), - (3, 3, True), - (2, 1, False), - (2, 2, False), - (3, 1, False), - (3, 2, False), - (3, 3, False)]) -def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only): - X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr() - X = X_csr.toarray() - - est = PolynomialFeatures(deg, interaction_only=interaction_only) - Xt_csr = est.fit_transform(X_csr) - Xt_dense = est.fit_transform(X) - - assert isinstance(Xt_csr, sparse.csr_matrix) - assert Xt_csr.dtype == Xt_dense.dtype - assert_array_almost_equal(Xt_csr.A, Xt_dense) - - def test_raises_value_error_if_sample_weights_greater_than_1d(): # Sample weights must be either scalar or 1D diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 2ca3260f7c05e..53d4c74005f07 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -1,10 +1,15 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest +from scipy import sparse +from scipy.sparse import random as sparse_random + +from sklearn.utils._testing import assert_array_almost_equal from sklearn.linear_model import LinearRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import KBinsDiscretizer, SplineTransformer +from sklearn.preprocessing import PolynomialFeatures # TODO: add PolynomialFeatures if it moves to _polynomial.py @@ -259,3 +264,200 @@ def test_spline_transformer_n_features_out(n_knots, include_bias, degree): splt.fit(X) assert splt.transform(X).shape[1] == splt.n_features_out_ + + +def test_polynomial_features(): + # Test Polynomial Features + X1 = np.arange(6)[:, np.newaxis] + P1 = np.hstack([np.ones_like(X1), + X1, X1 ** 2, X1 ** 3]) + deg1 = 3 + + X2 = np.arange(6).reshape((3, 2)) + x1 = X2[:, :1] + x2 = X2[:, 1:] + P2 = np.hstack([x1 ** 0 * x2 ** 0, + x1 ** 1 * x2 ** 0, + x1 ** 0 * x2 ** 1, + x1 ** 2 * x2 ** 0, + x1 ** 1 * x2 ** 1, + x1 ** 0 * x2 ** 2]) + deg2 = 2 + + for (deg, X, P) in [(deg1, X1, P1), (deg2, X2, P2)]: + P_test = PolynomialFeatures(deg, include_bias=True).fit_transform(X) + assert_array_almost_equal(P_test, P) + + P_test = PolynomialFeatures(deg, include_bias=False).fit_transform(X) + assert_array_almost_equal(P_test, P[:, 1:]) + + interact = PolynomialFeatures(2, interaction_only=True, include_bias=True) + X_poly = interact.fit_transform(X) + assert_array_almost_equal(X_poly, P2[:, [0, 1, 2, 4]]) + + assert interact.powers_.shape == (interact.n_output_features_, + interact.n_input_features_) + + +def test_polynomial_feature_names(): + X = np.arange(30).reshape(10, 3) + poly = PolynomialFeatures(degree=2, include_bias=True).fit(X) + feature_names = poly.get_feature_names() + assert_array_equal(['1', 'x0', 'x1', 'x2', 'x0^2', 'x0 x1', + 'x0 x2', 'x1^2', 'x1 x2', 'x2^2'], + feature_names) + + poly = PolynomialFeatures(degree=3, include_bias=False).fit(X) + feature_names = poly.get_feature_names(["a", "b", "c"]) + assert_array_equal(['a', 'b', 'c', 'a^2', 'a b', 'a c', 'b^2', + 'b c', 'c^2', 'a^3', 'a^2 b', 'a^2 c', + 'a b^2', 'a b c', 'a c^2', 'b^3', 'b^2 c', + 'b c^2', 'c^3'], feature_names) + # test some unicode + poly = PolynomialFeatures(degree=1, include_bias=True).fit(X) + feature_names = poly.get_feature_names( + ["\u0001F40D", "\u262E", "\u05D0"]) + assert_array_equal(["1", "\u0001F40D", "\u262E", "\u05D0"], + feature_names) + + +def test_polynomial_feature_array_order(): + """Test that output array has the given order.""" + X = np.arange(10).reshape(5, 2) + + def is_c_contiguous(a): + return np.isfortran(a.T) + + assert is_c_contiguous(PolynomialFeatures().fit_transform(X)) + assert is_c_contiguous(PolynomialFeatures(order='C').fit_transform(X)) + assert np.isfortran(PolynomialFeatures(order='F').fit_transform(X)) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64), + (4, False, False, np.float64), + (4, False, True, np.float64)]) +def test_polynomial_features_csc_X(deg, include_bias, interaction_only, dtype): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csc = sparse.csc_matrix(X) + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csc = est.fit_transform(X_csc.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert isinstance(Xt_csc, sparse.csc_matrix) + assert Xt_csc.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csc.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(1, True, False, int), + (2, True, False, int), + (2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64)]) +def test_polynomial_features_csr_X(deg, include_bias, interaction_only, dtype): + rng = np.random.RandomState(0) + X = rng.randint(0, 2, (100, 2)) + X_csr = sparse.csr_matrix(X) + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype, copy=False)) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], + [(2, True, False, np.float32), + (2, True, False, np.float64), + (3, False, False, np.float64), + (3, False, True, np.float64)]) +def test_polynomial_features_csr_X_floats(deg, include_bias, + interaction_only, dtype): + X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr.astype(dtype)) + Xt_dense = est.fit_transform(X.astype(dtype)) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['zero_row_index', 'deg', 'interaction_only'], + [(0, 2, True), (1, 2, True), (2, 2, True), + (0, 3, True), (1, 3, True), (2, 3, True), + (0, 2, False), (1, 2, False), (2, 2, False), + (0, 3, False), (1, 3, False), (2, 3, False)]) +def test_polynomial_features_csr_X_zero_row(zero_row_index, deg, + interaction_only): + X_csr = sparse_random(3, 10, 1.0, random_state=0).tocsr() + X_csr[zero_row_index, :] = 0.0 + X = X_csr.toarray() + + est = PolynomialFeatures(deg, include_bias=False, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +# This degree should always be one more than the highest degree supported by +# _csr_expansion. +@pytest.mark.parametrize(['include_bias', 'interaction_only'], + [(True, True), (True, False), + (False, True), (False, False)]) +def test_polynomial_features_csr_X_degree_4(include_bias, interaction_only): + X_csr = sparse_random(1000, 10, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(4, include_bias=include_bias, + interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) + + +@pytest.mark.parametrize(['deg', 'dim', 'interaction_only'], + [(2, 1, True), + (2, 2, True), + (3, 1, True), + (3, 2, True), + (3, 3, True), + (2, 1, False), + (2, 2, False), + (3, 1, False), + (3, 2, False), + (3, 3, False)]) +def test_polynomial_features_csr_X_dim_edges(deg, dim, interaction_only): + X_csr = sparse_random(1000, dim, 0.5, random_state=0).tocsr() + X = X_csr.toarray() + + est = PolynomialFeatures(deg, interaction_only=interaction_only) + Xt_csr = est.fit_transform(X_csr) + Xt_dense = est.fit_transform(X) + + assert isinstance(Xt_csr, sparse.csr_matrix) + assert Xt_csr.dtype == Xt_dense.dtype + assert_array_almost_equal(Xt_csr.A, Xt_dense) From 300a3fd6b587194cd96bb5ce9eadb5c815ceb31f Mon Sep 17 00:00:00 2001 From: avigupta2612 Date: Thu, 4 Mar 2021 05:21:01 +0530 Subject: [PATCH 2/6] lint fix --- sklearn/preprocessing/_data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py index e45f6e869f45a..5e85b932a1e39 100644 --- a/sklearn/preprocessing/_data.py +++ b/sklearn/preprocessing/_data.py @@ -29,7 +29,6 @@ from ..utils.validation import (check_is_fitted, check_random_state, _check_sample_weight, FLOAT_DTYPES, _deprecate_positional_args) -from ._csr_polynomial_expansion import _csr_polynomial_expansion from ._encoders import OneHotEncoder From 070e2fd71c314a27a6e83a6d409b183eb79a7734 Mon Sep 17 00:00:00 2001 From: avigupta2612 Date: Sat, 13 Mar 2021 02:38:47 +0530 Subject: [PATCH 3/6] order changes --- sklearn/preprocessing/__init__.py | 2 +- sklearn/preprocessing/_polynomial.py | 1070 ++++++++--------- .../preprocessing/tests/test_polynomial.py | 6 +- 3 files changed, 539 insertions(+), 539 deletions(-) diff --git a/sklearn/preprocessing/__init__.py b/sklearn/preprocessing/__init__.py index a24346dc8d045..6653088ba85a7 100644 --- a/sklearn/preprocessing/__init__.py +++ b/sklearn/preprocessing/__init__.py @@ -34,8 +34,8 @@ from ._discretization import KBinsDiscretizer -from ._polynomial import SplineTransformer from ._polynomial import PolynomialFeatures +from ._polynomial import SplineTransformer __all__ = [ diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 73a0af963ad33..1cf2135903a62 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -22,161 +22,110 @@ ] -# TODO: -# - sparse support (either scipy or own cython solution)? -# - extrapolation (cyclic) -class SplineTransformer(TransformerMixin, BaseEstimator): - """Generate univariate B-spline bases for features. - - Generate a new feature matrix consisting of - `n_splines=n_knots + degree - 1` spline basis functions (B-splines) of - polynomial order=`degree` for each feature. - - Read more in the :ref:`User Guide `. +class PolynomialFeatures(TransformerMixin, BaseEstimator): + """Generate polynomial and interaction features. - .. versionadded:: 1.0 + Generate a new feature matrix consisting of all polynomial combinations + of the features with degree less than or equal to the specified degree. + For example, if an input sample is two dimensional and of the form + [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. Parameters ---------- - n_knots : int, default=5 - Number of knots of the splines if `knots` equals one of - {'uniform', 'quantile'}. Must be larger or equal 2. - - degree : int, default=3 - The polynomial degree of the spline basis. Must be a non-negative - integer. - - knots : {'uniform', 'quantile'} or array-like of shape \ - (n_knots, n_features), default='uniform' - Set knot positions such that first knot <= features <= last knot. - - - If 'uniform', `n_knots` number of knots are distributed uniformly - from min to max values of the features. - - If 'quantile', they are distributed uniformly along the quantiles of - the features. - - If an array-like is given, it directly specifies the sorted knot - positions including the boundary knots. Note that, internally, - `degree` number of knots are added before the first knot, the same - after the last knot. + degree : int, default=2 + The degree of the polynomial features. - extrapolation : {'error', 'constant', 'linear', 'continue'}, \ - default='constant' - If 'error', values outside the min and max values of the training - features raises a `ValueError`. If 'constant', the value of the - splines at minimum and maximum value of the features is used as - constant extrapolation. If 'linear', a linear extrapolation is used. - If 'continue', the splines are extrapolated as is, i.e. option - `extrapolate=True` in :class:`scipy.interpolate.BSpline`. + interaction_only : bool, default=False + If true, only interaction features are produced: features that are + products of at most ``degree`` *distinct* input features (so not + ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). include_bias : bool, default=True - If True (default), then the last spline element inside the data range - of a feature is dropped. As B-splines sum to one over the spline basis - functions for each data point, they implicitly include a bias term, - i.e. a column of ones. It acts as an intercept term in a linear models. + If True (default), then include a bias column, the feature in which + all polynomial powers are zero (i.e. a column of ones - acts as an + intercept term in a linear model). order : {'C', 'F'}, default='C' - Order of output array. 'F' order is faster to compute, but may slow - down subsequent estimators. + Order of output array in the dense case. 'F' order is faster to + compute, but may slow down subsequent estimators. + + .. versionadded:: 0.21 + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import PolynomialFeatures + >>> X = np.arange(6).reshape(3, 2) + >>> X + array([[0, 1], + [2, 3], + [4, 5]]) + >>> poly = PolynomialFeatures(2) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0., 0., 1.], + [ 1., 2., 3., 4., 6., 9.], + [ 1., 4., 5., 16., 20., 25.]]) + >>> poly = PolynomialFeatures(interaction_only=True) + >>> poly.fit_transform(X) + array([[ 1., 0., 1., 0.], + [ 1., 2., 3., 6.], + [ 1., 4., 5., 20.]]) Attributes ---------- - bsplines_ : list of shape (n_features,) - List of BSplines objects, one for each feature. + powers_ : ndarray of shape (n_output_features, n_input_features) + powers_[i, j] is the exponent of the jth input in the ith output. - n_features_in_ : int + n_input_features_ : int The total number of input features. - n_features_out_ : int - The total number of output features, which is computed as - `n_features * n_splines`, where `n_splines` is - the number of bases elements of the B-splines, `n_knots + degree - 1`. - If `include_bias=False`, then it is only - `n_features * (n_splines - 1)`. + n_output_features_ : int + The total number of polynomial output features. The number of output + features is computed by iterating over all suitably sized combinations + of input features. See Also -------- - KBinsDiscretizer : Transformer that bins continuous data into intervals. - - PolynomialFeatures : Transformer that generates polynomial and interaction - features. + SplineTransformer : Transformer that generates univariate B-spline bases + for features Notes ----- - High degrees and a high number of knots can cause overfitting. + Be aware that the number of features in the output array scales + polynomially in the number of features of the input array, and + exponentially in the degree. High degrees can cause overfitting. See :ref:`examples/linear_model/plot_polynomial_interpolation.py - `. - - Examples - -------- - >>> import numpy as np - >>> from sklearn.preprocessing import SplineTransformer - >>> X = np.arange(6).reshape(6, 1) - >>> spline = SplineTransformer(degree=2, n_knots=3) - >>> spline.fit_transform(X) - array([[0.5 , 0.5 , 0. , 0. ], - [0.18, 0.74, 0.08, 0. ], - [0.02, 0.66, 0.32, 0. ], - [0. , 0.32, 0.66, 0.02], - [0. , 0.08, 0.74, 0.18], - [0. , 0. , 0.5 , 0.5 ]]) + ` """ - - def __init__( - self, - n_knots=5, - degree=3, - *, - knots="uniform", - extrapolation="constant", - include_bias=True, - order="C", - ): - self.n_knots = n_knots + @_deprecate_positional_args + def __init__(self, degree=2, *, interaction_only=False, include_bias=True, + order='C'): self.degree = degree - self.knots = knots - self.extrapolation = extrapolation + self.interaction_only = interaction_only self.include_bias = include_bias self.order = order @staticmethod - def _get_base_knot_positions(X, n_knots=10, knots="uniform"): - """Calculate base knot positions. - - Base knots such that first knot <= feature <= last knot. For the - B-spline construction with scipy.interpolate.BSpline, 2*degree knots - beyond the base interval are added. + def _combinations(n_features, degree, interaction_only, include_bias): + comb = (combinations if interaction_only else combinations_w_r) + start = int(not include_bias) + return chain.from_iterable(comb(range(n_features), i) + for i in range(start, degree + 1)) - Returns - ------- - knots : ndarray of shape (n_knots, n_features), dtype=np.float64 - Knot positions (points) of base interval. - """ - if knots == "quantile": - knots = np.percentile( - X, - 100 - * np.linspace(start=0, stop=1, num=n_knots, dtype=np.float64), - axis=0, - ) - else: - # knots == 'uniform': - # Note that the variable `knots` has already been validated and - # `else` is therefore safe. - x_min = np.amin(X, axis=0) - x_max = np.amax(X, axis=0) - knots = linspace( - start=x_min, - stop=x_max, - num=n_knots, - endpoint=True, - dtype=np.float64, - ) + @property + def powers_(self): + check_is_fitted(self) - return knots + combinations = self._combinations(self.n_input_features_, self.degree, + self.interaction_only, + self.include_bias) + return np.vstack([np.bincount(c, minlength=self.n_input_features_) + for c in combinations]) def get_feature_names(self, input_features=None): - """Return feature names for output features. + """ + Return feature names for output features Parameters ---------- @@ -188,21 +137,29 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ - n_splines = self.bsplines_[0].c.shape[0] + powers = self.powers_ if input_features is None: - input_features = ["x%d" % i for i in range(self.n_features_in_)] + input_features = ['x%d' % i for i in range(powers.shape[1])] feature_names = [] - for i in range(self.n_features_in_): - for j in range(n_splines - 1 + self.include_bias): - feature_names.append(f"{input_features[i]}_sp_{j}") + for row in powers: + inds = np.where(row)[0] + if len(inds): + name = " ".join("%s^%d" % (input_features[ind], exp) + if exp != 1 else input_features[ind] + for ind, exp in zip(inds, row[inds])) + else: + name = "1" + feature_names.append(name) return feature_names def fit(self, X, y=None): - """Compute knot positions of splines. + """ + Compute number of output features. + Parameters ---------- - X : array-like of shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) The data. y : None @@ -213,331 +170,300 @@ def fit(self, X, y=None): self : object Fitted transformer. """ - X = self._validate_data( - X, - reset=True, - accept_sparse=False, - ensure_min_samples=2, - ensure_2d=True, - ) - n_samples, n_features = X.shape + n_samples, n_features = self._validate_data( + X, accept_sparse=True).shape + combinations = self._combinations(n_features, self.degree, + self.interaction_only, + self.include_bias) + self.n_input_features_ = n_features + self.n_output_features_ = sum(1 for _ in combinations) + return self - if not ( - isinstance(self.degree, numbers.Integral) and self.degree >= 0 - ): - raise ValueError("degree must be a non-negative integer.") + def transform(self, X): + """Transform data to polynomial features. - if not ( - isinstance(self.n_knots, numbers.Integral) and self.n_knots >= 2 - ): - raise ValueError("n_knots must be a positive integer >= 2.") + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The data to transform, row by row. - if isinstance(self.knots, str) and self.knots in [ - "uniform", - "quantile", - ]: - base_knots = self._get_base_knot_positions( - X, n_knots=self.n_knots, knots=self.knots - ) - else: - base_knots = check_array(self.knots) - if base_knots.shape[0] < 2: - raise ValueError( - "Number of knots, knots.shape[0], must be >= " "2." - ) - elif base_knots.shape[1] != n_features: - raise ValueError("knots.shape[1] == n_features is violated.") - elif not np.all(np.diff(base_knots, axis=0) > 0): - raise ValueError("knots must be sorted without duplicates.") - - if self.extrapolation not in ( - "error", - "constant", - "linear", - "continue", - ): - raise ValueError( - "extrapolation must be one of 'error', " - "'constant', 'linear' or 'continue'." - ) - - if not isinstance(self.include_bias, (bool, np.bool_)): - raise ValueError("include_bias must be bool.") - - # number of knots for base interval - n_knots = base_knots.shape[0] - # number of splines basis functions - n_splines = n_knots + self.degree - 1 - degree = self.degree - n_out = n_features * n_splines - # We have to add degree number of knots below, and degree number knots - # above the base knots in order to make the spline basis complete. - # Eilers & Marx in "Flexible smoothing with B-splines and penalties" - # https://doi.org/10.1214/ss/1038425655 advice against repeating first - # and last knot several times, which would have inferior behaviour at - # boundaries if combined with a penalty (hence P-Spline). We follow - # this advice even if our splines are unpenalized. - # Meaning we do not: - # knots = np.r_[np.tile(base_knots.min(axis=0), reps=[degree, 1]), - # base_knots, - # np.tile(base_knots.max(axis=0), reps=[degree, 1]) - # ] - # Instead, we reuse the distance of the 2 fist/last knots. - dist_min = base_knots[1] - base_knots[0] - dist_max = base_knots[-1] - base_knots[-2] - knots = np.r_[ - linspace( - base_knots[0] - degree * dist_min, - base_knots[0] - dist_min, - num=degree, - ), - base_knots, - linspace( - base_knots[-1] + dist_max, - base_knots[-1] + degree * dist_max, - num=degree, - ), - ] - - # With a diagonal coefficient matrix, we get back the spline basis - # elements, i.e. the design matrix of the spline. - # Note, BSpline appreciates C-contiguous float64 arrays as c=coef. - coef = np.eye(n_knots + self.degree - 1, dtype=np.float64) - extrapolate = self.extrapolation == "continue" - bsplines = [ - BSpline.construct_fast( - knots[:, i], coef, self.degree, extrapolate=extrapolate - ) - for i in range(n_features) - ] - self.bsplines_ = bsplines - - self.n_features_out_ = n_out - n_features * (1 - self.include_bias) - return self - - def transform(self, X): - """Transform each feature data to B-splines. + Prefer CSR over CSC for sparse input (for speed), but CSC is + required if the degree is 4 or higher. If the degree is less than + 4 and the input format is CSC, it will be converted to CSR, have + its polynomial features generated, then converted back to CSC. - Parameters - ---------- - X : array-like of shape (n_samples, n_features) - The data to transform. + If the degree is 2 or 3, the method described in "Leveraging + Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices + Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is + used, which is much faster than the method used on CSC input. For + this reason, a CSC input will be converted to CSR, and the output + will be converted back to CSC prior to being returned, hence the + preference of CSR. Returns ------- - XBS : ndarray of shape (n_samples, n_features * n_splines) - The matrix of features, where n_splines is the number of bases - elements of the B-splines, n_knots + degree - 1. + XP : {ndarray, sparse matrix} of shape (n_samples, NP) + The matrix of features, where NP is the number of polynomial + features generated from the combination of inputs. If a sparse + matrix is provided, it will be converted into a sparse + ``csr_matrix``. """ check_is_fitted(self) - X = self._validate_data( - X, reset=False, accept_sparse=False, ensure_2d=True - ) + X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, + accept_sparse=('csr', 'csc')) n_samples, n_features = X.shape - n_splines = self.bsplines_[0].c.shape[0] - degree = self.degree - # Note that scipy BSpline returns float64 arrays and converts input - # x=X[:, i] to c-contiguous float64. - n_out = self.n_features_out_ + n_features * (1 - self.include_bias) - if X.dtype in FLOAT_DTYPES: - dtype = X.dtype + if n_features != self.n_input_features_: + raise ValueError("X shape does not match training shape") + + if sparse.isspmatrix_csr(X): + if self.degree > 3: + return self.transform(X.tocsc()).tocsr() + to_stack = [] + if self.include_bias: + to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) + to_stack.append(X) + for deg in range(2, self.degree+1): + Xp_next = _csr_polynomial_expansion(X.data, X.indices, + X.indptr, X.shape[1], + self.interaction_only, + deg) + if Xp_next is None: + break + to_stack.append(Xp_next) + XP = sparse.hstack(to_stack, format='csr') + elif sparse.isspmatrix_csc(X) and self.degree < 4: + return self.transform(X.tocsr()).tocsc() else: - dtype = np.float64 - XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order) + if sparse.isspmatrix(X): + combinations = self._combinations(n_features, self.degree, + self.interaction_only, + self.include_bias) + columns = [] + for comb in combinations: + if comb: + out_col = 1 + for col_idx in comb: + out_col = X[:, col_idx].multiply(out_col) + columns.append(out_col) + else: + bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) + columns.append(bias) + XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + else: + XP = np.empty((n_samples, self.n_output_features_), + dtype=X.dtype, order=self.order) - for i in range(n_features): - spl = self.bsplines_[i] + # What follows is a faster implementation of: + # for i, comb in enumerate(combinations): + # XP[:, i] = X[:, comb].prod(1) + # This implementation uses two optimisations. + # First one is broadcasting, + # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] + # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] + # ... + # multiply ([X[:, start:end], X[:, start]) -> ... + # Second optimisation happens for degrees >= 3. + # Xi^3 is computed reusing previous computation: + # Xi^3 = Xi^2 * Xi. - if self.extrapolation in ("continue", "error"): - XBS[:, (i * n_splines):((i + 1) * n_splines)] = spl(X[:, i]) - else: - xmin = spl.t[degree] - xmax = spl.t[-degree - 1] - mask = (xmin <= X[:, i]) & (X[:, i] <= xmax) - XBS[mask, (i * n_splines):((i + 1) * n_splines)] = spl( - X[mask, i] - ) + if self.include_bias: + XP[:, 0] = 1 + current_col = 1 + else: + current_col = 0 - # Note for extrapolation: - # 'continue' is already returned as is by scipy BSplines - if self.extrapolation == "error": - # BSpline with extrapolate=False does not raise an error, but - # output np.nan. - if np.any( - np.isnan(XBS[:, (i * n_splines):((i + 1) * n_splines)]) - ): - raise ValueError( - "X contains values beyond the limits of the knots." - ) - elif self.extrapolation == "constant": - # Set all values beyond xmin and xmax to the value of the - # spline basis functions at those two positions. - # Only the first degree and last degree number of splines - # have non-zero values at the boundaries. + # d = 0 + XP[:, current_col:current_col + n_features] = X + index = list(range(current_col, + current_col + n_features)) + current_col += n_features + index.append(current_col) - # spline values at boundaries - f_min = spl(xmin) - f_max = spl(xmax) - mask = X[:, i] < xmin - if np.any(mask): - XBS[ - mask, (i * n_splines):(i * n_splines + degree) - ] = f_min[:degree] + # d >= 1 + for _ in range(1, self.degree): + new_index = [] + end = index[-1] + for feature_idx in range(n_features): + start = index[feature_idx] + new_index.append(current_col) + if self.interaction_only: + start += (index[feature_idx + 1] - + index[feature_idx]) + next_col = current_col + end - start + if next_col <= current_col: + break + # XP[:, start:end] are terms of degree d - 1 + # that exclude feature #feature_idx. + np.multiply(XP[:, start:end], + X[:, feature_idx:feature_idx + 1], + out=XP[:, current_col:next_col], + casting='no') + current_col = next_col - mask = X[:, i] > xmax - if np.any(mask): - XBS[ - mask, - ((i + 1) * n_splines - degree):((i + 1) * n_splines), - ] = f_max[-degree:] - elif self.extrapolation == "linear": - # Continue the degree first and degree last spline bases - # linearly beyond the boundaries, with slope = derivative at - # the boundary. - # Note that all others have derivative = value = 0 at the - # boundaries. + new_index.append(current_col) + index = new_index - # spline values at boundaries - f_min, f_max = spl(xmin), spl(xmax) - # spline derivatives = slopes at boundaries - fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1) - # Compute the linear continuation. - if degree <= 1: - # For degree=1, the derivative of 2nd spline is not zero at - # boundary. For degree=0 it is the same as 'constant'. - degree += 1 - for j in range(degree): - mask = X[:, i] < xmin - if np.any(mask): - XBS[mask, i * n_splines + j] = ( - f_min[j] + (X[mask, i] - xmin) * fp_min[j] - ) + return XP - mask = X[:, i] > xmax - if np.any(mask): - k = n_splines - 1 - j - XBS[mask, i * n_splines + k] = ( - f_max[k] + (X[mask, i] - xmax) * fp_max[k] - ) - if self.include_bias: - return XBS - else: - # We throw away one spline basis per feature. - # We chose the last one. - indices = [ - j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0 - ] - return XBS[:, indices] +# TODO: +# - sparse support (either scipy or own cython solution)? +# - extrapolation (cyclic) +class SplineTransformer(TransformerMixin, BaseEstimator): + """Generate univariate B-spline bases for features. + Generate a new feature matrix consisting of + `n_splines=n_knots + degree - 1` spline basis functions (B-splines) of + polynomial order=`degree` for each feature. -class PolynomialFeatures(TransformerMixin, BaseEstimator): - """Generate polynomial and interaction features. + Read more in the :ref:`User Guide `. - Generate a new feature matrix consisting of all polynomial combinations - of the features with degree less than or equal to the specified degree. - For example, if an input sample is two dimensional and of the form - [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2]. + .. versionadded:: 1.0 + + Parameters + ---------- + n_knots : int, default=5 + Number of knots of the splines if `knots` equals one of + {'uniform', 'quantile'}. Must be larger or equal 2. + + degree : int, default=3 + The polynomial degree of the spline basis. Must be a non-negative + integer. + + knots : {'uniform', 'quantile'} or array-like of shape \ + (n_knots, n_features), default='uniform' + Set knot positions such that first knot <= features <= last knot. - Parameters - ---------- - degree : int, default=2 - The degree of the polynomial features. + - If 'uniform', `n_knots` number of knots are distributed uniformly + from min to max values of the features. + - If 'quantile', they are distributed uniformly along the quantiles of + the features. + - If an array-like is given, it directly specifies the sorted knot + positions including the boundary knots. Note that, internally, + `degree` number of knots are added before the first knot, the same + after the last knot. - interaction_only : bool, default=False - If true, only interaction features are produced: features that are - products of at most ``degree`` *distinct* input features (so not - ``x[1] ** 2``, ``x[0] * x[2] ** 3``, etc.). + extrapolation : {'error', 'constant', 'linear', 'continue'}, \ + default='constant' + If 'error', values outside the min and max values of the training + features raises a `ValueError`. If 'constant', the value of the + splines at minimum and maximum value of the features is used as + constant extrapolation. If 'linear', a linear extrapolation is used. + If 'continue', the splines are extrapolated as is, i.e. option + `extrapolate=True` in :class:`scipy.interpolate.BSpline`. include_bias : bool, default=True - If True (default), then include a bias column, the feature in which - all polynomial powers are zero (i.e. a column of ones - acts as an - intercept term in a linear model). + If True (default), then the last spline element inside the data range + of a feature is dropped. As B-splines sum to one over the spline basis + functions for each data point, they implicitly include a bias term, + i.e. a column of ones. It acts as an intercept term in a linear models. order : {'C', 'F'}, default='C' - Order of output array in the dense case. 'F' order is faster to - compute, but may slow down subsequent estimators. - - .. versionadded:: 0.21 - - Examples - -------- - >>> import numpy as np - >>> from sklearn.preprocessing import PolynomialFeatures - >>> X = np.arange(6).reshape(3, 2) - >>> X - array([[0, 1], - [2, 3], - [4, 5]]) - >>> poly = PolynomialFeatures(2) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0., 0., 1.], - [ 1., 2., 3., 4., 6., 9.], - [ 1., 4., 5., 16., 20., 25.]]) - >>> poly = PolynomialFeatures(interaction_only=True) - >>> poly.fit_transform(X) - array([[ 1., 0., 1., 0.], - [ 1., 2., 3., 6.], - [ 1., 4., 5., 20.]]) + Order of output array. 'F' order is faster to compute, but may slow + down subsequent estimators. Attributes ---------- - powers_ : ndarray of shape (n_output_features, n_input_features) - powers_[i, j] is the exponent of the jth input in the ith output. + bsplines_ : list of shape (n_features,) + List of BSplines objects, one for each feature. - n_input_features_ : int + n_features_in_ : int The total number of input features. - n_output_features_ : int - The total number of polynomial output features. The number of output - features is computed by iterating over all suitably sized combinations - of input features. + n_features_out_ : int + The total number of output features, which is computed as + `n_features * n_splines`, where `n_splines` is + the number of bases elements of the B-splines, `n_knots + degree - 1`. + If `include_bias=False`, then it is only + `n_features * (n_splines - 1)`. See Also -------- - SplineTransformer : Transformer that generates univariate B-spline bases - for features + KBinsDiscretizer : Transformer that bins continuous data into intervals. + + PolynomialFeatures : Transformer that generates polynomial and interaction + features. Notes ----- - Be aware that the number of features in the output array scales - polynomially in the number of features of the input array, and - exponentially in the degree. High degrees can cause overfitting. + High degrees and a high number of knots can cause overfitting. See :ref:`examples/linear_model/plot_polynomial_interpolation.py - ` + `. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.preprocessing import SplineTransformer + >>> X = np.arange(6).reshape(6, 1) + >>> spline = SplineTransformer(degree=2, n_knots=3) + >>> spline.fit_transform(X) + array([[0.5 , 0.5 , 0. , 0. ], + [0.18, 0.74, 0.08, 0. ], + [0.02, 0.66, 0.32, 0. ], + [0. , 0.32, 0.66, 0.02], + [0. , 0.08, 0.74, 0.18], + [0. , 0. , 0.5 , 0.5 ]]) """ - @_deprecate_positional_args - def __init__(self, degree=2, *, interaction_only=False, include_bias=True, - order='C'): + + def __init__( + self, + n_knots=5, + degree=3, + *, + knots="uniform", + extrapolation="constant", + include_bias=True, + order="C", + ): + self.n_knots = n_knots self.degree = degree - self.interaction_only = interaction_only + self.knots = knots + self.extrapolation = extrapolation self.include_bias = include_bias self.order = order @staticmethod - def _combinations(n_features, degree, interaction_only, include_bias): - comb = (combinations if interaction_only else combinations_w_r) - start = int(not include_bias) - return chain.from_iterable(comb(range(n_features), i) - for i in range(start, degree + 1)) + def _get_base_knot_positions(X, n_knots=10, knots="uniform"): + """Calculate base knot positions. - @property - def powers_(self): - check_is_fitted(self) + Base knots such that first knot <= feature <= last knot. For the + B-spline construction with scipy.interpolate.BSpline, 2*degree knots + beyond the base interval are added. - combinations = self._combinations(self.n_input_features_, self.degree, - self.interaction_only, - self.include_bias) - return np.vstack([np.bincount(c, minlength=self.n_input_features_) - for c in combinations]) + Returns + ------- + knots : ndarray of shape (n_knots, n_features), dtype=np.float64 + Knot positions (points) of base interval. + """ + if knots == "quantile": + knots = np.percentile( + X, + 100 + * np.linspace(start=0, stop=1, num=n_knots, dtype=np.float64), + axis=0, + ) + else: + # knots == 'uniform': + # Note that the variable `knots` has already been validated and + # `else` is therefore safe. + x_min = np.amin(X, axis=0) + x_max = np.amax(X, axis=0) + knots = linspace( + start=x_min, + stop=x_max, + num=n_knots, + endpoint=True, + dtype=np.float64, + ) + + return knots def get_feature_names(self, input_features=None): - """ - Return feature names for output features + """Return feature names for output features. Parameters ---------- @@ -549,173 +475,247 @@ def get_feature_names(self, input_features=None): ------- output_feature_names : list of str of shape (n_output_features,) """ - powers = self.powers_ + n_splines = self.bsplines_[0].c.shape[0] if input_features is None: - input_features = ['x%d' % i for i in range(powers.shape[1])] + input_features = ["x%d" % i for i in range(self.n_features_in_)] feature_names = [] - for row in powers: - inds = np.where(row)[0] - if len(inds): - name = " ".join("%s^%d" % (input_features[ind], exp) - if exp != 1 else input_features[ind] - for ind, exp in zip(inds, row[inds])) - else: - name = "1" - feature_names.append(name) + for i in range(self.n_features_in_): + for j in range(n_splines - 1 + self.include_bias): + feature_names.append(f"{input_features[i]}_sp_{j}") return feature_names def fit(self, X, y=None): - """ - Compute number of output features. - + """Compute knot positions of splines. Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) The data. - y : None - Ignored. + y : None + Ignored. + + Returns + ------- + self : object + Fitted transformer. + """ + X = self._validate_data( + X, + reset=True, + accept_sparse=False, + ensure_min_samples=2, + ensure_2d=True, + ) + n_samples, n_features = X.shape + + if not ( + isinstance(self.degree, numbers.Integral) and self.degree >= 0 + ): + raise ValueError("degree must be a non-negative integer.") + + if not ( + isinstance(self.n_knots, numbers.Integral) and self.n_knots >= 2 + ): + raise ValueError("n_knots must be a positive integer >= 2.") + + if isinstance(self.knots, str) and self.knots in [ + "uniform", + "quantile", + ]: + base_knots = self._get_base_knot_positions( + X, n_knots=self.n_knots, knots=self.knots + ) + else: + base_knots = check_array(self.knots) + if base_knots.shape[0] < 2: + raise ValueError( + "Number of knots, knots.shape[0], must be >= " "2." + ) + elif base_knots.shape[1] != n_features: + raise ValueError("knots.shape[1] == n_features is violated.") + elif not np.all(np.diff(base_knots, axis=0) > 0): + raise ValueError("knots must be sorted without duplicates.") + + if self.extrapolation not in ( + "error", + "constant", + "linear", + "continue", + ): + raise ValueError( + "extrapolation must be one of 'error', " + "'constant', 'linear' or 'continue'." + ) + + if not isinstance(self.include_bias, (bool, np.bool_)): + raise ValueError("include_bias must be bool.") + + # number of knots for base interval + n_knots = base_knots.shape[0] + # number of splines basis functions + n_splines = n_knots + self.degree - 1 + degree = self.degree + n_out = n_features * n_splines + # We have to add degree number of knots below, and degree number knots + # above the base knots in order to make the spline basis complete. + # Eilers & Marx in "Flexible smoothing with B-splines and penalties" + # https://doi.org/10.1214/ss/1038425655 advice against repeating first + # and last knot several times, which would have inferior behaviour at + # boundaries if combined with a penalty (hence P-Spline). We follow + # this advice even if our splines are unpenalized. + # Meaning we do not: + # knots = np.r_[np.tile(base_knots.min(axis=0), reps=[degree, 1]), + # base_knots, + # np.tile(base_knots.max(axis=0), reps=[degree, 1]) + # ] + # Instead, we reuse the distance of the 2 fist/last knots. + dist_min = base_knots[1] - base_knots[0] + dist_max = base_knots[-1] - base_knots[-2] + knots = np.r_[ + linspace( + base_knots[0] - degree * dist_min, + base_knots[0] - dist_min, + num=degree, + ), + base_knots, + linspace( + base_knots[-1] + dist_max, + base_knots[-1] + degree * dist_max, + num=degree, + ), + ] + + # With a diagonal coefficient matrix, we get back the spline basis + # elements, i.e. the design matrix of the spline. + # Note, BSpline appreciates C-contiguous float64 arrays as c=coef. + coef = np.eye(n_knots + self.degree - 1, dtype=np.float64) + extrapolate = self.extrapolation == "continue" + bsplines = [ + BSpline.construct_fast( + knots[:, i], coef, self.degree, extrapolate=extrapolate + ) + for i in range(n_features) + ] + self.bsplines_ = bsplines - Returns - ------- - self : object - Fitted transformer. - """ - n_samples, n_features = self._validate_data( - X, accept_sparse=True).shape - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - self.n_input_features_ = n_features - self.n_output_features_ = sum(1 for _ in combinations) + self.n_features_out_ = n_out - n_features * (1 - self.include_bias) return self def transform(self, X): - """Transform data to polynomial features. + """Transform each feature data to B-splines. Parameters ---------- - X : {array-like, sparse matrix} of shape (n_samples, n_features) - The data to transform, row by row. - - Prefer CSR over CSC for sparse input (for speed), but CSC is - required if the degree is 4 or higher. If the degree is less than - 4 and the input format is CSC, it will be converted to CSR, have - its polynomial features generated, then converted back to CSC. - - If the degree is 2 or 3, the method described in "Leveraging - Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices - Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is - used, which is much faster than the method used on CSC input. For - this reason, a CSC input will be converted to CSR, and the output - will be converted back to CSC prior to being returned, hence the - preference of CSR. + X : array-like of shape (n_samples, n_features) + The data to transform. Returns ------- - XP : {ndarray, sparse matrix} of shape (n_samples, NP) - The matrix of features, where NP is the number of polynomial - features generated from the combination of inputs. If a sparse - matrix is provided, it will be converted into a sparse - ``csr_matrix``. + XBS : ndarray of shape (n_samples, n_features * n_splines) + The matrix of features, where n_splines is the number of bases + elements of the B-splines, n_knots + degree - 1. """ check_is_fitted(self) - X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False, - accept_sparse=('csr', 'csc')) + X = self._validate_data( + X, reset=False, accept_sparse=False, ensure_2d=True + ) n_samples, n_features = X.shape + n_splines = self.bsplines_[0].c.shape[0] + degree = self.degree - if n_features != self.n_input_features_: - raise ValueError("X shape does not match training shape") - - if sparse.isspmatrix_csr(X): - if self.degree > 3: - return self.transform(X.tocsc()).tocsr() - to_stack = [] - if self.include_bias: - to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype)) - to_stack.append(X) - for deg in range(2, self.degree+1): - Xp_next = _csr_polynomial_expansion(X.data, X.indices, - X.indptr, X.shape[1], - self.interaction_only, - deg) - if Xp_next is None: - break - to_stack.append(Xp_next) - XP = sparse.hstack(to_stack, format='csr') - elif sparse.isspmatrix_csc(X) and self.degree < 4: - return self.transform(X.tocsr()).tocsc() + # Note that scipy BSpline returns float64 arrays and converts input + # x=X[:, i] to c-contiguous float64. + n_out = self.n_features_out_ + n_features * (1 - self.include_bias) + if X.dtype in FLOAT_DTYPES: + dtype = X.dtype else: - if sparse.isspmatrix(X): - combinations = self._combinations(n_features, self.degree, - self.interaction_only, - self.include_bias) - columns = [] - for comb in combinations: - if comb: - out_col = 1 - for col_idx in comb: - out_col = X[:, col_idx].multiply(out_col) - columns.append(out_col) - else: - bias = sparse.csc_matrix(np.ones((X.shape[0], 1))) - columns.append(bias) - XP = sparse.hstack(columns, dtype=X.dtype).tocsc() + dtype = np.float64 + XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order) + + for i in range(n_features): + spl = self.bsplines_[i] + + if self.extrapolation in ("continue", "error"): + XBS[:, (i * n_splines):((i + 1) * n_splines)] = spl(X[:, i]) else: - XP = np.empty((n_samples, self.n_output_features_), - dtype=X.dtype, order=self.order) + xmin = spl.t[degree] + xmax = spl.t[-degree - 1] + mask = (xmin <= X[:, i]) & (X[:, i] <= xmax) + XBS[mask, (i * n_splines):((i + 1) * n_splines)] = spl( + X[mask, i] + ) - # What follows is a faster implementation of: - # for i, comb in enumerate(combinations): - # XP[:, i] = X[:, comb].prod(1) - # This implementation uses two optimisations. - # First one is broadcasting, - # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1] - # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2] - # ... - # multiply ([X[:, start:end], X[:, start]) -> ... - # Second optimisation happens for degrees >= 3. - # Xi^3 is computed reusing previous computation: - # Xi^3 = Xi^2 * Xi. + # Note for extrapolation: + # 'continue' is already returned as is by scipy BSplines + if self.extrapolation == "error": + # BSpline with extrapolate=False does not raise an error, but + # output np.nan. + if np.any( + np.isnan(XBS[:, (i * n_splines):((i + 1) * n_splines)]) + ): + raise ValueError( + "X contains values beyond the limits of the knots." + ) + elif self.extrapolation == "constant": + # Set all values beyond xmin and xmax to the value of the + # spline basis functions at those two positions. + # Only the first degree and last degree number of splines + # have non-zero values at the boundaries. - if self.include_bias: - XP[:, 0] = 1 - current_col = 1 - else: - current_col = 0 + # spline values at boundaries + f_min = spl(xmin) + f_max = spl(xmax) + mask = X[:, i] < xmin + if np.any(mask): + XBS[ + mask, (i * n_splines):(i * n_splines + degree) + ] = f_min[:degree] - # d = 0 - XP[:, current_col:current_col + n_features] = X - index = list(range(current_col, - current_col + n_features)) - current_col += n_features - index.append(current_col) + mask = X[:, i] > xmax + if np.any(mask): + XBS[ + mask, + ((i + 1) * n_splines - degree):((i + 1) * n_splines), + ] = f_max[-degree:] + elif self.extrapolation == "linear": + # Continue the degree first and degree last spline bases + # linearly beyond the boundaries, with slope = derivative at + # the boundary. + # Note that all others have derivative = value = 0 at the + # boundaries. - # d >= 1 - for _ in range(1, self.degree): - new_index = [] - end = index[-1] - for feature_idx in range(n_features): - start = index[feature_idx] - new_index.append(current_col) - if self.interaction_only: - start += (index[feature_idx + 1] - - index[feature_idx]) - next_col = current_col + end - start - if next_col <= current_col: - break - # XP[:, start:end] are terms of degree d - 1 - # that exclude feature #feature_idx. - np.multiply(XP[:, start:end], - X[:, feature_idx:feature_idx + 1], - out=XP[:, current_col:next_col], - casting='no') - current_col = next_col + # spline values at boundaries + f_min, f_max = spl(xmin), spl(xmax) + # spline derivatives = slopes at boundaries + fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1) + # Compute the linear continuation. + if degree <= 1: + # For degree=1, the derivative of 2nd spline is not zero at + # boundary. For degree=0 it is the same as 'constant'. + degree += 1 + for j in range(degree): + mask = X[:, i] < xmin + if np.any(mask): + XBS[mask, i * n_splines + j] = ( + f_min[j] + (X[mask, i] - xmin) * fp_min[j] + ) - new_index.append(current_col) - index = new_index + mask = X[:, i] > xmax + if np.any(mask): + k = n_splines - 1 - j + XBS[mask, i * n_splines + k] = ( + f_max[k] + (X[mask, i] - xmax) * fp_max[k] + ) - return XP + if self.include_bias: + return XBS + else: + # We throw away one spline basis per feature. + # We chose the last one. + indices = [ + j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0 + ] + return XBS[:, indices] \ No newline at end of file diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 53d4c74005f07..af45542bcbad8 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -8,11 +8,11 @@ from sklearn.linear_model import LinearRegression from sklearn.pipeline import Pipeline -from sklearn.preprocessing import KBinsDiscretizer, SplineTransformer -from sklearn.preprocessing import PolynomialFeatures +from sklearn.preprocessing import ( + KBinsDiscretizer, PolynomialFeatures, SplineTransformer +) -# TODO: add PolynomialFeatures if it moves to _polynomial.py @pytest.mark.parametrize("est", (SplineTransformer,)) def test_polynomial_and_spline_array_order(est): """Test that output array has the given order.""" From a5b0c7fcf351afdeb49a0ad06bc35ba5970acbf2 Mon Sep 17 00:00:00 2001 From: avigupta2612 Date: Sat, 13 Mar 2021 02:43:55 +0530 Subject: [PATCH 4/6] order changes --- sklearn/preprocessing/_polynomial.py | 2 +- sklearn/preprocessing/tests/test_polynomial.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/_polynomial.py b/sklearn/preprocessing/_polynomial.py index 1cf2135903a62..5e1a0fad4abe0 100644 --- a/sklearn/preprocessing/_polynomial.py +++ b/sklearn/preprocessing/_polynomial.py @@ -718,4 +718,4 @@ def transform(self, X): indices = [ j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0 ] - return XBS[:, indices] \ No newline at end of file + return XBS[:, indices] diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index af45542bcbad8..32edc664b36d5 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -10,7 +10,7 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import ( KBinsDiscretizer, PolynomialFeatures, SplineTransformer -) +) @pytest.mark.parametrize("est", (SplineTransformer,)) From 9fc135d7191970800dbe251edabad26b25ed48c0 Mon Sep 17 00:00:00 2001 From: avigupta2612 Date: Tue, 16 Mar 2021 18:35:32 +0530 Subject: [PATCH 5/6] removed test_polynomial_test_polynomial_feature_array_order --- sklearn/preprocessing/tests/test_polynomial.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 32edc664b36d5..31d97c021bc72 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -13,7 +13,7 @@ ) -@pytest.mark.parametrize("est", (SplineTransformer,)) +@pytest.mark.parametrize("est", (PolynomialFeatures, SplineTransformer)) def test_polynomial_and_spline_array_order(est): """Test that output array has the given order.""" X = np.arange(10).reshape(5, 2) @@ -321,18 +321,6 @@ def test_polynomial_feature_names(): feature_names) -def test_polynomial_feature_array_order(): - """Test that output array has the given order.""" - X = np.arange(10).reshape(5, 2) - - def is_c_contiguous(a): - return np.isfortran(a.T) - - assert is_c_contiguous(PolynomialFeatures().fit_transform(X)) - assert is_c_contiguous(PolynomialFeatures(order='C').fit_transform(X)) - assert np.isfortran(PolynomialFeatures(order='F').fit_transform(X)) - - @pytest.mark.parametrize(['deg', 'include_bias', 'interaction_only', 'dtype'], [(1, True, False, int), (2, True, False, int), From bd544d2d32180607f384f74f22ec9434222f8921 Mon Sep 17 00:00:00 2001 From: Roman Yurchak Date: Thu, 18 Mar 2021 14:11:15 +0100 Subject: [PATCH 6/6] Fix imports following merge conflict --- sklearn/preprocessing/tests/test_polynomial.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/preprocessing/tests/test_polynomial.py b/sklearn/preprocessing/tests/test_polynomial.py index 4545589f81b2c..5068a8c7d8bdd 100644 --- a/sklearn/preprocessing/tests/test_polynomial.py +++ b/sklearn/preprocessing/tests/test_polynomial.py @@ -1,5 +1,8 @@ import numpy as np import pytest +from scipy import sparse +from scipy.sparse import random as sparse_random +from sklearn.utils._testing import assert_array_almost_equal from numpy.testing import assert_allclose, assert_array_equal from scipy.interpolate import BSpline