Skip to content

Commit 90e12b8

Browse files
committed
refactor code and factors out the int64 casting
1 parent fe39a7b commit 90e12b8

File tree

2 files changed

+35
-30
lines changed

2 files changed

+35
-30
lines changed

sklearn/preprocessing/_csr_polynomial_expansion.pyx

+4-26
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ from scipy.sparse import csr_matrix
88
from numpy cimport ndarray
99
import numpy as np
1010
cimport numpy as np
11-
cdef int MAX_INT32 = 2147483647
12-
# TODO: use finfo instead
1311

1412
np.import_array()
1513
ctypedef fused INDEX_T:
@@ -56,7 +54,9 @@ cdef inline INDEX_T _deg3_column(INDEX_T d, INDEX_T i, INDEX_T j, INDEX_T k,
5654
def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
5755
ndarray[INDEX_T, ndim=1] indices,
5856
ndarray[INDEX_T, ndim=1] indptr,
59-
INDEX_T d, INDEX_T interaction_only,
57+
INDEX_T d,
58+
INDEX_T expanded_dimensionality,
59+
INDEX_T interaction_only,
6060
INDEX_T degree):
6161
"""
6262
Perform a second-degree polynomial or interaction expansion on a scipy
@@ -91,13 +91,6 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
9191
Matrices Using K-Simplex Numbers" by Andrew Nystrom and John Hughes.
9292
"""
9393

94-
assert degree in (2, 3)
95-
96-
if degree == 2:
97-
expanded_dimensionality = int((d**2 + d) / 2 - interaction_only*d)
98-
else:
99-
expanded_dimensionality = int((d**3 + 3*d**2 + 2*d) / 6
100-
- interaction_only*d**2)
10194
if expanded_dimensionality == 0:
10295
return None
10396
assert expanded_dimensionality > 0
@@ -106,12 +99,7 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
10699

107100
# Count how many nonzero elements the expanded matrix will contain.
108101
for row_i in range(indptr.shape[0]-1):
109-
# nnz is the number of nonzero elements in this row.
110-
# The number of nonzero elements can explode
111-
# in the expanded space, so we cast to int64
112-
# before the expansion computation to
113-
# avoid overflow:
114-
nnz = np.int64(indptr[row_i + 1] - indptr[row_i])
102+
nnz = indptr[row_i + 1] - indptr[row_i]
115103
# TODO: check that the casting is indeed done
116104
if degree == 2:
117105
total_nnz += (nnz ** 2 + nnz) / 2 - interaction_only * nnz
@@ -128,16 +116,6 @@ def _csr_polynomial_expansion(ndarray[DATA_T, ndim=1] data,
128116
cdef ndarray[INDEX_T, ndim=1] expanded_indptr = ndarray(
129117
shape=num_rows + 1, dtype=indptr.dtype)
130118

131-
# if the expanded_dimensionality is too big, we need to cast
132-
# indices to int64. If total_nnz is too big, we need to cast
133-
# expanded_indptr to int64 too.
134-
# TODO: check that the casting is indeed done
135-
if expanded_dimensionality > MAX_INT32:
136-
indices = np.array(indices, dtype=np.int64)
137-
expanded_indices = np.array(expanded_indices, dtype=np.int64)
138-
if total_nnz > MAX_INT32:
139-
expanded_indptr = np.array(expanded_indptr, dtype=np.int64)
140-
141119
cdef INDEX_T expanded_index = 0, row_starts, row_ends, i, j, k, \
142120
i_ptr, j_ptr, k_ptr, num_cols_in_row, col
143121

sklearn/preprocessing/_polynomial.py

+31-4
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,32 @@
2323
]
2424

2525

26+
def _cast_to_int64_if_needed(X, degree, interaction_only):
27+
"""Computes the expanded dimensionality and casts X to int64 if the
28+
expanded dim is too big"""
29+
d = int(X.shape[1])
30+
assert degree in (2, 3)
31+
if degree == 2:
32+
expanded_dimensionality = (d ** 2 + d) / 2 - interaction_only * d
33+
else:
34+
expanded_dimensionality = ((d ** 3 + 3 * d ** 2 + 2 * d) / 6 -
35+
interaction_only * d ** 2)
36+
if expanded_dimensionality > np.iinfo(np.int64).max:
37+
raise ValueError("The expanded dimensionality will be too big to "
38+
"be contained in an int64.")
39+
if expanded_dimensionality > np.iinfo(np.int32).max:
40+
# if the expansion needs int64 ints, we cast every index value in X
41+
# to int64
42+
X = X.copy()
43+
X.data = X.data.astype(np.int64)
44+
X.indices = X.indices.astype(np.int64)
45+
X.indptr = X.indptr.astype(np.int64)
46+
return X, np.int64(d), np.int64(expanded_dimensionality)
47+
else:
48+
# otherwise we keep X as is
49+
return X, np.int32(d), np.int32(expanded_dimensionality)
50+
51+
2652
class PolynomialFeatures(TransformerMixin, BaseEstimator):
2753
"""Generate polynomial and interaction features.
2854
@@ -249,12 +275,13 @@ def transform(self, X):
249275
to_stack.append(np.ones(shape=(n_samples, 1), dtype=X.dtype))
250276
to_stack.append(X)
251277
for deg in range(2, self.degree+1):
252-
# Dimensionality of the expanded space can become very
253-
# large so we cast X.shape[1] to int64 to avoid overflow
254-
# when computing expanded_dimensionality
278+
(X, d, expanded_dim
279+
) = _cast_to_int64_if_needed(X, deg,
280+
self.interaction_only)
255281
Xp_next = _csr_polynomial_expansion(X.data, X.indices,
256282
X.indptr,
257-
np.int64(X.shape[1]),
283+
d,
284+
expanded_dim,
258285
self.interaction_only,
259286
deg)
260287
if Xp_next is None:

0 commit comments

Comments
 (0)