Skip to content

Commit d0b7985

Browse files
committed
Partially add support for ignoring NaN values in sparse matrices in StandardScaler
1 parent 79053af commit d0b7985

File tree

5 files changed

+241
-28
lines changed

5 files changed

+241
-28
lines changed

sklearn/preprocessing/data.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
from ..utils.extmath import _incremental_mean_and_var
2727
from ..utils.fixes import _argmax
2828
from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
29-
inplace_csr_row_normalize_l2)
29+
inplace_csr_row_normalize_l2,
30+
n_samples_count_csc, n_samples_count_csr)
3031
from ..utils.sparsefuncs import (inplace_column_scale,
3132
mean_variance_axis, incr_mean_variance_axis,
3233
min_max_axis)
@@ -619,7 +620,8 @@ def partial_fit(self, X, y=None):
619620
Ignored
620621
"""
621622
X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
622-
warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES)
623+
warn_on_dtype=True, estimator=self,
624+
force_all_finite='allow-nan', dtype=FLOAT_DTYPES)
623625

624626
# Even in the case of `with_mean=False`, we update the mean anyway
625627
# This is needed for the incremental computation of the var
@@ -634,14 +636,19 @@ def partial_fit(self, X, y=None):
634636
# First pass
635637
if not hasattr(self, 'n_samples_seen_'):
636638
self.mean_, self.var_ = mean_variance_axis(X, axis=0)
637-
self.n_samples_seen_ = X.shape[0]
639+
if isinstance (X, sparse.csc_matrix):
640+
self.n_samples_seen_ = n_samples_count_csc(X.data, X.shape, X.indices, X.indptr)
641+
else:
642+
self.n_samples_seen_ = n_samples_count_csr(X.data, X.shape, X.indices)
643+
638644
# Next passes
639645
else:
640646
self.mean_, self.var_, self.n_samples_seen_ = \
641647
incr_mean_variance_axis(X, axis=0,
642648
last_mean=self.mean_,
643649
last_var=self.var_,
644-
last_n=self.n_samples_seen_)
650+
last_n=0,
651+
last_n_feat=self.n_samples_seen_)
645652
else:
646653
self.mean_ = None
647654
self.var_ = None
@@ -688,7 +695,8 @@ def transform(self, X, y='deprecated', copy=None):
688695

689696
copy = copy if copy is not None else self.copy
690697
X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=True,
691-
estimator=self, dtype=FLOAT_DTYPES)
698+
estimator=self, dtype=FLOAT_DTYPES,
699+
force_all_finite='allow-nan')
692700

693701
if sparse.issparse(X):
694702
if self.with_mean:

sklearn/utils/estimator_checks.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression',
7171
'RANSACRegressor', 'RadiusNeighborsRegressor',
7272
'RandomForestRegressor', 'Ridge', 'RidgeCV']
73+
ALLOW_NAN = ['StandardScaler']
7374

7475

7576
def _yield_non_meta_checks(name, estimator):
@@ -1024,6 +1025,8 @@ def check_estimators_nan_inf(name, estimator_orig):
10241025
error_string_transform = ("Estimator doesn't check for NaN and inf in"
10251026
" transform.")
10261027
for X_train in [X_train_nan, X_train_inf]:
1028+
if np.any(np.isnan(X_train)) and name in ALLOW_NAN:
1029+
continue
10271030
# catch deprecation warnings
10281031
with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
10291032
estimator = clone(estimator_orig)

sklearn/utils/sparsefuncs.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,8 @@ def mean_variance_axis(X, axis):
9999
_raise_typeerror(X)
100100

101101

102-
def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n):
102+
def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n,
103+
last_n_feat=np.array([0], dtype=np.uint32)):
103104
"""Compute incremental mean and variance along an axis on a CSR or
104105
CSC matrix.
105106
@@ -143,17 +144,21 @@ def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n):
143144
if isinstance(X, sp.csr_matrix):
144145
if axis == 0:
145146
return _incr_mean_var_axis0(X, last_mean=last_mean,
146-
last_var=last_var, last_n=last_n)
147+
last_var=last_var, last_n=last_n,
148+
last_n_feat=last_n_feat)
147149
else:
148150
return _incr_mean_var_axis0(X.T, last_mean=last_mean,
149-
last_var=last_var, last_n=last_n)
151+
last_var=last_var, last_n=last_n,
152+
last_n_feat=last_n_feat)
150153
elif isinstance(X, sp.csc_matrix):
151154
if axis == 0:
152155
return _incr_mean_var_axis0(X, last_mean=last_mean,
153-
last_var=last_var, last_n=last_n)
156+
last_var=last_var, last_n=last_n,
157+
last_n_feat=last_n_feat)
154158
else:
155159
return _incr_mean_var_axis0(X.T, last_mean=last_mean,
156-
last_var=last_var, last_n=last_n)
160+
last_var=last_var, last_n=last_n,
161+
last_n_feat=last_n_feat)
157162
else:
158163
_raise_typeerror(X)
159164

sklearn/utils/sparsefuncs_fast.pyx

Lines changed: 132 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#!python
1010
#cython: boundscheck=False, wraparound=False, cdivision=True
1111

12-
from libc.math cimport fabs, sqrt, pow
12+
from libc.math cimport fabs, sqrt, pow, isnan
1313
cimport numpy as np
1414
import numpy as np
1515
import scipy.sparse as sp
@@ -79,7 +79,8 @@ def csr_mean_variance_axis0(X):
7979

8080
def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
8181
shape,
82-
np.ndarray[int, ndim=1] X_indices):
82+
np.ndarray[int, ndim=1] X_indices,
83+
ignore_nan=True):
8384
# Implement the function here since variables using fused types
8485
# cannot be declared directly and can only be passed as function arguments
8586
cdef unsigned int n_samples = shape[0]
@@ -94,6 +95,8 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
9495
cdef np.ndarray[floating, ndim=1] means
9596
# variances[j] contains the variance of feature j
9697
cdef np.ndarray[floating, ndim=1] variances
98+
# n_samples_feat[j] contains the number of non-NaN values of feature j
99+
cdef np.ndarray[floating, ndim=1] n_samples_feat
97100

98101
if floating is float:
99102
dtype = np.float32
@@ -102,26 +105,37 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
102105

103106
means = np.zeros(n_features, dtype=dtype)
104107
variances = np.zeros_like(means, dtype=dtype)
108+
n_samples_feat = np.ones_like(means, dtype=dtype) * n_samples
105109

106110
# counts[j] contains the number of samples where feature j is non-zero
107111
cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features,
108112
dtype=np.int32)
109113

110114
for i in xrange(non_zero):
111115
col_ind = X_indices[i]
112-
means[col_ind] += X_data[i]
116+
x_i = X_data[i]
117+
if isnan(x_i) and ignore_nan:
118+
n_samples_feat[col_ind] -= 1
119+
continue
120+
means[col_ind] += x_i
113121

114-
means /= n_samples
122+
for i in xrange(n_features):
123+
# Avoid division by zero when all elements of the column are NaN
124+
if n_samples_feat[i]:
125+
means[i] /= n_samples_feat[i]
115126

116127
for i in xrange(non_zero):
117128
col_ind = X_indices[i]
118-
diff = X_data[i] - means[col_ind]
129+
x_i = X_data[i]
130+
if isnan(x_i) and ignore_nan:
131+
continue
132+
diff = x_i - means[col_ind]
119133
variances[col_ind] += diff * diff
120134
counts[col_ind] += 1
121135

122136
for i in xrange(n_features):
123-
variances[i] += (n_samples - counts[i]) * means[i] ** 2
124-
variances[i] /= n_samples
137+
variances[i] += (n_samples_feat[i] - counts[i]) * means[i] ** 2
138+
variances[i] /= n_samples_feat[i]
125139

126140
return means, variances
127141

@@ -152,7 +166,8 @@ def csc_mean_variance_axis0(X):
152166
def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
153167
shape,
154168
np.ndarray[int, ndim=1] X_indices,
155-
np.ndarray[int, ndim=1] X_indptr):
169+
np.ndarray[int, ndim=1] X_indptr,
170+
ignore_nan=True):
156171
# Implement the function here since variables using fused types
157172
# cannot be declared directly and can only be passed as function arguments
158173
cdef unsigned int n_samples = shape[0]
@@ -163,6 +178,7 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
163178
cdef unsigned int counts
164179
cdef unsigned int startptr
165180
cdef unsigned int endptr
181+
cdef unsigned int n_samples_feat
166182
cdef floating diff
167183

168184
# means[j] contains the mean of feature j
@@ -182,22 +198,80 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
182198
startptr = X_indptr[i]
183199
endptr = X_indptr[i + 1]
184200
counts = endptr - startptr
201+
n_samples_feat = n_samples
185202

186203
for j in xrange(startptr, endptr):
187-
means[i] += X_data[j]
188-
means[i] /= n_samples
204+
x_i = X_data[j]
205+
if isnan(x_i) and ignore_nan:
206+
n_samples_feat -= 1
207+
continue
208+
means[i] += x_i
209+
# Avoid division by zero when all values of feature i are NaN
210+
if n_samples_feat:
211+
means[i] /= n_samples_feat
189212

190213
for j in xrange(startptr, endptr):
191-
diff = X_data[j] - means[i]
214+
x_i = X_data[j]
215+
if isnan(x_i) and ignore_nan:
216+
continue
217+
diff = x_i - means[i]
192218
variances[i] += diff * diff
193219

194-
variances[i] += (n_samples - counts) * means[i] * means[i]
195-
variances[i] /= n_samples
220+
variances[i] += (n_samples_feat - counts) * means[i] * means[i]
221+
variances[i] /= n_samples_feat
196222

197223
return means, variances
198224

225+
def n_samples_count_csc(np.ndarray[floating, ndim=1] X_data,
226+
shape,
227+
np.ndarray[int, ndim=1] X_indices,
228+
np.ndarray[int, ndim=1] X_indptr):
229+
cdef unsigned int n_samples = shape[0]
230+
cdef unsigned int n_features = shape[1]
231+
cdef unsigned int startptr
232+
cdef unsigned int endptr
233+
cdef unsigned int i
234+
cdef unsigned int j
235+
236+
cdef np.ndarray[unsigned int, ndim=1] n_samples_feat
237+
238+
n_samples_feat = np.ones(n_features, dtype=np.uint32) * n_samples
239+
240+
for i in xrange(n_features):
241+
startptr = X_indptr[i]
242+
endptr = X_indptr[i+1]
243+
244+
for j in xrange(startptr, endptr):
245+
if isnan(X_data[j]):
246+
n_samples_feat[i] -= 1
247+
248+
return n_samples_feat
199249

200-
def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n):
250+
def n_samples_count_csr(np.ndarray[floating, ndim=1, mode="c"] X_data,
251+
shape,
252+
np.ndarray[int, ndim=1] X_indices):
253+
cdef unsigned int n_samples = shape[0]
254+
cdef unsigned int n_features = shape[1]
255+
256+
cdef unsigned int i
257+
cdef unsigned int non_zero = X_indices.shape[0]
258+
cdef unsigned int col_ind
259+
260+
cdef np.ndarray[unsigned int, ndim=1] n_samples_feat
261+
262+
n_samples_feat = np.ones(n_features, dtype=np.uint32) * n_samples
263+
264+
for i in xrange(non_zero):
265+
col_ind = X_indices[i]
266+
x_i = X_data[i]
267+
if isnan(x_i):
268+
n_samples_feat[col_ind] -= 1
269+
270+
return n_samples_feat
271+
272+
273+
def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n,
274+
last_n_feat=np.array([0], dtype=np.uint32)):
201275
"""Compute mean and variance along axis 0 on a CSR or CSC matrix.
202276
203277
last_mean, last_var are the statistics computed at the last step by this
@@ -244,7 +318,8 @@ def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n):
244318
if X.dtype != np.float32:
245319
X = X.astype(np.float64)
246320
return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr,
247-
X.format, last_mean, last_var, last_n)
321+
X.format, last_mean, last_var, last_n,
322+
last_n_feat)
248323

249324

250325
def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
@@ -254,7 +329,8 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
254329
X_format,
255330
last_mean,
256331
last_var,
257-
unsigned long last_n):
332+
unsigned long last_n,
333+
np.ndarray[unsigned int, ndim=1] last_n_feat):
258334
# Implement the function here since variables using fused types
259335
# cannot be declared directly and can only be passed as function arguments
260336
cdef unsigned long n_samples = shape[0]
@@ -280,24 +356,63 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
280356
updated_var = np.zeros_like(new_mean, dtype=dtype)
281357

282358
cdef unsigned long new_n
359+
cdef np.ndarray[unsigned int, ndim=1] new_n_feat
283360
cdef unsigned long updated_n
361+
cdef np.ndarray[unsigned int, ndim=1] updated_n_feat
284362
cdef floating last_over_new_n
285363

286364
# Obtain new stats first
287365
new_n = n_samples
288366

289367
if X_format == 'csr':
290368
# X is a CSR matrix
369+
new_n_feat = n_samples_count_csr(X_data, shape, X_indices)
291370
new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices)
292371
else:
293372
# X is a CSC matrix
373+
new_n_feat = n_samples_count_csc(X_data, shape, X_indices, X_indptr)
294374
new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices,
295375
X_indptr)
376+
new_n = new_n_feat[0]
296377

297378
# First pass
298-
if last_n == 0:
379+
if last_n == 0 and (last_n_feat==0).all():
299380
return new_mean, new_var, new_n
300381
# Next passes
382+
383+
# Case where the sample count differs per feature, so updated_n_feat is a vector
384+
elif last_n==0 and (last_n_feat!=0).any():
385+
updated_n_feat = last_n_feat + new_n_feat
386+
387+
for i in xrange(n_features):
388+
if updated_n_feat[i] == 0:
389+
continue
390+
if new_n_feat[i] == 0:
391+
updated_mean[i] = last_mean[i]
392+
updated_var[i] = last_var[i]
393+
continue
394+
last_over_new_n = last_n_feat[i] * 1.0 / new_n_feat[i]
395+
# Unnormalized old stats
396+
last_mean[i] *= last_n_feat[i]
397+
last_var[i] *= last_n_feat[i]
398+
399+
# Unnormalized new stats
400+
new_mean[i] *= new_n_feat[i]
401+
new_var[i] *= new_n_feat[i]
402+
403+
# Update stats
404+
updated_var[i] = (last_var[i] + new_var[i] +
405+
last_over_new_n / updated_n_feat[i] *
406+
(last_mean[i] / last_over_new_n -
407+
new_mean[i]) ** 2)
408+
409+
updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n_feat[i]
410+
updated_var[i] = updated_var[i] / updated_n_feat[i]
411+
412+
return updated_mean, updated_var, updated_n_feat
413+
414+
415+
# Case where updated_n is a scalar
301416
else:
302417
updated_n = last_n + new_n
303418
last_over_new_n = last_n / new_n

0 commit comments

Comments
 (0)