 #!python
 # cython: boundscheck=False, wraparound=False, cdivision=True

-from libc.math cimport fabs, sqrt, pow
+from libc.math cimport fabs, sqrt, pow, isnan
 cimport numpy as np
 import numpy as np
 import scipy.sparse as sp
@@ -79,7 +79,8 @@ def csr_mean_variance_axis0(X):

 def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
                              shape,
-                             np.ndarray[int, ndim=1] X_indices):
+                             np.ndarray[int, ndim=1] X_indices,
+                             ignore_nan=True):
     # Implement the function here since variables using fused types
     # cannot be declared directly and can only be passed as function arguments
     cdef unsigned int n_samples = shape[0]
@@ -94,6 +95,8 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,
     cdef np.ndarray[floating, ndim=1] means
     # variances[j] contains the variance of feature j
     cdef np.ndarray[floating, ndim=1] variances
+    # n_samples_feat[j] contains the number of non-NaN values of feature j
+    cdef np.ndarray[floating, ndim=1] n_samples_feat

     if floating is float:
         dtype = np.float32
@@ -102,26 +105,37 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data,

     means = np.zeros(n_features, dtype=dtype)
     variances = np.zeros_like(means, dtype=dtype)
+    n_samples_feat = np.ones_like(means, dtype=dtype) * n_samples

     # counts[j] contains the number of samples where feature j is non-zero
     cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features,
                                                    dtype=np.int32)

     for i in xrange(non_zero):
         col_ind = X_indices[i]
-        means[col_ind] += X_data[i]
+        x_i = X_data[i]
+        if isnan(x_i) and ignore_nan:
+            n_samples_feat[col_ind] -= 1
+            continue
+        means[col_ind] += x_i

-    means /= n_samples
+    for i in xrange(n_features):
+        # Avoid division by zero when all elements of the column are NaN
+        if n_samples_feat[i]:
+            means[i] /= n_samples_feat[i]

     for i in xrange(non_zero):
         col_ind = X_indices[i]
-        diff = X_data[i] - means[col_ind]
+        x_i = X_data[i]
+        if isnan(x_i) and ignore_nan:
+            continue
+        diff = x_i - means[col_ind]
         variances[col_ind] += diff * diff
         counts[col_ind] += 1

     for i in xrange(n_features):
-        variances[i] += (n_samples - counts[i]) * means[i] ** 2
-        variances[i] /= n_samples
+        variances[i] += (n_samples_feat[i] - counts[i]) * means[i] ** 2
+        variances[i] /= n_samples_feat[i]

     return means, variances

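As a sanity check on the hunk above: with `ignore_nan=True`, the per-feature mean and variance should agree with a dense NumPy reference that keeps the matrix's implicit zeros as ordinary observations and drops only explicit NaNs from the sums and counts. A minimal sketch of that reference, assuming this reading of the patch; the helper name `dense_nan_mean_var` and the toy matrix are illustrative, not part of the diff:

```python
import numpy as np
import scipy.sparse as sp

def dense_nan_mean_var(X_dense):
    # Per-feature count of non-NaN cells; implicit zeros of the sparse
    # matrix are regular observations, so only explicit NaNs are excluded.
    n = (~np.isnan(X_dense)).sum(axis=0)
    mean = np.nansum(X_dense, axis=0) / n
    var = np.nansum((X_dense - mean) ** 2, axis=0) / n
    return mean, var

X = sp.csr_matrix(np.array([[0.0, 1.0, np.nan],
                            [2.0, 0.0, 3.0],
                            [0.0, np.nan, 6.0]]))
# What _csr_mean_variance_axis0(X.data, X.shape, X.indices) is expected
# to return under this reading of the patch.
print(dense_nan_mean_var(X.toarray()))
```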
@@ -152,7 +166,8 @@ def csc_mean_variance_axis0(X):
 def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
                              shape,
                              np.ndarray[int, ndim=1] X_indices,
-                             np.ndarray[int, ndim=1] X_indptr):
+                             np.ndarray[int, ndim=1] X_indptr,
+                             ignore_nan=True):
     # Implement the function here since variables using fused types
     # cannot be declared directly and can only be passed as function arguments
     cdef unsigned int n_samples = shape[0]
@@ -163,6 +178,7 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
     cdef unsigned int counts
     cdef unsigned int startptr
     cdef unsigned int endptr
+    cdef unsigned int n_samples_feat
     cdef floating diff

     # means[j] contains the mean of feature j
@@ -182,22 +198,80 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
         startptr = X_indptr[i]
         endptr = X_indptr[i + 1]
         counts = endptr - startptr
+        n_samples_feat = n_samples

         for j in xrange(startptr, endptr):
-            means[i] += X_data[j]
-        means[i] /= n_samples
+            x_i = X_data[j]
+            if isnan(x_i) and ignore_nan:
+                n_samples_feat -= 1
+                continue
+            means[i] += x_i
+        # Avoid division by zero when all values of feature i are NaN
+        if n_samples_feat:
+            means[i] /= n_samples_feat

         for j in xrange(startptr, endptr):
-            diff = X_data[j] - means[i]
+            x_i = X_data[j]
+            if isnan(x_i) and ignore_nan:
+                continue
+            diff = x_i - means[i]
             variances[i] += diff * diff

-        variances[i] += (n_samples - counts) * means[i] * means[i]
-        variances[i] /= n_samples
+        variances[i] += (n_samples_feat - counts) * means[i] * means[i]
+        variances[i] /= n_samples_feat

     return means, variances

+def n_samples_count_csc(np.ndarray[floating, ndim=1] X_data,
+                        shape,
+                        np.ndarray[int, ndim=1] X_indices,
+                        np.ndarray[int, ndim=1] X_indptr):
+    cdef unsigned int n_samples = shape[0]
+    cdef unsigned int n_features = shape[1]
+    cdef unsigned int startptr
+    cdef unsigned int endptr
+    cdef unsigned int i
+    cdef unsigned int j
+
+    cdef np.ndarray[unsigned int, ndim=1] n_samples_feat
+
+    n_samples_feat = np.ones(n_features, dtype=np.uint32) * n_samples
+
+    for i in xrange(n_features):
+        startptr = X_indptr[i]
+        endptr = X_indptr[i + 1]
+
+        for j in xrange(startptr, endptr):
+            if isnan(X_data[j]):
+                n_samples_feat[i] -= 1
+
+    return n_samples_feat

-def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n):
+def n_samples_count_csr(np.ndarray[floating, ndim=1, mode="c"] X_data,
+                        shape,
+                        np.ndarray[int, ndim=1] X_indices):
+    cdef unsigned int n_samples = shape[0]
+    cdef unsigned int n_features = shape[1]
+
+    cdef unsigned int i
+    cdef unsigned int non_zero = X_indices.shape[0]
+    cdef unsigned int col_ind
+
+    cdef np.ndarray[unsigned int, ndim=1] n_samples_feat
+
+    n_samples_feat = np.ones(n_features, dtype=np.uint32) * n_samples
+
+    for i in xrange(non_zero):
+        col_ind = X_indices[i]
+        x_i = X_data[i]
+        if isnan(x_i):
+            n_samples_feat[col_ind] -= 1
+
+    return n_samples_feat
+
+
+def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n,
+                             last_n_feat=np.array([0], dtype=np.uint32)):
     """Compute mean and variance along axis 0 on a CSR or CSC matrix.

     last_mean, last_var are the statistics computed at the last step by this
@@ -244,7 +318,8 @@ def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n):
     if X.dtype != np.float32:
         X = X.astype(np.float64)
     return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr,
-                                     X.format, last_mean, last_var, last_n)
+                                     X.format, last_mean, last_var, last_n,
+                                     last_n_feat)


 def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
@@ -254,7 +329,8 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
                               X_format,
                               last_mean,
                               last_var,
-                              unsigned long last_n):
+                              unsigned long last_n,
+                              np.ndarray[unsigned int, ndim=1] last_n_feat):
     # Implement the function here since variables using fused types
     # cannot be declared directly and can only be passed as function arguments
     cdef unsigned long n_samples = shape[0]
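For reference, the `n_samples_count_csr` / `n_samples_count_csc` helpers introduced earlier in this diff supply the per-feature counts consumed by this function: each feature starts at `n_samples` and loses one count per explicitly stored NaN, so implicit zeros always stay counted. A rough NumPy equivalent of the CSC variant, shown only to make that convention explicit; variable names are illustrative and not part of the patch:

```python
import numpy as np
import scipy.sparse as sp

X = sp.csc_matrix(np.array([[0.0, np.nan],
                            [1.0, 2.0],
                            [np.nan, 0.0]]))

# Count the stored NaN entries of each column by slicing data via indptr ...
nan_per_feature = np.zeros(X.shape[1], dtype=np.uint32)
for j in range(X.shape[1]):
    col = X.data[X.indptr[j]:X.indptr[j + 1]]
    nan_per_feature[j] = np.isnan(col).sum()

# ... and subtract them from the number of rows, as the Cython helper does
# (implicit zeros are never NaN, so they remain in the count).
n_samples_feat = X.shape[0] - nan_per_feature
print(n_samples_feat)  # [2 2] for this toy matrix
```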
@@ -280,24 +356,63 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
     updated_var = np.zeros_like(new_mean, dtype=dtype)

     cdef unsigned long new_n
+    cdef np.ndarray[unsigned int, ndim=1] new_n_feat
     cdef unsigned long updated_n
+    cdef np.ndarray[unsigned int, ndim=1] updated_n_feat
     cdef floating last_over_new_n

     # Obtain new stats first
     new_n = n_samples

     if X_format == 'csr':
         # X is a CSR matrix
+        new_n_feat = n_samples_count_csr(X_data, shape, X_indices)
         new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices)
     else:
         # X is a CSC matrix
+        new_n_feat = n_samples_count_csc(X_data, shape, X_indices, X_indptr)
         new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices,
                                                      X_indptr)
+    new_n = new_n_feat[0]

     # First pass
-    if last_n == 0:
+    if last_n == 0 and (last_n_feat == 0).all():
         return new_mean, new_var, new_n
     # Next passes
+
+    # Case where the features have different counts and updated_n_feat is a vector
+    elif last_n == 0 and (last_n_feat != 0).any():
+        updated_n_feat = last_n_feat + new_n_feat
+
+        for i in xrange(n_features):
+            if updated_n_feat[i] == 0:
+                continue
+            if new_n_feat[i] == 0:
+                updated_mean[i] = last_mean[i]
+                updated_var[i] = last_var[i]
+                continue
+            last_over_new_n = last_n_feat[i] * 1.0 / new_n_feat[i]
+            # Unnormalized old stats
+            last_mean[i] *= last_n_feat[i]
+            last_var[i] *= last_n_feat[i]

+            # Unnormalized new stats
+            new_mean[i] *= new_n_feat[i]
+            new_var[i] *= new_n_feat[i]
+
+            # Update stats
+            updated_var[i] = (last_var[i] + new_var[i] +
+                              last_over_new_n / updated_n_feat[i] *
+                              (last_mean[i] / last_over_new_n -
+                               new_mean[i]) ** 2)
+
+            updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n_feat[i]
+            updated_var[i] = updated_var[i] / updated_n_feat[i]
+
+        return updated_mean, updated_var, updated_n_feat
+
+
+    # Case where updated_n is a scalar
     else:
         updated_n = last_n + new_n
         last_over_new_n = last_n / new_n
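The per-feature branch above performs the usual two-batch mean/variance merge: rescale both sets of statistics by their counts, add them, and apply a cross-term correction before renormalizing. A small Python sketch of that update for a single feature, written from the loop above; `combine_stats` and the guards for empty batches are my own additions, not part of the patch:

```python
import numpy as np

def combine_stats(last_mean, last_var, last_n, new_mean, new_var, new_n):
    # Merge (mean, variance, count) of two batches of a single feature,
    # mirroring the per-feature loop of _incr_mean_variance_axis0.
    updated_n = last_n + new_n
    if last_n == 0:          # nothing accumulated yet
        return new_mean, new_var, updated_n
    if new_n == 0:           # the new batch contributed no observed values
        return last_mean, last_var, updated_n
    last_over_new_n = last_n / new_n
    last_sum, new_sum = last_mean * last_n, new_mean * new_n    # unnormalized means
    last_ssd, new_ssd = last_var * last_n, new_var * new_n      # unnormalized variances
    updated_var = (last_ssd + new_ssd +
                   last_over_new_n / updated_n *
                   (last_sum / last_over_new_n - new_sum) ** 2) / updated_n
    updated_mean = (last_sum + new_sum) / updated_n
    return updated_mean, updated_var, updated_n

# The merged statistics agree with computing them over both batches at once:
a = np.array([1.0, 2.0, 3.0])
b = np.array([10.0, 20.0])
m, v, n = combine_stats(a.mean(), a.var(), len(a), b.mean(), b.var(), len(b))
both = np.concatenate([a, b])
assert np.allclose([m, v], [both.mean(), both.var()])
```

The cross term `last_over_new_n / updated_n * (last_sum / last_over_new_n - new_sum) ** 2` reduces algebraically to `last_n * new_n / updated_n * (last_mean - new_mean) ** 2`, i.e. the standard pairwise variance-combination correction.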