From d0b79853eb872d5300ab5cd54ec2f8bd6e7dda9d Mon Sep 17 00:00:00 2001
From: Pinaki Nath Chowdhury
Date: Fri, 2 Feb 2018 10:08:59 +0530
Subject: [PATCH 01/11] Partially address disregarding NaN values for sparse
 matrices in StandardScaler

---
 sklearn/preprocessing/data.py           |  18 ++-
 sklearn/utils/estimator_checks.py       |   3 +
 sklearn/utils/sparsefuncs.py            |  15 ++-
 sklearn/utils/sparsefuncs_fast.pyx      | 149 +++++++++++++++++++++---
 sklearn/utils/tests/test_sparsefuncs.py |  84 ++++++++++++-
 5 files changed, 241 insertions(+), 28 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index bbd2fae10c0ec..f0736aaf2f7d7 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -26,7 +26,8 @@
 from ..utils.extmath import _incremental_mean_and_var
 from ..utils.fixes import _argmax
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
-                                      inplace_csr_row_normalize_l2)
+                                      inplace_csr_row_normalize_l2,
+                                      n_samples_count_csc, n_samples_count_csr)
 from ..utils.sparsefuncs import (inplace_column_scale,
                                  mean_variance_axis, incr_mean_variance_axis,
                                  min_max_axis)
@@ -619,7 +620,8 @@ def partial_fit(self, X, y=None):
             Ignored
         """
         X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
-                        warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES)
+                        warn_on_dtype=True, estimator=self,
+                        force_all_finite='allow-nan', dtype=FLOAT_DTYPES)
 
         # Even in the case of `with_mean=False`, we update the mean anyway
         # This is needed for the incremental computation of the var
@@ -634,14 +636,19 @@ def partial_fit(self, X, y=None):
             # First pass
             if not hasattr(self, 'n_samples_seen_'):
                 self.mean_, self.var_ = mean_variance_axis(X, axis=0)
-                self.n_samples_seen_ = X.shape[0]
+                if isinstance (X, sparse.csc_matrix):
+                    self.n_samples_seen_ = n_samples_count_csc(X.data, X.shape, X.indices, X.indptr)
+                else:
+                    self.n_samples_seen_ = n_samples_count_csr(X.data, X.shape, X.indices)
+
             # Next passes
             else:
                 self.mean_, self.var_, self.n_samples_seen_ = \
                     incr_mean_variance_axis(X, axis=0,
                                             last_mean=self.mean_,
                                             last_var=self.var_,
-                                            last_n=self.n_samples_seen_)
+                                            last_n=0,
+                                            last_n_feat=self.n_samples_seen_)
         else:
             self.mean_ = None
             self.var_ = None
@@ -688,7 +695,8 @@ def transform(self, X, y='deprecated', copy=None):
 
         copy = copy if copy is not None else self.copy
         X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=True,
-                        estimator=self, dtype=FLOAT_DTYPES)
+                        estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite='allow-nan')
 
         if sparse.issparse(X):
             if self.with_mean:
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
index 0658eca0a371f..15faae83911ab 100644
--- a/sklearn/utils/estimator_checks.py
+++ b/sklearn/utils/estimator_checks.py
@@ -70,6 +70,7 @@
                'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression',
                'RANSACRegressor', 'RadiusNeighborsRegressor',
                'RandomForestRegressor', 'Ridge', 'RidgeCV']
+ALLOW_NAN = ['StandardScaler']
 
 
 def _yield_non_meta_checks(name, estimator):
@@ -1024,6 +1025,8 @@ def check_estimators_nan_inf(name, estimator_orig):
     error_string_transform = ("Estimator doesn't check for NaN and inf in"
                               " transform.")
     for X_train in [X_train_nan, X_train_inf]:
+        if np.any(np.isnan(X_train)) and name in ALLOW_NAN:
+            continue
         # catch deprecation warnings
        with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
            estimator = clone(estimator_orig)
diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
index 38b8b0a6eff16..ee24f1c25b09c 100644
--- a/sklearn/utils/sparsefuncs.py
+++ 
b/sklearn/utils/sparsefuncs.py @@ -99,7 +99,8 @@ def mean_variance_axis(X, axis): _raise_typeerror(X) -def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n): +def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n, + last_n_feat=np.array([0], dtype=np.uint32)): """Compute incremental mean and variance along an axix on a CSR or CSC matrix. @@ -143,17 +144,21 @@ def incr_mean_variance_axis(X, axis, last_mean, last_var, last_n): if isinstance(X, sp.csr_matrix): if axis == 0: return _incr_mean_var_axis0(X, last_mean=last_mean, - last_var=last_var, last_n=last_n) + last_var=last_var, last_n=last_n, + last_n_feat=last_n_feat) else: return _incr_mean_var_axis0(X.T, last_mean=last_mean, - last_var=last_var, last_n=last_n) + last_var=last_var, last_n=last_n, + last_n_feat=last_n_feat) elif isinstance(X, sp.csc_matrix): if axis == 0: return _incr_mean_var_axis0(X, last_mean=last_mean, - last_var=last_var, last_n=last_n) + last_var=last_var, last_n=last_n, + last_n_feat=last_n_feat) else: return _incr_mean_var_axis0(X.T, last_mean=last_mean, - last_var=last_var, last_n=last_n) + last_var=last_var, last_n=last_n, + last_n_feat=last_n_feat) else: _raise_typeerror(X) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 52c12ce5d5953..adec0a38af0ec 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -9,7 +9,7 @@ #!python #cython: boundscheck=False, wraparound=False, cdivision=True -from libc.math cimport fabs, sqrt, pow +from libc.math cimport fabs, sqrt, pow, isnan cimport numpy as np import numpy as np import scipy.sparse as sp @@ -79,7 +79,8 @@ def csr_mean_variance_axis0(X): def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, shape, - np.ndarray[int, ndim=1] X_indices): + np.ndarray[int, ndim=1] X_indices, + ignore_nan=True): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef unsigned int n_samples = shape[0] @@ -94,6 +95,8 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, cdef np.ndarray[floating, ndim=1] means # variances[j] contains the variance of feature j cdef np.ndarray[floating, ndim=1] variances + # n_samples_feat[j] contains the number of Non-NaN values of feature j + cdef np.ndarray[floating, ndim=1] n_samples_feat if floating is float: dtype = np.float32 @@ -102,6 +105,7 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, means = np.zeros(n_features, dtype=dtype) variances = np.zeros_like(means, dtype=dtype) + n_samples_feat = np.ones_like(means, dtype=dtype) * n_samples # counts[j] contains the number of samples where feature j is non-zero cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features, @@ -109,19 +113,29 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, for i in xrange(non_zero): col_ind = X_indices[i] - means[col_ind] += X_data[i] + x_i = X_data[i] + if isnan(x_i) and ignore_nan: + n_samples_feat[col_ind] -= 1 + continue + means[col_ind] += x_i - means /= n_samples + for i in xrange(n_features): + # Avoid division by Zero in cases when all column elements are NaN + if n_samples_feat[i]: + means[i] /= n_samples_feat[i] for i in xrange(non_zero): col_ind = X_indices[i] - diff = X_data[i] - means[col_ind] + x_i = X_data[i] + if isnan(x_i) and ignore_nan: + continue + diff = x_i - means[col_ind] variances[col_ind] += diff * diff counts[col_ind] += 1 for i in xrange(n_features): - 
variances[i] += (n_samples - counts[i]) * means[i] ** 2 - variances[i] /= n_samples + variances[i] += (n_samples_feat[i] - counts[i]) * means[i] ** 2 + variances[i] /= n_samples_feat[i] return means, variances @@ -152,7 +166,8 @@ def csc_mean_variance_axis0(X): def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, shape, np.ndarray[int, ndim=1] X_indices, - np.ndarray[int, ndim=1] X_indptr): + np.ndarray[int, ndim=1] X_indptr, + ignore_nan=True): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef unsigned int n_samples = shape[0] @@ -163,6 +178,7 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, cdef unsigned int counts cdef unsigned int startptr cdef unsigned int endptr + cdef unsigned int n_samples_feat cdef floating diff # means[j] contains the mean of feature j @@ -182,22 +198,80 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, startptr = X_indptr[i] endptr = X_indptr[i + 1] counts = endptr - startptr + n_samples_feat = n_samples for j in xrange(startptr, endptr): - means[i] += X_data[j] - means[i] /= n_samples + x_i = X_data[j] + if isnan(x_i) and ignore_nan: + n_samples_feat -= 1 + continue + means[i] += x_i + # Avoid division by Zero in case where all values are NaN in feature i + if n_samples_feat: + means[i] /= n_samples_feat for j in xrange(startptr, endptr): - diff = X_data[j] - means[i] + x_i = X_data[j] + if isnan(x_i) and ignore_nan: + continue + diff = x_i - means[i] variances[i] += diff * diff - variances[i] += (n_samples - counts) * means[i] * means[i] - variances[i] /= n_samples + variances[i] += (n_samples_feat - counts) * means[i] * means[i] + variances[i] /= n_samples_feat return means, variances +def n_samples_count_csc(np.ndarray[floating, ndim=1] X_data, + shape, + np.ndarray[int, ndim=1] X_indices, + np.ndarray[int, ndim=1] X_indptr): + cdef unsigned int n_samples = shape[0] + cdef unsigned int n_features = shape[1] + cdef unsigned int startptr + cdef unsigned int endptr + cdef unsigned int i + cdef unsigned int j + + cdef np.ndarray[unsigned int, ndim=1] n_samples_feat + + n_samples_feat = np.ones(n_features, dtype=np.uint32) * n_samples + + for i in xrange(n_features): + startptr = X_indptr[i] + endptr = X_indptr[i+1] + + for j in xrange(startptr, endptr): + if isnan(X_data[j]): + n_samples_feat[i] -= 1 + + return n_samples_feat -def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n): +def n_samples_count_csr(np.ndarray[floating, ndim=1, mode="c"] X_data, + shape, + np.ndarray[int, ndim=1] X_indices): + cdef unsigned int n_samples = shape[0] + cdef unsigned int n_features = shape[1] + + cdef unsigned int i + cdef unsigned int non_zero = X_indices.shape[0] + cdef unsigned int col_ind + + cdef np.ndarray[unsigned int, ndim=1] n_samples_feat + + n_samples_feat = np.ones(n_features, dtype=np.uint32) * n_samples + + for i in xrange(non_zero): + col_ind = X_indices[i] + x_i = X_data[i] + if isnan(x_i): + n_samples_feat[col_ind] -= 1 + + return n_samples_feat + + +def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n, + last_n_feat=np.array([0], dtype=np.uint32)): """Compute mean and variance along axis 0 on a CSR or CSC matrix. 
last_mean, last_var are the statistics computed at the last step by this @@ -244,7 +318,8 @@ def incr_mean_variance_axis0(X, last_mean, last_var, unsigned long last_n): if X.dtype != np.float32: X = X.astype(np.float64) return _incr_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr, - X.format, last_mean, last_var, last_n) + X.format, last_mean, last_var, last_n, + last_n_feat) def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, @@ -254,7 +329,8 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, X_format, last_mean, last_var, - unsigned long last_n): + unsigned long last_n, + np.ndarray[unsigned int, ndim=1] last_n_feat): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef unsigned long n_samples = shape[0] @@ -280,7 +356,9 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, updated_var = np.zeros_like(new_mean, dtype=dtype) cdef unsigned long new_n + cdef np.ndarray[unsigned int, ndim=1] new_n_feat cdef unsigned long updated_n + cdef np.ndarray[unsigned int, ndim=1] updated_n_feat cdef floating last_over_new_n # Obtain new stats first @@ -288,16 +366,53 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, if X_format == 'csr': # X is a CSR matrix + new_n_feat = n_samples_count_csr(X_data, shape, X_indices) new_mean, new_var = _csr_mean_variance_axis0(X_data, shape, X_indices) else: # X is a CSC matrix + new_n_feat = n_samples_count_csc(X_data, shape, X_indices, X_indptr) new_mean, new_var = _csc_mean_variance_axis0(X_data, shape, X_indices, X_indptr) + new_n = new_n_feat[0] # First pass - if last_n == 0: + if last_n == 0 and (last_n_feat==0).all(): return new_mean, new_var, new_n # Next passes + + # Where each feature has different values and updated_n_feat is a vector + elif last_n==0 and (last_n_feat!=0).any(): + updated_n_feat = last_n_feat + new_n_feat + + for i in xrange(n_features): + if updated_n_feat[i] == 0: + continue + if new_n_feat[i] == 0: + updated_mean[i] = last_mean[i] + updated_var[i] = last_var[i] + continue + last_over_new_n = last_n_feat[i] * 1.0 / new_n_feat[i] + # Unnormalized old stats + last_mean[i] *= last_n_feat[i] + last_var[i] *= last_n_feat[i] + + # Unnormalized new stats + new_mean[i] *= new_n_feat[i] + new_var[i] *= new_n_feat[i] + + # Update stats + updated_var[i] = (last_var[i] + new_var[i] + + last_over_new_n / updated_n_feat[i] * + (last_mean[i] / last_over_new_n - + new_mean[i]) ** 2) + + updated_mean[i] = (last_mean[i] + new_mean[i]) / updated_n_feat[i] + updated_var[i] = updated_var[i] / updated_n_feat[i] + + return updated_mean, updated_var, updated_n_feat + + + # Where updated_n is a scaler else: updated_n = last_n + new_n last_over_new_n = last_n / new_n diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index f2b35e7459833..a70d503291314 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -21,6 +21,28 @@ from sklearn.utils.testing import assert_raises +def test_mean_variance_axis0_nan(): + X = np.zeros(10).reshape(5,2) * np.nan + X[0,0] = 1 + X_csr = sp.csr_matrix(X) + X_csc = sp.csc_matrix(X) + + expected_dtypes = [(np.float32, np.float32), + (np.float64, np.float64), + (np.int32, np.float64), + (np.int64, np.float64)] + + for input_dtype, output_dtype in expected_dtypes: + + for X_sparse in (X_csr, X_csc): + X_sparse = X_sparse.astype(input_dtype) + X_means, X_vars = 
mean_variance_axis(X_sparse, axis=0)
+            assert_equal(X_means.dtype, output_dtype)
+            assert_equal(X_vars.dtype, output_dtype)
+            np.allclose(X_means, np.array([1, np.nan], dtype=output_dtype))
+            np.allclose(X_vars, np.array([0, np.nan], dtype=output_dtype))
+
+
 def test_mean_variance_axis0():
     X, _ = make_classification(5, 4, random_state=0)
     # Sparsify the array a little bit
@@ -82,6 +104,67 @@ def test_mean_variance_axis1():
     assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
     assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
 
+def test_incr_mean_variance_axis_nan():
+    for axis in [0, 1]:
+        n_features = 50
+        data_chunk = np.random.randint(0,2, size=500).reshape(50,10)
+        data_chunk = np.array(data_chunk, dtype=np.float)
+        data_chunk[0,0] = np.nan
+
+        # default params for incr_mean_variance
+        last_mean = np.zeros(n_features)
+        last_var = np.zeros_like(last_mean)
+        last_n = np.zeros_like(last_mean, dtype=np.uint32)
+
+        # Test errors
+        X = data_chunk[0]
+        X = np.atleast_2d(X)
+        X_lil = sp.lil_matrix(X)
+        X_csr = sp.csr_matrix(X_lil)
+        assert_raises(TypeError, incr_mean_variance_axis, axis,
+                      last_mean, last_var, last_n)
+        assert_raises(TypeError, incr_mean_variance_axis, axis,
+                      last_mean, last_var, last_n)
+        assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis,
+                      last_mean, last_var, last_n)
+
+        # Test _incr_mean_and_var with a 1 row input
+        X_means, X_vars = mean_variance_axis(X_csr, axis)
+        X_means_incr, X_vars_incr, n_incr = \
+            incr_mean_variance_axis(X_csr, axis, last_mean, last_var,
+                                    last_n=0, last_n_feat=last_n)
+        np.allclose(X_means, X_means_incr)
+        np.allclose(X_vars, X_vars_incr)
+
+        X_csc = sp.csc_matrix(X_lil)
+        X_means, X_vars = mean_variance_axis(X_csc, axis)
+        np.allclose(X_means, X_means_incr)
+        np.allclose(X_vars, X_vars_incr)
+
+        # Test _incremental_mean_and_var with whole data
+        X = data_chunk
+        X_lil = sp.lil_matrix(X)
+        X_csr = sp.csr_matrix(X_lil)
+        X_csc = sp.csc_matrix(X_lil)
+
+        expected_dtypes = [(np.float32, np.float32),
+                           (np.float64, np.float64),
+                           (np.int32, np.float64),
+                           (np.int64, np.float64)]
+
+        for input_dtype, output_dtype in expected_dtypes:
+            for X_sparse in (X_csr, X_csc):
+                X_sparse = X_sparse.astype(input_dtype)
+                X_means, X_vars = mean_variance_axis(X_sparse, axis)
+                X_means_incr, X_vars_incr, n_incr = \
+                    incr_mean_variance_axis(X_sparse, axis, last_mean,
+                                            last_var, last_n=0,
+                                            last_n_feat=last_n)
+                assert_equal(X_means_incr.dtype, output_dtype)
+                assert_equal(X_vars_incr.dtype, output_dtype)
+                np.allclose(X_means, X_means_incr)
+                np.allclose(X_vars, X_vars_incr)
+
 
 def test_incr_mean_variance_axis():
     for axis in [0, 1]:
@@ -146,7 +229,6 @@ def test_incr_mean_variance_axis():
         assert_array_almost_equal(X_vars, X_vars_incr)
         assert_equal(X.shape[axis], n_incr)
 
-
 def test_mean_variance_illegal_axis():
     X, _ = make_classification(5, 4, random_state=0)
     # Sparsify the array a little bit

From 110977da9fcaab29ee0acc26e3cdfe05e79b4a52 Mon Sep 17 00:00:00 2001
From: Pinaki Nath Chowdhury
Date: Fri, 2 Feb 2018 11:10:28 +0530
Subject: [PATCH 02/11] removed flake8 errors

---
 sklearn/preprocessing/data.py           | 12 ++++++++----
 sklearn/utils/sparsefuncs_fast.pyx      | 13 +++++++------
 sklearn/utils/tests/test_sparsefuncs.py | 10 ++++++----
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index f0736aaf2f7d7..fe86099869042 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -636,10 +636,13 @@ def partial_fit(self, X, y=None):
             # First pass
             if not hasattr(self, 'n_samples_seen_'):
                 self.mean_, self.var_ = mean_variance_axis(X, axis=0)
-                if isinstance (X, sparse.csc_matrix):
-                    self.n_samples_seen_ = n_samples_count_csc(X.data, X.shape, X.indices, X.indptr)
+                if isinstance(X, sparse.csc_matrix):
+                    self.n_samples_seen_ = \
+                        n_samples_count_csc(X.data, X.shape,
+                                            X.indices, X.indptr)
                 else:
-                    self.n_samples_seen_ = n_samples_count_csr(X.data, X.shape, X.indices)
+                    self.n_samples_seen_ = \
+                        n_samples_count_csr(X.data, X.shape, X.indices)
 
             # Next passes
             else:
@@ -648,7 +651,8 @@ def partial_fit(self, X, y=None):
                                             last_mean=self.mean_,
                                             last_var=self.var_,
                                             last_n=0,
-                                            last_n_feat=self.n_samples_seen_)
+                                            last_n_feat= \
+                                            self.n_samples_seen_)
         else:
             self.mean_ = None
             self.var_ = None
diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
index adec0a38af0ec..41508dfa64b32 100644
--- a/sklearn/utils/sparsefuncs_fast.pyx
+++ b/sklearn/utils/sparsefuncs_fast.pyx
@@ -222,10 +222,11 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
 
     return means, variances
 
+
 def n_samples_count_csc(np.ndarray[floating, ndim=1] X_data,
-                       shape,
-                       np.ndarray[int, ndim=1] X_indices,
-                       np.ndarray[int, ndim=1] X_indptr):
+                        shape,
+                        np.ndarray[int, ndim=1] X_indices,
+                        np.ndarray[int, ndim=1] X_indptr):
     cdef unsigned int n_samples = shape[0]
     cdef unsigned int n_features = shape[1]
     cdef unsigned int startptr
@@ -247,9 +248,10 @@ def n_samples_count_csc(np.ndarray[floating, ndim=1] X_data,
 
     return n_samples_feat
 
+
 def n_samples_count_csr(np.ndarray[floating, ndim=1, mode="c"] X_data,
-                       shape,
-                       np.ndarray[int, ndim=1] X_indices):
+                        shape,
+                        np.ndarray[int, ndim=1] X_indices):
     cdef unsigned int n_samples = shape[0]
     cdef unsigned int n_features = shape[1]
 
@@ -411,7 +413,6 @@ def _incr_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data,
 
         return updated_mean, updated_var, updated_n_feat
 
-
     # Where updated_n is a scaler
     else:
         updated_n = last_n + new_n
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
index a70d503291314..c606a16849641 100644
--- a/sklearn/utils/tests/test_sparsefuncs.py
+++ b/sklearn/utils/tests/test_sparsefuncs.py
@@ -22,8 +22,8 @@
 
 def test_mean_variance_axis0_nan():
-    X = np.zeros(10).reshape(5,2) * np.nan
-    X[0,0] = 1
+    X = np.zeros(10).reshape(5, 2) * np.nan
+    X[0, 0] = 1
     X_csr = sp.csr_matrix(X)
     X_csc = sp.csc_matrix(X)
 
@@ -104,12 +104,13 @@ def test_mean_variance_axis1():
     assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
     assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
 
+
 def test_incr_mean_variance_axis_nan():
     for axis in [0, 1]:
         n_features = 50
-        data_chunk = np.random.randint(0,2, size=500).reshape(50,10)
+        data_chunk = np.random.randint(0, 2, size=500).reshape(50, 10)
         data_chunk = np.array(data_chunk, dtype=np.float)
-        data_chunk[0,0] = np.nan
+        data_chunk[0, 0] = np.nan
 
         # default params for incr_mean_variance
         last_mean = np.zeros(n_features)
@@ -229,6 +230,7 @@ def test_incr_mean_variance_axis():
         assert_array_almost_equal(X_vars, X_vars_incr)
         assert_equal(X.shape[axis], n_incr)
 
+
 def test_mean_variance_illegal_axis():
     X, _ = make_classification(5, 4, random_state=0)
     # Sparsify the array a little bit

From f4adbb9c5e650e2dc68e5851b0fb56fb9c84152b Mon Sep 17 00:00:00 2001
From: Pinaki Nath Chowdhury
Date: Fri, 2 Feb 2018 11:46:51 +0530
Subject: [PATCH 03/11] removing flake8 errors

---
 sklearn/preprocessing/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index fe86099869042..6f930f7a8547f 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -651,7 +651,7 @@ def partial_fit(self, X, y=None):
                                             last_mean=self.mean_,
                                             last_var=self.var_,
                                             last_n=0,
-                                            last_n_feat= \
+                                            last_n_feat=
                                             self.n_samples_seen_)
         else:
             self.mean_ = None

From a44b9bac737df55f871dc696c77a759ae3fab74a Mon Sep 17 00:00:00 2001
From: Pinaki Nath Chowdhury
Date: Fri, 2 Feb 2018 12:38:58 +0530
Subject: [PATCH 04/11] removing pep8 errors

---
 sklearn/preprocessing/data.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 6f930f7a8547f..6f46d5f000412 100644
--- a/sklearn/preprocessing/data.py
+++ b/sklearn/preprocessing/data.py
@@ -647,12 +647,12 @@ def partial_fit(self, X, y=None):
             # Next passes
             else:
                 self.mean_, self.var_, self.n_samples_seen_ = \
-                    incr_mean_variance_axis(X, axis=0,
-                                            last_mean=self.mean_,
-                                            last_var=self.var_,
-                                            last_n=0,
-                                            last_n_feat=
-                                            self.n_samples_seen_)
+                    incr_mean_variance_axis(
+                        X, axis=0,
+                        last_mean=self.mean_,
+                        last_var=self.var_,
+                        last_n=0,
+                        last_n_feat=self.n_samples_seen_)
         else:
             self.mean_ = None
             self.var_ = None

From ceb268db24b3d812e1f508039f8d4937c6e37200 Mon Sep 17 00:00:00 2001
From: Pinaki Nath Chowdhury
Date: Sat, 3 Feb 2018 22:10:13 +0530
Subject: [PATCH 05/11] addresses error encountered in AppVeyor

---
 sklearn/utils/sparsefuncs_fast.pyx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
index 41508dfa64b32..bcef59fcc830d 100644
--- a/sklearn/utils/sparsefuncs_fast.pyx
+++ b/sklearn/utils/sparsefuncs_fast.pyx
@@ -15,6 +15,8 @@ import numpy as np
 import scipy.sparse as sp
 cimport cython
 from cython cimport floating
+cdef extern from "numpy/npy_math.h" nogil:
+    bint isnan "npy_isnan"(long double)
 
 np.import_array()

From 430231c7c15d7e7cb1eb1967cb84f795c36ae188 Mon Sep 17 00:00:00 2001
From: Pinaki Nath Chowdhury
Date: Sun, 4 Feb 2018 00:45:38 +0530
Subject: [PATCH 06/11] addressing AppVeyor error

---
 sklearn/utils/sparsefuncs_fast.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
index bcef59fcc830d..ff5465deb9adf 100644
--- a/sklearn/utils/sparsefuncs_fast.pyx
+++ b/sklearn/utils/sparsefuncs_fast.pyx
@@ -9,14 +9,14 @@
 #!python
 #cython: boundscheck=False, wraparound=False, cdivision=True
 
-from libc.math cimport fabs, sqrt, pow, isnan
+from libc.math cimport fabs, sqrt, pow
 cimport numpy as np
 import numpy as np
 import scipy.sparse as sp
 cimport cython
 from cython cimport floating
-cdef extern from "numpy/npy_math.h" nogil:
-    bint isnan "npy_isnan"(long double)
+
+from numpy.math cimport isnan
 
 np.import_array()

From f98823f01ce989eae58d226c85d9d09b899481c0 Mon Sep 17 00:00:00 2001
From: Pinaki Nath Chowdhury
Date: Fri, 9 Feb 2018 00:53:35 +0530
Subject: [PATCH 07/11] added test cases for _incremental_mean_and_var and
 sparse matrix

---
 sklearn/preprocessing/data.py            |  2 +-
 sklearn/preprocessing/tests/test_data.py | 12 +++---
 sklearn/utils/extmath.py                 | 54 +++++++++++++++++-------
 sklearn/utils/sparsefuncs_fast.pyx       | 18 +++++---
 sklearn/utils/tests/test_extmath.py      | 26 ++++++++++++
 sklearn/utils/tests/test_sparsefuncs.py  | 30 +++++++------
 6 files changed, 101 insertions(+), 41 deletions(-)

diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py
index 6f46d5f000412..d4d796a9bfcb5 100644
--- a/sklearn/preprocessing/data.py
+++ 
b/sklearn/preprocessing/data.py @@ -660,7 +660,7 @@ def partial_fit(self, X, y=None): # First pass if not hasattr(self, 'n_samples_seen_'): self.mean_ = .0 - self.n_samples_seen_ = 0 + self.n_samples_seen_ = np.zeros(X.shape[1]) if self.with_std: self.var_ = .0 else: diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index f4358e48fc0b8..7e91e6aabac2b 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -203,7 +203,7 @@ def test_standard_scaler_1d(): np.zeros_like(n_features)) assert_array_almost_equal(X_scaled.mean(axis=0), .0) assert_array_almost_equal(X_scaled.std(axis=0), 1.) - assert_equal(scaler.n_samples_seen_, X.shape[0]) + assert_equal(scaler.n_samples_seen_[0], X.shape[0]) # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) @@ -283,7 +283,7 @@ def test_scaler_2d_arrays(): scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) - assert_equal(scaler.n_samples_seen_, n_samples) + assert_equal(scaler.n_samples_seen_[0], n_samples) assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) @@ -399,7 +399,7 @@ def test_standard_scaler_partial_fit(): assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_) assert_equal(scaler_batch.var_, scaler_incr.var_) # Nones - assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_) + assert_equal(scaler_batch.n_samples_seen_[0], scaler_incr.n_samples_seen_[0]) # Test std after 1 step batch0 = slice(0, chunk_size) @@ -423,10 +423,10 @@ def test_standard_scaler_partial_fit(): assert_correct_incr(i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, - n_samples_seen=scaler_incr.n_samples_seen_) + n_samples_seen=scaler_incr.n_samples_seen_[0]) assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_) - assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_) + assert_equal(scaler_batch.n_samples_seen_[0], scaler_incr.n_samples_seen_[0]) def test_standard_scaler_partial_fit_numerical_stability(): @@ -515,7 +515,7 @@ def test_standard_scaler_trasform_with_partial_fit(): assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal assert_array_less(zero, scaler_incr.scale_ + epsilon) # (i+1) because the Scaler has been already fitted - assert_equal((i + 1), scaler_incr.n_samples_seen_) + assert_equal((i + 1), scaler_incr.n_samples_seen_[0]) def test_min_max_scaler_iris(): diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index e95ceb57497ae..cf20eed8c1d2b 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -643,7 +643,7 @@ def make_nonnegative(X, min_value=0): def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, - last_sample_count=0): + last_sample_count=0, ignore_nan=True): """Calculate mean update and a Youngs and Cramer variance update. 
last_mean and last_variance are statistics computed at the last step by the @@ -688,29 +688,53 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, # old = stats until now # new = the current increment # updated = the aggregated stats + flag = 0 # if flag == 1 then last_sample_count was an array + n_features = X.shape[1] + if isinstance(last_sample_count, np.ndarray): + flag = 1 + else: + last_sample_count *= np.ones(n_features) + last_sum = last_mean * last_sample_count - new_sum = X.sum(axis=0) + sum_func = np.nansum if ignore_nan else np.sum + new_sum = sum_func(X, axis=0) + new_sum[np.isnan(new_sum)] = 0 - new_sample_count = X.shape[0] + new_sample_count = np.count_nonzero(~np.isnan(X), axis=0) updated_sample_count = last_sample_count + new_sample_count updated_mean = (last_sum + new_sum) / updated_sample_count + updated_variance = np.zeros(n_features) if last_variance is None: updated_variance = None else: - new_unnormalized_variance = X.var(axis=0) * new_sample_count - if last_sample_count == 0: # Avoid division by 0 - updated_unnormalized_variance = new_unnormalized_variance - else: - last_over_new_count = last_sample_count / new_sample_count - last_unnormalized_variance = last_variance * last_sample_count - updated_unnormalized_variance = ( - last_unnormalized_variance + - new_unnormalized_variance + - last_over_new_count / updated_sample_count * - (last_sum / last_over_new_count - new_sum) ** 2) - updated_variance = updated_unnormalized_variance / updated_sample_count + var_func = np.nanvar if ignore_nan else np.var + new_unnormalized_variance = var_func(X, axis=0) + new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0 + new_unnormalized_variance = (new_unnormalized_variance * + new_sample_count) + for i in xrange(n_features): + if updated_sample_count[i] == 0: # Avoid division by 0 + continue + # Avoid division by 0 + elif last_sample_count[i] == 0 or new_sample_count[i] == 0: + updated_unnormalized_variance = new_unnormalized_variance[i] + else: + last_over_new_count = (last_sample_count[i] / + new_sample_count[i]) + last_unnormalized_variance = (last_variance[i] * + last_sample_count[i]) + updated_unnormalized_variance = ( + last_unnormalized_variance + + new_unnormalized_variance[i] + + last_over_new_count / updated_sample_count[i] * + (last_sum[i] / last_over_new_count - new_sum[i]) ** 2) + updated_variance[i] = (updated_unnormalized_variance / + updated_sample_count[i]) + + if flag == 0: # If n_sample_count was not an array + updated_sample_count = updated_sample_count[0] return updated_mean, updated_variance, updated_sample_count diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index ff5465deb9adf..ffaf1f5af770a 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -136,8 +136,9 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, counts[col_ind] += 1 for i in xrange(n_features): - variances[i] += (n_samples_feat[i] - counts[i]) * means[i] ** 2 - variances[i] /= n_samples_feat[i] + if n_samples_feat[i]: + variances[i] += (n_samples_feat[i] - counts[i]) * means[i] ** 2 + variances[i] /= n_samples_feat[i] return means, variances @@ -202,15 +203,20 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, counts = endptr - startptr n_samples_feat = n_samples + for j in xrange(startptr, endptr): + if isnan(X_data[j]) and ignore_nan: + n_samples_feat -= 1 + counts -= 1 + if n_samples_feat == 0: # Avoid division by Zero + continue + for j 
in xrange(startptr, endptr):
             x_i = X_data[j]
             if isnan(x_i) and ignore_nan:
-                n_samples_feat -= 1
                 continue
             means[i] += x_i
-        # Avoid division by Zero in case where all values are NaN in feature i
-        if n_samples_feat:
-            means[i] /= n_samples_feat
+
+        means[i] /= n_samples_feat
 
         for j in xrange(startptr, endptr):
             x_i = X_data[j]
diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py
index f53b814c70084..fe6a1aba7361e 100644
--- a/sklearn/utils/tests/test_extmath.py
+++ b/sklearn/utils/tests/test_extmath.py
@@ -5,6 +5,7 @@
 # License: BSD 3 clause
 
 import numpy as np
+from numpy.testing import assert_allclose
 from scipy import sparse
 from scipy import linalg
 from scipy import stats
@@ -467,6 +468,31 @@ def naive_log_logistic(x):
     assert_array_almost_equal(log_logistic(extreme_x), [-100, 0])
 
 
+def test_incremental_mean_and_var_nan():
+    # Test mean and variance when an array has floating NaN values
+    A = np.array([[600, 470, 170, 430, np.nan],
+                  [600, np.nan, 170, 430, 300],
+                  [np.nan, np.nan, np.nan, np.nan, np.nan],
+                  [np.nan, np.nan, np.nan, np.nan, np.nan]])
+    X1 = A[:2, :]
+    X2 = A[2:, :]
+    X_means = np.nanmean(X1, axis=0)
+    X_variances = np.nanvar(X1, axis=0)
+    X_count = np.count_nonzero(~np.isnan(X1), axis=0)
+    A_means = np.nanmean(A, axis=0)
+    A_variances = np.nanvar(A, axis=0)
+    A_count = np.count_nonzero(~np.isnan(A), axis=0)
+
+    final_means, final_variances, final_count = \
+        _incremental_mean_and_var(X2, X_means, X_variances, X_count)
+    assert_allclose(A_means, final_means, equal_nan=True)
+    print(A_variances)
+    print(X_variances)
+    print(final_variances)
+    assert_allclose(A_variances, final_variances, equal_nan=True)
+    assert_allclose(A_count, final_count, equal_nan=True)
+
+
 def test_incremental_variance_update_formulas():
     # Test Youngs and Cramer incremental variance formulas.
# Doggie data from http://www.mathsisfun.com/data/standard-deviation.html diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index c606a16849641..586eecac88a38 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -4,7 +4,7 @@ from scipy import linalg from numpy.testing import (assert_array_almost_equal, assert_array_equal, - assert_equal) + assert_equal, assert_allclose) from numpy.random import RandomState from sklearn.datasets import make_classification @@ -22,10 +22,14 @@ def test_mean_variance_axis0_nan(): - X = np.zeros(10).reshape(5, 2) * np.nan - X[0, 0] = 1 - X_csr = sp.csr_matrix(X) - X_csc = sp.csc_matrix(X) + A = np.zeros(10).reshape(5, 2) * np.nan + A[0, 0] = 1 + A_means = np.nanmean(A, axis=0) + A_means[np.isnan(A_means)] = 0 + A_vars = np.nanvar(A, axis=0) + A_vars[np.isnan(A_vars)] = 0 + X_csr = sp.csr_matrix(A) + X_csc = sp.csc_matrix(A) expected_dtypes = [(np.float32, np.float32), (np.float64, np.float64), @@ -39,8 +43,8 @@ def test_mean_variance_axis0_nan(): X_means, X_vars = mean_variance_axis(X_sparse, axis=0) assert_equal(X_means.dtype, output_dtype) assert_equal(X_vars.dtype, output_dtype) - np.allclose(X_means, np.array([1, np.nan], dtype=output_dtype)) - np.allclose(X_vars, np.array([0, np.nan], dtype=output_dtype)) + assert_allclose(X_means, A_means) + assert_allclose(X_vars, A_vars) def test_mean_variance_axis0(): @@ -134,13 +138,13 @@ def test_incr_mean_variance_axis_nan(): X_means_incr, X_vars_incr, n_incr = \ incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n=0, last_n_feat=last_n) - np.allclose(X_means, X_means_incr) - np.allclose(X_vars, X_vars_incr) + assert_allclose(X_means, X_means_incr) + assert_allclose(X_vars, X_vars_incr) X_csc = sp.csc_matrix(X_lil) X_means, X_vars = mean_variance_axis(X_csc, axis) - np.allclose(X_means, X_means_incr) - np.allclose(X_vars, X_vars_incr) + assert_allclose(X_means, X_means_incr) + assert_allclose(X_vars, X_vars_incr) # Test _incremental_mean_and_var with whole data X = data_chunk @@ -163,8 +167,8 @@ def test_incr_mean_variance_axis_nan(): last_n_feat=last_n) assert_equal(X_means_incr.dtype, output_dtype) assert_equal(X_vars_incr.dtype, output_dtype) - np.allclose(X_means, X_means_incr) - np.allclose(X_vars, X_vars_incr) + assert_allclose(X_means, X_means_incr) + assert_allclose(X_vars, X_vars_incr) def test_incr_mean_variance_axis(): From fec502fd4ffd2b12f5faa9a3393224564f14902e Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Fri, 9 Feb 2018 02:14:21 +0530 Subject: [PATCH 08/11] fixed test cases for sparse matrix --- sklearn/utils/tests/test_sparsefuncs.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index 586eecac88a38..51a8fca5425bd 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -32,9 +32,7 @@ def test_mean_variance_axis0_nan(): X_csc = sp.csc_matrix(A) expected_dtypes = [(np.float32, np.float32), - (np.float64, np.float64), - (np.int32, np.float64), - (np.int64, np.float64)] + (np.float64, np.float64)] for input_dtype, output_dtype in expected_dtypes: From 5b5f72d471ee5f3a3933c3542932941878a71744 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Fri, 9 Feb 2018 11:30:20 +0530 Subject: [PATCH 09/11] silenced warnings in _incremental_mean_and_var --- sklearn/utils/extmath.py | 37 ++++++++++++++++++++----------------- 1 file changed, 20 
insertions(+), 17 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index cf20eed8c1d2b..e981914eab87f 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -715,23 +715,26 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_unnormalized_variance = (new_unnormalized_variance * new_sample_count) for i in xrange(n_features): - if updated_sample_count[i] == 0: # Avoid division by 0 - continue - # Avoid division by 0 - elif last_sample_count[i] == 0 or new_sample_count[i] == 0: - updated_unnormalized_variance = new_unnormalized_variance[i] - else: - last_over_new_count = (last_sample_count[i] / - new_sample_count[i]) - last_unnormalized_variance = (last_variance[i] * - last_sample_count[i]) - updated_unnormalized_variance = ( - last_unnormalized_variance + - new_unnormalized_variance[i] + - last_over_new_count / updated_sample_count[i] * - (last_sum[i] / last_over_new_count - new_sum[i]) ** 2) - updated_variance[i] = (updated_unnormalized_variance / - updated_sample_count[i]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + if updated_sample_count[i] == 0: # Avoid division by 0 + continue + # Avoid division by 0 + elif last_sample_count[i] == 0 or new_sample_count[i] == 0: + updated_unnormalized_variance = \ + new_unnormalized_variance[i] + else: + last_over_new_count = (last_sample_count[i] / + new_sample_count[i]) + last_unnormalized_variance = (last_variance[i] * + last_sample_count[i]) + updated_unnormalized_variance = ( + last_unnormalized_variance + + new_unnormalized_variance[i] + + last_over_new_count / updated_sample_count[i] * + (last_sum[i] / last_over_new_count - new_sum[i]) ** 2) + updated_variance[i] = (updated_unnormalized_variance / + updated_sample_count[i]) if flag == 0: # If n_sample_count was not an array updated_sample_count = updated_sample_count[0] From a2873925196f1c24f4a8b18a098633d0e8004a78 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Fri, 9 Feb 2018 11:49:34 +0530 Subject: [PATCH 10/11] Revert "silenced warnings in _incremental_mean_and_var" This reverts commit 5b5f72d471ee5f3a3933c3542932941878a71744. 
--- sklearn/utils/extmath.py | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index e981914eab87f..cf20eed8c1d2b 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -715,26 +715,23 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_unnormalized_variance = (new_unnormalized_variance * new_sample_count) for i in xrange(n_features): - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - if updated_sample_count[i] == 0: # Avoid division by 0 - continue - # Avoid division by 0 - elif last_sample_count[i] == 0 or new_sample_count[i] == 0: - updated_unnormalized_variance = \ - new_unnormalized_variance[i] - else: - last_over_new_count = (last_sample_count[i] / - new_sample_count[i]) - last_unnormalized_variance = (last_variance[i] * - last_sample_count[i]) - updated_unnormalized_variance = ( - last_unnormalized_variance + - new_unnormalized_variance[i] + - last_over_new_count / updated_sample_count[i] * - (last_sum[i] / last_over_new_count - new_sum[i]) ** 2) - updated_variance[i] = (updated_unnormalized_variance / - updated_sample_count[i]) + if updated_sample_count[i] == 0: # Avoid division by 0 + continue + # Avoid division by 0 + elif last_sample_count[i] == 0 or new_sample_count[i] == 0: + updated_unnormalized_variance = new_unnormalized_variance[i] + else: + last_over_new_count = (last_sample_count[i] / + new_sample_count[i]) + last_unnormalized_variance = (last_variance[i] * + last_sample_count[i]) + updated_unnormalized_variance = ( + last_unnormalized_variance + + new_unnormalized_variance[i] + + last_over_new_count / updated_sample_count[i] * + (last_sum[i] / last_over_new_count - new_sum[i]) ** 2) + updated_variance[i] = (updated_unnormalized_variance / + updated_sample_count[i]) if flag == 0: # If n_sample_count was not an array updated_sample_count = updated_sample_count[0] From 3f64821c48506756631e51a61a766e8dfd75bbb8 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Fri, 9 Feb 2018 12:17:36 +0530 Subject: [PATCH 11/11] fixed error in _incremental_mean_and_var --- sklearn/utils/extmath.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index cf20eed8c1d2b..56aff84586f36 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -701,6 +701,8 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_sum[np.isnan(new_sum)] = 0 new_sample_count = np.count_nonzero(~np.isnan(X), axis=0) + if not isinstance(new_sample_count, np.ndarray): + new_sample_count *= np.ones(n_features) updated_sample_count = last_sample_count + new_sample_count updated_mean = (last_sum + new_sum) / updated_sample_count
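
Notes (illustrative sketches appended for review; not part of the patches themselves):

1) The NaN semantics that patch 01 gives _csr_mean_variance_axis0 can be summarized in pure NumPy: implicit zeros count as ordinary samples, stored NaNs are excluded per feature, and an all-NaN feature falls back to mean = var = 0 instead of dividing by zero. This is a minimal sketch of that behavior, not the series' Cython code; the helper name nan_mean_var_axis0 is mine.

    import numpy as np
    import scipy.sparse as sp

    def nan_mean_var_axis0(X):
        # Per-feature mean/variance of a CSR matrix, ignoring stored NaNs.
        n_samples, n_features = X.shape
        data, cols = X.data, X.indices
        nan_mask = np.isnan(data)
        # Non-NaN sample count per feature (implicit zeros included).
        n_valid = n_samples - np.bincount(cols[nan_mask],
                                          minlength=n_features)
        ok = ~nan_mask
        sums = np.bincount(cols[ok], weights=data[ok], minlength=n_features)
        means = np.divide(sums, n_valid, out=np.zeros(n_features),
                          where=n_valid != 0)
        # Squared deviations of the stored non-NaN entries ...
        sq = np.bincount(cols[ok], weights=(data[ok] - means[cols[ok]]) ** 2,
                         minlength=n_features)
        # ... plus the contribution of the implicit zeros.
        n_stored = np.bincount(cols[ok], minlength=n_features)
        sq += (n_valid - n_stored) * means ** 2
        variances = np.divide(sq, n_valid, out=np.zeros(n_features),
                              where=n_valid != 0)
        return means, variances, n_valid

    X = sp.csr_matrix(np.array([[1., np.nan], [0., 3.], [2., np.nan]]))
    print(nan_mean_var_axis0(X))
    # means [1., 3.], variances [2/3, 0.], counts [3, 1]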
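2) The incremental update in _incr_mean_variance_axis0 and the reworked _incremental_mean_and_var is the Youngs & Cramer / Chan et al. pairwise merge, applied feature-wise because NaNs give every feature its own effective sample size. A vectorized sketch of that merge (function name mine):

    import numpy as np

    def combine_mean_var(mean_a, var_a, n_a, mean_b, var_b, n_b):
        # Merge per-feature (mean, var, count) of two disjoint chunks.
        n = n_a + n_b
        safe_n = np.where(n == 0, 1, n)  # all-NaN feature: avoid 0/0
        mean = (n_a * mean_a + n_b * mean_b) / safe_n
        # m2 = n * var is the unnormalized sum of squared deviations;
        # the last term is the pairwise cross-chunk correction.
        m2 = (n_a * var_a + n_b * var_b
              + n_a * n_b / safe_n * (mean_a - mean_b) ** 2)
        var = m2 / safe_n
        # A feature seen in only one chunk keeps that chunk's statistics.
        mean = np.where(n_b == 0, mean_a, np.where(n_a == 0, mean_b, mean))
        var = np.where(n_b == 0, var_a, np.where(n_a == 0, var_b, var))
        return mean, var, n

    m, v, n = combine_mean_var(np.array([2.]), np.array([2. / 3]),
                               np.array([3]), np.array([5.]),
                               np.array([0.]), np.array([1]))
    # m = [2.75], v = [2.1875]: the batch statistics of [1, 2, 3, 5]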
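3) End to end, with the series applied, fitting sparse data containing NaNs is intended to work roughly as below (the values follow from the semantics tested above; exact dtypes and reprs may differ on this work-in-progress branch):

    import numpy as np
    import scipy.sparse as sp
    from sklearn.preprocessing import StandardScaler

    X = sp.csr_matrix(np.array([[0., 1., np.nan],
                                [2., np.nan, 3.],
                                [4., 5., 6.]]))
    # force_all_finite='allow-nan' lets the NaNs through check_array, and
    # n_samples_seen_ becomes a per-feature array instead of a scalar.
    scaler = StandardScaler(with_mean=False).fit(X)
    print(scaler.n_samples_seen_)  # [3, 2, 2]: NaNs excluded per feature
    print(scaler.var_)             # [8/3, 4., 2.25]: NaNs ignored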