From 8cf9dbd18de602fe67b6dd65aa420560cb28f8e3 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Sun, 11 Feb 2018 00:20:18 +0530 Subject: [PATCH 01/23] _incremental_mean_and_var can now handle NaN values --- sklearn/preprocessing/data.py | 7 ++++++- sklearn/utils/estimator_checks.py | 3 +++ sklearn/utils/extmath.py | 25 ++++++++++++++++++++----- sklearn/utils/tests/test_extmath.py | 27 +++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bbd2fae10c0ec..7ad99e2311c23 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -619,13 +619,18 @@ def partial_fit(self, X, y=None): Ignored """ X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES) + warn_on_dtype=True, estimator=self, + force_all_finite='allow-nan', dtype=FLOAT_DTYPES) # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var # See incr_mean_variance_axis and _incremental_mean_variance_axis if sparse.issparse(X): + X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, + warn_on_dtype=True, estimator=self, + dtype=FLOAT_DTYPES) + if self.with_mean: raise ValueError( "Cannot center sparse matrices: pass `with_mean=False` " diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index b079c37f7bea2..9add53f720b46 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -70,6 +70,7 @@ 'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression', 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] +ALLOW_NAN = ['StandardScaler'] def _yield_non_meta_checks(name, estimator): @@ -1024,6 +1025,8 @@ def check_estimators_nan_inf(name, estimator_orig): error_string_transform = ("Estimator doesn't check for NaN and inf in" " transform.") for X_train in [X_train_nan, X_train_inf]: + if np.any(np.isnan(X_train)) and name in ALLOW_NAN: + continue # catch deprecation warnings with ignore_warnings(category=(DeprecationWarning, FutureWarning)): estimator = clone(estimator_orig) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index e95ceb57497ae..4124d002e9356 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -643,7 +643,7 @@ def make_nonnegative(X, min_value=0): def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, - last_sample_count=0): + last_sample_count=0, ignore_nan=True): """Calculate mean update and a Youngs and Cramer variance update.
last_mean and last_variance are statistics computed at the last step by the @@ -688,21 +688,32 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, # old = stats until now # new = the current increment # updated = the aggregated stats + if not isinstance(last_sample_count, np.ndarray): + last_sample_count *= np.ones(X.shape[1], dtype=np.float) last_sum = last_mean * last_sample_count - new_sum = X.sum(axis=0) + sum_func = np.nansum if ignore_nan else np.sum + new_sum = sum_func(X, axis=0) - new_sample_count = X.shape[0] + new_sample_count = np.count_nonzero(~np.isnan(X), axis=0) + if not isinstance(new_sample_count, np.ndarray): + new_sample_count *= np.ones(X.shape[1], dtype=np.float) # If the input array is 1D updated_sample_count = last_sample_count + new_sample_count updated_mean = (last_sum + new_sum) / updated_sample_count + updated_mean[np.isinf(updated_mean)] = 0 if last_variance is None: updated_variance = None else: - new_unnormalized_variance = X.var(axis=0) * new_sample_count - if last_sample_count == 0: # Avoid division by 0 + var_func = np.nanvar if ignore_nan else np.var + new_unnormalized_variance = var_func(X, axis=0) + # To put zero in places where the np.nanvar returned NaN value + new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0 + new_unnormalized_variance = new_unnormalized_variance * new_sample_count + if (last_sample_count == 0).all(): # Avoid division by 0 updated_unnormalized_variance = new_unnormalized_variance else: + warnings.filterwarnings('ignore') # as division by 0 might happen last_over_new_count = last_sample_count / new_sample_count last_unnormalized_variance = last_variance * last_sample_count updated_unnormalized_variance = ( @@ -711,7 +722,11 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, last_over_new_count / updated_sample_count * (last_sum / last_over_new_count - new_sum) ** 2) updated_variance = updated_unnormalized_variance / updated_sample_count + updated_variance[np.isnan(updated_variance)] = 0 + # return vector only when required + if (updated_sample_count[0] == updated_sample_count).all(): + updated_sample_count = updated_sample_count[0] return updated_mean, updated_variance, updated_sample_count diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index f53b814c70084..01caf3d06714e 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -5,6 +5,8 @@ # License: BSD 3 clause import numpy as np +from numpy.testing import assert_allclose + from scipy import sparse from scipy import linalg from scipy import stats @@ -467,6 +469,31 @@ def naive_log_logistic(x): assert_array_almost_equal(log_logistic(extreme_x), [-100, 0]) +def test_incremental_mean_and_var_nan(): + # Test mean and variance when an array has floating NaN values + A = np.array([[600, 470, 170, 430, np.nan], + [600, np.nan, 170, 430, 300], + [np.nan, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan]]) + X1 = A[:2, :] + X2 = A[2:, :] + X_means = np.nanmean(X1, axis=0) + X_variances = np.nanvar(X1, axis=0) + X_count = np.count_nonzero(~np.isnan(X1), axis=0) + A_means = np.nanmean(A, axis=0) + A_variances = np.nanvar(A, axis=0) + A_count = np.count_nonzero(~np.isnan(A), axis=0) + + final_means, final_variances, final_count = \ + _incremental_mean_and_var(X2, X_means, X_variances, X_count) + assert_allclose(A_means, final_means, equal_nan=True) + print A_variances + print X_variances + print final_variances + 
assert_allclose(A_variances, final_variances, equal_nan=True) + assert_allclose(A_count, final_count, equal_nan=True) + + def test_incremental_variance_update_formulas(): # Test Youngs and Cramer incremental variance formulas. # Doggie data from http://www.mathsisfun.com/data/standard-deviation.html From f7d7381595227e72d187a35a59f5f083325af67f Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Sun, 11 Feb 2018 00:37:42 +0530 Subject: [PATCH 02/23] fixed errors --- sklearn/preprocessing/data.py | 1 + sklearn/utils/extmath.py | 6 ++++-- sklearn/utils/tests/test_extmath.py | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 7ad99e2311c23..a4053aa8042a9 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -627,6 +627,7 @@ def partial_fit(self, X, y=None): # See incr_mean_variance_axis and _incremental_mean_variance_axis if sparse.issparse(X): + # FIXME: remove this check statement X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 4124d002e9356..693dc0c88dbc1 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -696,7 +696,8 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_sample_count = np.count_nonzero(~np.isnan(X), axis=0) if not isinstance(new_sample_count, np.ndarray): - new_sample_count *= np.ones(X.shape[1], dtype=np.float) # If the input array is 1D + # If the input array is 1D + new_sample_count *= np.ones(X.shape[1], dtype=np.float) updated_sample_count = last_sample_count + new_sample_count updated_mean = (last_sum + new_sum) / updated_sample_count @@ -709,7 +710,8 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_unnormalized_variance = var_func(X, axis=0) # To put zero in places where the np.nanvar returned NaN value new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0 - new_unnormalized_variance = new_unnormalized_variance * new_sample_count + new_unnormalized_variance = (new_unnormalized_variance * + new_sample_count) if (last_sample_count == 0).all(): # Avoid division by 0 updated_unnormalized_variance = new_unnormalized_variance else: diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 01caf3d06714e..c9456f4645213 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -472,9 +472,9 @@ def test_incremental_mean_and_var_nan(): # Test mean and variance when an array has floating NaN values A = np.array([[600, 470, 170, 430, np.nan], - [600, np.nan, 170, 430, 300], - [np.nan, np.nan, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, np.nan, np.nan]]) + [600, np.nan, 170, 430, 300], + [np.nan, np.nan, np.nan, np.nan, np.nan], + [np.nan, np.nan, np.nan, np.nan, np.nan]]) X1 = A[:2, :] X2 = A[2:, :] X_means = np.nanmean(X1, axis=0) X_variances = np.nanvar(X1, axis=0) X_count = np.count_nonzero(~np.isnan(X1), axis=0) From e3421e9dcfb0a1b63769f1519a8aa03160ad6332 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Sun, 11 Feb 2018 11:19:33 +0530 Subject: [PATCH 03/23] removed print statements which were added mistakenly --- sklearn/utils/tests/test_extmath.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index c9456f4645213..31b306c6c5305 100644 --- a/sklearn/utils/tests/test_extmath.py +++
b/sklearn/utils/tests/test_extmath.py @@ -470,6 +470,7 @@ def naive_log_logistic(x): def test_incremental_mean_and_var_nan(): + print "Running test_incremental_mean_and_var" # Test mean and variance when an array has floating NaN values A = np.array([[600, 470, 170, 430, np.nan], [600, np.nan, 170, 430, 300], @@ -487,9 +488,6 @@ def test_incremental_mean_and_var_nan(): final_means, final_variances, final_count = \ _incremental_mean_and_var(X2, X_means, X_variances, X_count) assert_allclose(A_means, final_means, equal_nan=True) - print A_variances - print X_variances - print final_variances assert_allclose(A_variances, final_variances, equal_nan=True) assert_allclose(A_count, final_count, equal_nan=True) From f9244019f1d30d5ccf6abfdad7c993723319562f Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Sun, 11 Feb 2018 11:49:23 +0530 Subject: [PATCH 04/23] removed unwanted print statements --- sklearn/utils/tests/test_extmath.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 31b306c6c5305..921aee9983b7a 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -470,7 +470,6 @@ def naive_log_logistic(x): def test_incremental_mean_and_var_nan(): - print "Running test_incremental_mean_and_var" # Test mean and variance when an array has floating NaN values A = np.array([[600, 470, 170, 430, np.nan], [600, np.nan, 170, 430, 300], From c812be9359d8b20fb5682795cb80ebadaf4d22dd Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Sun, 11 Feb 2018 12:32:29 +0530 Subject: [PATCH 05/23] trying to fix the errors --- sklearn/utils/extmath.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 693dc0c88dbc1..b412c179f3ff6 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -693,6 +693,8 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, last_sum = last_mean * last_sample_count sum_func = np.nansum if ignore_nan else np.sum new_sum = sum_func(X, axis=0) + if not isinstance(new_sum, np.ndarray): + new_sum *= np.ones(X.shape[1], dtype=np.float) new_sample_count = np.count_nonzero(~np.isnan(X), axis=0) if not isinstance(new_sample_count, np.ndarray): @@ -708,6 +710,8 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, else: var_func = np.nanvar if ignore_nan else np.var new_unnormalized_variance = var_func(X, axis=0) + if not isinstance(new_unnormalized_variance, np.ndarray): + new_unnormalized_variance *= np.ones(X.shape[1], dtype=np.float) # To put zero in places where the np.nanvar returned NaN value new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0 new_unnormalized_variance = (new_unnormalized_variance * From 4758bc9b7e54d724ec2063f305ab53bfd70fd932 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Feb 2018 11:38:58 +0530 Subject: [PATCH 06/23] added test cases where there is a chance to get a 1D matrix --- sklearn/utils/tests/test_extmath.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 921aee9983b7a..2239c5dfbdcfe 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -475,11 +475,11 @@ def test_incremental_mean_and_var_nan(): [600, np.nan, 170, 430, 300], [np.nan, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, np.nan]]) - X1 = A[:2, :] - X2 = A[2:, :] - X_means 
= np.nanmean(X1, axis=0) - X_variances = np.nanvar(X1, axis=0) - X_count = np.count_nonzero(~np.isnan(X1), axis=0) + X1 = A[:3, :] + X2 = A[3:, :] + X_means, X_variances, X_count = \ + _incremental_mean_and_var(X1, [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]) A_means = np.nanmean(A, axis=0) A_variances = np.nanvar(A, axis=0) A_count = np.count_nonzero(~np.isnan(A), axis=0) From df954accd84c70c5d680c189e0766d7e92dc7a87 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Feb 2018 12:06:43 +0530 Subject: [PATCH 07/23] check if there is a green tick when cases involving NaN are commented out --- sklearn/utils/tests/test_extmath.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 2239c5dfbdcfe..fd7bfee1764fa 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -469,6 +469,7 @@ def naive_log_logistic(x): assert_array_almost_equal(log_logistic(extreme_x), [-100, 0]) +''' def test_incremental_mean_and_var_nan(): # Test mean and variance when an array has floating NaN values A = np.array([[600, 470, 170, 430, np.nan], [600, np.nan, 170, 430, 300], [np.nan, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, np.nan]]) @@ -489,6 +490,7 @@ def test_incremental_mean_and_var_nan(): assert_allclose(A_means, final_means, equal_nan=True) assert_allclose(A_variances, final_variances, equal_nan=True) assert_allclose(A_count, final_count, equal_nan=True) +''' def test_incremental_variance_update_formulas(): From 75d221bcf9258338fdf4789d32d2d9d16801d1b4 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Feb 2018 18:30:48 +0530 Subject: [PATCH 08/23] removing np.count_nonzero() from the code --- sklearn/utils/extmath.py | 2 +- sklearn/utils/tests/test_extmath.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index b412c179f3ff6..7eb59f55d5f37 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -696,7 +696,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, if not isinstance(new_sum, np.ndarray): new_sum *= np.ones(X.shape[1], dtype=np.float) - new_sample_count = np.count_nonzero(~np.isnan(X), axis=0) + new_sample_count = np.sum(~np.isnan(X), axis=0) if not isinstance(new_sample_count, np.ndarray): # If the input array is 1D new_sample_count *= np.ones(X.shape[1], dtype=np.float) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index fd7bfee1764fa..2239c5dfbdcfe 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -469,7 +469,6 @@ def naive_log_logistic(x): assert_array_almost_equal(log_logistic(extreme_x), [-100, 0]) -''' def test_incremental_mean_and_var_nan(): # Test mean and variance when an array has floating NaN values @@ -490,7 +489,6 @@ def test_incremental_mean_and_var_nan(): assert_allclose(A_means, final_means, equal_nan=True) assert_allclose(A_variances, final_variances, equal_nan=True) assert_allclose(A_count, final_count, equal_nan=True) -''' def test_incremental_variance_update_formulas(): From a3bb041668f68eb4b860b0f2636c6b390ef07116 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Feb 2018 19:27:18 +0530 Subject: [PATCH 09/23] removed np.count_nonzero() in test cases --- sklearn/utils/tests/test_extmath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 2239c5dfbdcfe..f62b4c6b5e11d 100644
--- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -482,7 +482,7 @@ def test_incremental_mean_and_var_nan(): [0, 0, 0, 0, 0]) A_means = np.nanmean(A, axis=0) A_variances = np.nanvar(A, axis=0) - A_count = np.count_nonzero(~np.isnan(A), axis=0) + A_count = [2, 1, 2, 2, 1] final_means, final_variances, final_count = \ _incremental_mean_and_var(X2, X_means, X_variances, X_count) From 8db4311952c75292df850d013adf5f899bfcafac Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Feb 2018 19:55:46 +0530 Subject: [PATCH 10/23] resolved some errors --- sklearn/utils/tests/test_extmath.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index f62b4c6b5e11d..6c34d0340f11c 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -5,7 +5,6 @@ # License: BSD 3 clause import numpy as np -from numpy.testing import assert_allclose from scipy import sparse from scipy import linalg @@ -486,9 +485,9 @@ def test_incremental_mean_and_var_nan(): final_means, final_variances, final_count = \ _incremental_mean_and_var(X2, X_means, X_variances, X_count) - assert_allclose(A_means, final_means, equal_nan=True) - assert_allclose(A_variances, final_variances, equal_nan=True) - assert_allclose(A_count, final_count, equal_nan=True) + assert_almost_equal(A_means, final_means, equal_nan=True) + assert_almost_equal(A_variances, final_variances, equal_nan=True) + assert_almost_equal(A_count, final_count, equal_nan=True) def test_incremental_variance_update_formulas(): From 1f115f16ad79d42e99b7f2874b2a4b00e3f45d0b Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Feb 2018 20:18:08 +0530 Subject: [PATCH 11/23] remove errors --- sklearn/utils/tests/test_extmath.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 6c34d0340f11c..609071eeef5f3 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -485,9 +485,9 @@ def test_incremental_mean_and_var_nan(): final_means, final_variances, final_count = \ _incremental_mean_and_var(X2, X_means, X_variances, X_count) - assert_almost_equal(A_means, final_means, equal_nan=True) - assert_almost_equal(A_variances, final_variances, equal_nan=True) - assert_almost_equal(A_count, final_count, equal_nan=True) + assert_almost_equal(A_means, final_means) + assert_almost_equal(A_variances, final_variances) + assert_almost_equal(A_count, final_count) def test_incremental_variance_update_formulas(): From 566ad152dbf1614db10cc3f108c5a3f24defaa1f Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Feb 2018 21:10:36 +0530 Subject: [PATCH 12/23] resolving errors --- sklearn/utils/extmath.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 7eb59f55d5f37..7abbee24a6097 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -704,6 +704,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, updated_mean = (last_sum + new_sum) / updated_sample_count updated_mean[np.isinf(updated_mean)] = 0 + updated_mean[np.isnan(updated_mean)] = 0 if last_variance is None: updated_variance = None @@ -729,6 +730,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, (last_sum / last_over_new_count - new_sum) ** 2) updated_variance = updated_unnormalized_variance / 
updated_sample_count updated_variance[np.isnan(updated_variance)] = 0 + updated_variance[np.isinf(updated_variance)] = 0 # return vector only when required if (updated_sample_count[0] == updated_sample_count).all(): From 499133db9857b3d2605550bb088d345f88108c43 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Feb 2018 21:12:21 +0530 Subject: [PATCH 13/23] removing errors --- sklearn/utils/extmath.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 7abbee24a6097..ab63e4f0e210f 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -695,6 +695,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_sum = sum_func(X, axis=0) if not isinstance(new_sum, np.ndarray): new_sum *= np.ones(X.shape[1], dtype=np.float) + new_sum[np.isnan(new_sum)] = 0 new_sample_count = np.sum(~np.isnan(X), axis=0) if not isinstance(new_sample_count, np.ndarray): From 032b058a11b098aca078fc73522280479c9310a0 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Feb 2018 22:59:58 +0530 Subject: [PATCH 14/23] removed errors --- sklearn/utils/extmath.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index ab63e4f0e210f..e170997a0e351 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -694,13 +694,13 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, sum_func = np.nansum if ignore_nan else np.sum new_sum = sum_func(X, axis=0) if not isinstance(new_sum, np.ndarray): - new_sum *= np.ones(X.shape[1], dtype=np.float) + new_sum *= np.ones(X.shape[1]) new_sum[np.isnan(new_sum)] = 0 new_sample_count = np.sum(~np.isnan(X), axis=0) if not isinstance(new_sample_count, np.ndarray): # If the input array is 1D - new_sample_count *= np.ones(X.shape[1], dtype=np.float) + new_sample_count *= np.ones(X.shape[1]) updated_sample_count = last_sample_count + new_sample_count updated_mean = (last_sum + new_sum) / updated_sample_count @@ -713,7 +713,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, var_func = np.nanvar if ignore_nan else np.var new_unnormalized_variance = var_func(X, axis=0) if not isinstance(new_unnormalized_variance, np.ndarray): - new_unnormalized_variance *= np.ones(X.shape[1], dtype=np.float) + new_unnormalized_variance *= np.ones(X.shape[1]) # To put zero in places where the np.nanvar returned NaN value new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0 new_unnormalized_variance = (new_unnormalized_variance * From e6c05214b1b601e4f8c5e3b4462296576408b69f Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Tue, 13 Feb 2018 15:06:11 +0530 Subject: [PATCH 15/23] removed errors and modified test cases --- sklearn/utils/extmath.py | 9 +++++---- sklearn/utils/tests/test_extmath.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index e170997a0e351..0739d5d8cc3c6 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -689,18 +689,19 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, # new = the current increment # updated = the aggregated stats if not isinstance(last_sample_count, np.ndarray): - last_sample_count *= np.ones(X.shape[1], dtype=np.float) + # when the index is -1 it will pick the last value in shape array + last_sample_count *= np.ones(X.shape[-1]) last_sum = last_mean * last_sample_count sum_func = np.nansum if ignore_nan 
else np.sum new_sum = sum_func(X, axis=0) if not isinstance(new_sum, np.ndarray): - new_sum *= np.ones(X.shape[1]) + new_sum *= np.ones(X.shape[-1]) new_sum[np.isnan(new_sum)] = 0 new_sample_count = np.sum(~np.isnan(X), axis=0) if not isinstance(new_sample_count, np.ndarray): # If the input array is 1D - new_sample_count *= np.ones(X.shape[1]) + new_sample_count *= np.ones(X.shape[-1]) updated_sample_count = last_sample_count + new_sample_count updated_mean = (last_sum + new_sum) / updated_sample_count @@ -713,7 +714,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, var_func = np.nanvar if ignore_nan else np.var new_unnormalized_variance = var_func(X, axis=0) if not isinstance(new_unnormalized_variance, np.ndarray): - new_unnormalized_variance *= np.ones(X.shape[1]) + new_unnormalized_variance *= np.ones(X.shape[-1]) # To put zero in places where the np.nanvar returned NaN value new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0 new_unnormalized_variance = (new_unnormalized_variance * diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 609071eeef5f3..48bb91a53e2c3 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -475,7 +475,7 @@ def test_incremental_mean_and_var_nan(): [np.nan, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, np.nan]]) X1 = A[:3, :] - X2 = A[3:, :] + X2 = np.array([np.nan, np.nan, np.nan, np.nan, np.nan]) X_means, X_variances, X_count = \ _incremental_mean_and_var(X1, [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]) From 7810d6e204b91a1631d0cac2035f2f686631373e Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Thu, 15 Feb 2018 21:07:23 +0530 Subject: [PATCH 16/23] changes in the code and removed cases for 1D matrix --- sklearn/utils/extmath.py | 10 +--------- sklearn/utils/tests/test_extmath.py | 2 +- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 0739d5d8cc3c6..26f83dfd74384 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -694,16 +694,12 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, last_sum = last_mean * last_sample_count sum_func = np.nansum if ignore_nan else np.sum new_sum = sum_func(X, axis=0) - if not isinstance(new_sum, np.ndarray): - new_sum *= np.ones(X.shape[-1]) new_sum[np.isnan(new_sum)] = 0 new_sample_count = np.sum(~np.isnan(X), axis=0) - if not isinstance(new_sample_count, np.ndarray): - # If the input array is 1D - new_sample_count *= np.ones(X.shape[-1]) updated_sample_count = last_sample_count + new_sample_count + warnings.filterwarnings('ignore') # as division by 0 might happen updated_mean = (last_sum + new_sum) / updated_sample_count updated_mean[np.isinf(updated_mean)] = 0 updated_mean[np.isnan(updated_mean)] = 0 @@ -713,16 +709,12 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, else: var_func = np.nanvar if ignore_nan else np.var new_unnormalized_variance = var_func(X, axis=0) - if not isinstance(new_unnormalized_variance, np.ndarray): - new_unnormalized_variance *= np.ones(X.shape[-1]) - # To put zero in places where the np.nanvar returned NaN value new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0 new_unnormalized_variance = (new_unnormalized_variance * new_sample_count) if (last_sample_count == 0).all(): # Avoid division by 0 updated_unnormalized_variance = new_unnormalized_variance else: - warnings.filterwarnings('ignore') # as division 
by 0 might happen last_over_new_count = last_sample_count / new_sample_count last_unnormalized_variance = last_variance * last_sample_count updated_unnormalized_variance = ( diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 48bb91a53e2c3..609071eeef5f3 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -475,7 +475,7 @@ def test_incremental_mean_and_var_nan(): [np.nan, np.nan, np.nan, np.nan, np.nan], [np.nan, np.nan, np.nan, np.nan, np.nan]]) X1 = A[:3, :] - X2 = np.array([np.nan, np.nan, np.nan, np.nan, np.nan]) + X2 = A[3:, :] X_means, X_variances, X_count = \ _incremental_mean_and_var(X1, [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]) From be05d167ffec76c8d2a93d54978f93943747fbc0 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Thu, 15 Feb 2018 22:10:56 +0530 Subject: [PATCH 17/23] removed pep8 errors --- sklearn/utils/extmath.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 26f83dfd74384..32492cd04192f 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -699,7 +699,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_sample_count = np.sum(~np.isnan(X), axis=0) updated_sample_count = last_sample_count + new_sample_count - warnings.filterwarnings('ignore') # as division by 0 might happen + warnings.filterwarnings('ignore') # as division by 0 might happen updated_mean = (last_sum + new_sum) / updated_sample_count updated_mean[np.isinf(updated_mean)] = 0 updated_mean[np.isnan(updated_mean)] = 0 @@ -723,6 +723,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, last_over_new_count / updated_sample_count * (last_sum / last_over_new_count - new_sum) ** 2) updated_variance = updated_unnormalized_variance / updated_sample_count + # As division by Zero might happen updated_variance[np.isnan(updated_variance)] = 0 updated_variance[np.isinf(updated_variance)] = 0 From 87667ab52de51512cda2d459e8573013eda5e168 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Thu, 15 Feb 2018 23:21:38 +0530 Subject: [PATCH 18/23] removed if condition at line +722 of `extmath.py` and changed some other parts of the code --- sklearn/utils/extmath.py | 25 +++++++++++++------------ sklearn/utils/tests/test_extmath.py | 12 ++++++------ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 32492cd04192f..e9d6989f00ce5 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -710,18 +710,19 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, var_func = np.nanvar if ignore_nan else np.var new_unnormalized_variance = var_func(X, axis=0) new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0 - new_unnormalized_variance = (new_unnormalized_variance * - new_sample_count) - if (last_sample_count == 0).all(): # Avoid division by 0 - updated_unnormalized_variance = new_unnormalized_variance - else: - last_over_new_count = last_sample_count / new_sample_count - last_unnormalized_variance = last_variance * last_sample_count - updated_unnormalized_variance = ( - last_unnormalized_variance + - new_unnormalized_variance + - last_over_new_count / updated_sample_count * - (last_sum / last_over_new_count - new_sum) ** 2) + new_unnormalized_variance *= new_sample_count + last_over_new_count = last_sample_count / new_sample_count + last_unnormalized_variance = last_variance * 
last_sample_count + updated_unnormalized_variance = ( + last_over_new_count / updated_sample_count * + (last_sum / last_over_new_count - new_sum) ** 2) + # updated_unnormalized_variance can be both NaN or Inf + updated_unnormalized_variance[np.isnan( + updated_unnormalized_variance)] = 0 + updated_unnormalized_variance[np.isinf( + updated_unnormalized_variance)] = 0 + updated_unnormalized_variance += (last_unnormalized_variance + + new_unnormalized_variance) updated_variance = updated_unnormalized_variance / updated_sample_count # As division by Zero might happen updated_variance[np.isnan(updated_variance)] = 0 diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 609071eeef5f3..4d2f0d4858745 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -476,15 +476,15 @@ def test_incremental_mean_and_var_nan(): [np.nan, np.nan, np.nan, np.nan, np.nan]]) X1 = A[:3, :] X2 = A[3:, :] - X_means, X_variances, X_count = \ - _incremental_mean_and_var(X1, [0, 0, 0, 0, 0], [0, 0, 0, 0, 0], - [0, 0, 0, 0, 0]) + X_means, X_variances, X_count = _incremental_mean_and_var( + X1, np.array([0, 0, 0, 0, 0]), np.array([0, 0, 0, 0, 0]), + np.array([0, 0, 0, 0, 0])) A_means = np.nanmean(A, axis=0) A_variances = np.nanvar(A, axis=0) - A_count = [2, 1, 2, 2, 1] + A_count = np.array([2, 1, 2, 2, 1]) - final_means, final_variances, final_count = \ - _incremental_mean_and_var(X2, X_means, X_variances, X_count) + final_means, final_variances, final_count = _incremental_mean_and_var( + X2, X_means, X_variances, X_count) assert_almost_equal(A_means, final_means) assert_almost_equal(A_variances, final_variances) assert_almost_equal(A_count, final_count) From 8e41081cbce4aff4d37390d35be34df8a2e62552 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Fri, 16 Feb 2018 01:40:07 +0530 Subject: [PATCH 19/23] made `last_samples_seen` and `updated_sample_seen` array --- sklearn/decomposition/incremental_pca.py | 7 ++++--- sklearn/preprocessing/data.py | 5 +++-- sklearn/preprocessing/tests/test_data.py | 14 ++++++++------ sklearn/utils/extmath.py | 10 ++-------- sklearn/utils/tests/test_extmath.py | 8 +++++--- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sklearn/decomposition/incremental_pca.py b/sklearn/decomposition/incremental_pca.py index 13e51090dd82e..39ed37293bb2d 100644 --- a/sklearn/decomposition/incremental_pca.py +++ b/sklearn/decomposition/incremental_pca.py @@ -243,9 +243,10 @@ def partial_fit(self, X, y=None, check_input=True): # Update stats - they are 0 if this is the fisrt step col_mean, col_var, n_total_samples = \ - _incremental_mean_and_var(X, last_mean=self.mean_, - last_variance=self.var_, - last_sample_count=self.n_samples_seen_) + _incremental_mean_and_var( + X, last_mean=self.mean_, last_variance=self.var_, + last_sample_count=self.n_samples_seen_ * np.ones(n_features)) + n_total_samples = n_total_samples[0] # Whitening if self.n_samples_seen_ == 0: diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index a4053aa8042a9..5bc876ba2f9d6 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -662,8 +662,9 @@ def partial_fit(self, X, y=None): self.var_ = None self.mean_, self.var_, self.n_samples_seen_ = \ - _incremental_mean_and_var(X, self.mean_, self.var_, - self.n_samples_seen_) + _incremental_mean_and_var( + X, self.mean_, self.var_, + self.n_samples_seen_ * np.ones(X.shape[1])) if self.with_std: self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_)) 
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index f4c3d3d571772..5ce86d2927b0e 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -203,7 +203,7 @@ def test_standard_scaler_1d(): np.zeros_like(n_features)) assert_array_almost_equal(X_scaled.mean(axis=0), .0) assert_array_almost_equal(X_scaled.std(axis=0), 1.) - assert_equal(scaler.n_samples_seen_, X.shape[0]) + assert_almost_equal(scaler.n_samples_seen_, X.shape[0]) # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) @@ -283,7 +283,7 @@ def test_scaler_2d_arrays(): scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) - assert_equal(scaler.n_samples_seen_, n_samples) + assert_almost_equal(scaler.n_samples_seen_, n_samples) assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) @@ -399,7 +399,8 @@ def test_standard_scaler_partial_fit(): assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_) assert_equal(scaler_batch.var_, scaler_incr.var_) # Nones - assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_) + assert_array_almost_equal( + scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_) # Test std after 1 step batch0 = slice(0, chunk_size) @@ -423,10 +424,11 @@ def test_standard_scaler_partial_fit(): assert_correct_incr(i, batch_start=batch.start, batch_stop=batch.stop, n=n, chunk_size=chunk_size, - n_samples_seen=scaler_incr.n_samples_seen_) + n_samples_seen=scaler_incr.n_samples_seen_[0]) assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_) - assert_equal(scaler_batch.n_samples_seen_, scaler_incr.n_samples_seen_) + assert_array_almost_equal(scaler_batch.n_samples_seen_, + scaler_incr.n_samples_seen_) def test_standard_scaler_partial_fit_numerical_stability(): @@ -515,7 +517,7 @@ def test_standard_scaler_trasform_with_partial_fit(): assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal assert_array_less(zero, scaler_incr.scale_ + epsilon) # (i+1) because the Scaler has been already fitted - assert_equal((i + 1), scaler_incr.n_samples_seen_) + assert_almost_equal((i + 1), scaler_incr.n_samples_seen_) def test_min_max_scaler_iris(): diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index e9d6989f00ce5..1cadc849850d8 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -664,7 +664,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, last_variance : array-like, shape: (n_features,) - last_sample_count : int + last_sample_count : array-like, shape: (n_features,) Returns ------- @@ -673,7 +673,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, updated_variance : array, shape (n_features,) If None, only mean is computed - updated_sample_count : int + updated_sample_count : array shape (n_features,) References ---------- @@ -688,9 +688,6 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, # old = stats until now # new = the current increment # updated = the aggregated stats - if not isinstance(last_sample_count, np.ndarray): - # when the index is -1 it will pick the last value in shape array - last_sample_count *= np.ones(X.shape[-1]) last_sum = last_mean * last_sample_count sum_func = np.nansum if ignore_nan else np.sum new_sum = sum_func(X, axis=0) @@ -728,9 +725,6 @@ def _incremental_mean_and_var(X, last_mean=.0, 
last_variance=None, updated_variance[np.isnan(updated_variance)] = 0 updated_variance[np.isinf(updated_variance)] = 0 - # return vector only when required - if (updated_sample_count[0] == updated_sample_count).all(): - updated_sample_count = updated_sample_count[0] return updated_mean, updated_variance, updated_sample_count diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 4d2f0d4858745..32585d58efa75 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -506,7 +506,7 @@ def test_incremental_variance_update_formulas(): old_sample_count = X1.shape[0] final_means, final_variances, final_count = \ _incremental_mean_and_var(X2, old_means, old_variances, - old_sample_count) + old_sample_count * np.ones(X2.shape[1])) assert_almost_equal(final_means, A.mean(axis=0), 6) assert_almost_equal(final_variances, A.var(axis=0), 6) assert_almost_equal(final_count, A.shape[0]) @@ -585,7 +585,8 @@ def naive_mean_variance_update(x, last_mean, last_variance, for i in range(A1.shape[0]): mean, var, n = \ _incremental_mean_and_var(A1[i, :].reshape((1, A1.shape[1])), - mean, var, n) + mean, var, n * np.ones(A1.shape[1])) + n = n[0] assert_equal(n, A.shape[0]) assert_array_almost_equal(A.mean(axis=0), mean) assert_greater(tol, np.abs(stable_var(A) - var).max()) @@ -612,9 +613,10 @@ def test_incremental_variance_ddof(): else: result = _incremental_mean_and_var( batch, incremental_means, incremental_variances, - sample_count) + sample_count * np.ones(batch.shape[1])) (incremental_means, incremental_variances, incremental_count) = result + incremental_count = incremental_count[0] sample_count += batch.shape[0] calculated_means = np.mean(X[:j], axis=0) From adced1df8bf6597feb386c7d60b2841cad90868a Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Sun, 18 Feb 2018 14:30:14 +0530 Subject: [PATCH 20/23] modified csr_matrix and csc_matrix to be able to handle NaN values --- sklearn/preprocessing/data.py | 6 ++- sklearn/utils/estimator_checks.py | 3 ++ sklearn/utils/sparsefuncs_fast.pyx | 65 +++++++++++++++++++++++------- 3 files changed, 58 insertions(+), 16 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index bbd2fae10c0ec..a43bfba73622d 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -619,7 +619,8 @@ def partial_fit(self, X, y=None): Ignored """ X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, - warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES) + warn_on_dtype=True, estimator=self, + force_all_finite='allow-nan', dtype=FLOAT_DTYPES) # Even in the case of `with_mean=False`, we update the mean anyway # This is needed for the incremental computation of the var @@ -646,6 +647,9 @@ def partial_fit(self, X, y=None): self.mean_ = None self.var_ = None else: + X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy, + warn_on_dtype=True, estimator=self, + dtype=FLOAT_DTYPES) # First pass if not hasattr(self, 'n_samples_seen_'): self.mean_ = .0 diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index b079c37f7bea2..9add53f720b46 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -70,6 +70,7 @@ 'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression', 'RANSACRegressor', 'RadiusNeighborsRegressor', 'RandomForestRegressor', 'Ridge', 'RidgeCV'] +ALLOW_NAN = ['StandardScaler'] def _yield_non_meta_checks(name, estimator): @@ -1024,6 +1025,8 @@ def 
check_estimators_nan_inf(name, estimator_orig): error_string_transform = ("Estimator doesn't check for NaN and inf in" " transform.") for X_train in [X_train_nan, X_train_inf]: + if np.any(np.isnan(X_train)) and name in ALLOW_NAN: + continue # catch deprecation warnings with ignore_warnings(category=(DeprecationWarning, FutureWarning)): estimator = clone(estimator_orig) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index 481f2137fab77..a3fd379432d14 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -15,6 +15,8 @@ import numpy as np import scipy.sparse as sp cimport cython from cython cimport floating +from numpy.math cimport isnan +import warnings np.import_array() @@ -54,7 +56,7 @@ def _csr_row_norms(np.ndarray[floating, ndim=1, mode="c"] X_data, return norms -def csr_mean_variance_axis0(X): +def csr_mean_variance_axis0(X, ignore_nan=True): """Compute mean and variance along axis 0 on a CSR matrix Parameters @@ -74,12 +76,13 @@ def csr_mean_variance_axis0(X): """ if X.dtype != np.float32: X = X.astype(np.float64) - return _csr_mean_variance_axis0(X.data, X.shape, X.indices) + return _csr_mean_variance_axis0(X.data, X.shape, X.indices, ignore_nan) def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, shape, - np.ndarray[int, ndim=1] X_indices): + np.ndarray[int, ndim=1] X_indices, + ignore_nan=True): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef unsigned int n_samples = shape[0] @@ -94,6 +97,8 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, cdef np.ndarray[floating, ndim=1] means # variances[j] contains the variance of feature j cdef np.ndarray[floating, ndim=1] variances + # n_samples_feat[j] contains the n_samples of feature j + cdef np.ndarray[floating, ndim=1] n_samples_feat if floating is float: dtype = np.float32 @@ -102,6 +107,7 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, means = np.zeros(n_features, dtype=dtype) variances = np.zeros_like(means, dtype=dtype) + n_samples_feat = np.ones(n_features, dtype=dtype) * n_samples # counts[j] contains the number of samples where feature j is non-zero cdef np.ndarray[int, ndim=1] counts = np.zeros(n_features, @@ -109,24 +115,40 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, for i in xrange(non_zero): col_ind = X_indices[i] - means[col_ind] += X_data[i] + x_i = X_data[i] + if ignore_nan and isnan(x_i): + n_samples_feat[col_ind] -= 1 + continue + means[col_ind] += x_i - means /= n_samples + with warnings.catch_warnings(): + # as division by 0 might happen + warnings.simplefilter('ignore') + means /= n_samples_feat + means[np.isnan(means)] = 0 + means[np.isinf(means)] = 0 for i in xrange(non_zero): col_ind = X_indices[i] - diff = X_data[i] - means[col_ind] + x_i = X_data[i] + if ignore_nan and isnan(x_i): + continue + diff = x_i - means[col_ind] variances[col_ind] += diff * diff counts[col_ind] += 1 - for i in xrange(n_features): - variances[i] += (n_samples - counts[i]) * means[i] ** 2 - variances[i] /= n_samples + variances += (n_samples_feat - counts) * means ** 2 + with warnings.catch_warnings(): + # as division by 0 might happen + warnings.simplefilter('ignore') + variances /= n_samples_feat + variances[np.isnan(variances)] = 0 + variances[np.isinf(variances)] = 0 return means, variances -def csc_mean_variance_axis0(X): +def 
csc_mean_variance_axis0(X, ignore_nan=True): """Compute mean and variance along axis 0 on a CSC matrix Parameters @@ -146,13 +168,14 @@ def csc_mean_variance_axis0(X): """ if X.dtype != np.float32: X = X.astype(np.float64) - return _csc_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr) + return _csc_mean_variance_axis0(X.data, X.shape, X.indices, X.indptr, + ignore_nan) def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, shape, np.ndarray[int, ndim=1] X_indices, - np.ndarray[int, ndim=1] X_indptr): + np.ndarray[int, ndim=1] X_indptr, ignore_nan=True): # Implement the function here since variables using fused types # cannot be declared directly and can only be passed as function arguments cdef unsigned int n_samples = shape[0] @@ -164,6 +187,7 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, cdef unsigned int startptr cdef unsigned int endptr cdef floating diff + cdef floating n_samples_feat # means[j] contains the mean of feature j cdef np.ndarray[floating, ndim=1] means @@ -182,17 +206,28 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, startptr = X_indptr[i] endptr = X_indptr[i + 1] counts = endptr - startptr + n_samples_feat = n_samples for j in xrange(startptr, endptr): - means[i] += X_data[j] - means[i] /= n_samples + x_i = X_data[j] + if ignore_nan and isnan(x_i): + n_samples_feat -= 1 + continue + means[i] += x_i + if n_samples_feat != 0: + means[i] /= n_samples + else: + means[i] = 0 for j in xrange(startptr, endptr): diff = X_data[j] - means[i] variances[i] += diff * diff variances[i] += (n_samples - counts) * means[i] * means[i] - variances[i] /= n_samples + if n_samples_feat != 0: + variances[i] /= n_samples + else: + variances[i] = 0 return means, variances From a0e25e9cea12d1c245f6d9ded5f5506673770701 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Sun, 11 Mar 2018 14:31:21 +0530 Subject: [PATCH 21/23] made changes to optimize the code --- sklearn/preprocessing/tests/test_data.py | 6 ++--- sklearn/utils/extmath.py | 31 ++++++++++++------------ sklearn/utils/tests/test_extmath.py | 21 +++++++++------- 3 files changed, 30 insertions(+), 28 deletions(-) diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py index 5ce86d2927b0e..2b6d3c3d16890 100644 --- a/sklearn/preprocessing/tests/test_data.py +++ b/sklearn/preprocessing/tests/test_data.py @@ -203,7 +203,7 @@ def test_standard_scaler_1d(): np.zeros_like(n_features)) assert_array_almost_equal(X_scaled.mean(axis=0), .0) assert_array_almost_equal(X_scaled.std(axis=0), 1.) 
- assert_almost_equal(scaler.n_samples_seen_, X.shape[0]) + assert_array_equal(scaler.n_samples_seen_, X.shape[0]) # check inverse transform X_scaled_back = scaler.inverse_transform(X_scaled) @@ -283,7 +283,7 @@ def test_scaler_2d_arrays(): scaler = StandardScaler() X_scaled = scaler.fit(X).transform(X, copy=True) assert_false(np.any(np.isnan(X_scaled))) - assert_almost_equal(scaler.n_samples_seen_, n_samples) + assert_array_equal(scaler.n_samples_seen_, n_samples) assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0]) assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.]) @@ -517,7 +517,7 @@ def test_standard_scaler_trasform_with_partial_fit(): assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal assert_array_less(zero, scaler_incr.scale_ + epsilon) # (i+1) because the Scaler has been already fitted - assert_almost_equal((i + 1), scaler_incr.n_samples_seen_) + assert_array_equal((i + 1), scaler_incr.n_samples_seen_) def test_min_max_scaler_iris(): diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 1cadc849850d8..07525087967e9 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -643,7 +643,7 @@ def make_nonnegative(X, min_value=0): def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, - last_sample_count=0, ignore_nan=True): + last_sample_count=0, ignore_nan=False): """Calculate mean update and a Youngs and Cramer variance update. last_mean and last_variance are statistics computed at the last step by the @@ -696,34 +696,33 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_sample_count = np.sum(~np.isnan(X), axis=0) updated_sample_count = last_sample_count + new_sample_count - warnings.filterwarnings('ignore') # as division by 0 might happen - updated_mean = (last_sum + new_sum) / updated_sample_count - updated_mean[np.isinf(updated_mean)] = 0 - updated_mean[np.isnan(updated_mean)] = 0 + with np.errstate(divide ="ignore"): # as division by 0 might happen + updated_mean = (last_sum + new_sum) / updated_sample_count + updated_mean[np.logical_not(updated_sample_count)] = 0 if last_variance is None: updated_variance = None else: var_func = np.nanvar if ignore_nan else np.var new_unnormalized_variance = var_func(X, axis=0) - new_unnormalized_variance[np.isnan(new_unnormalized_variance)] = 0 + new_unnormalized_variance[~np.isfinite(new_unnormalized_variance)] = 0 new_unnormalized_variance *= new_sample_count - last_over_new_count = last_sample_count / new_sample_count last_unnormalized_variance = last_variance * last_sample_count - updated_unnormalized_variance = ( - last_over_new_count / updated_sample_count * - (last_sum / last_over_new_count - new_sum) ** 2) + with np.errstate(divide = "ignore"): + last_over_new_count = last_sample_count / new_sample_count + updated_unnormalized_variance = ( + last_over_new_count / updated_sample_count * + (last_sum / last_over_new_count - new_sum) ** 2) # updated_unnormalized_variance can be both NaN or Inf - updated_unnormalized_variance[np.isnan( - updated_unnormalized_variance)] = 0 - updated_unnormalized_variance[np.isinf( + updated_unnormalized_variance[~np.isfinite( updated_unnormalized_variance)] = 0 updated_unnormalized_variance += (last_unnormalized_variance + new_unnormalized_variance) - updated_variance = updated_unnormalized_variance / updated_sample_count + + with np.errstate(divide = "ignore"): + updated_variance = updated_unnormalized_variance / updated_sample_count # As division by Zero might happen - 
updated_variance[np.isnan(updated_variance)] = 0 - updated_variance[np.isinf(updated_variance)] = 0 + updated_variance[np.logical_not(updated_sample_count)] = 0 return updated_mean, updated_variance, updated_sample_count diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index 32585d58efa75..96d4fd9f6b36c 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -470,21 +470,24 @@ def naive_log_logistic(x): def test_incremental_mean_and_var_nan(): # Test mean and variance when an array has floating NaN values - A = np.array([[600, 470, 170, 430, np.nan], - [600, np.nan, 170, 430, 300], - [np.nan, np.nan, np.nan, np.nan, np.nan], - [np.nan, np.nan, np.nan, np.nan, np.nan]]) - X1 = A[:3, :] - X2 = A[3:, :] + + X1 = np.array([[600, 470, 170, 430, np.nan], + [600, np.nan, 170, 430, 300], + [np.nan, np.nan, np.nan, np.nan, np.nan], + [600, 470, 170, 430, 300]]) + + X2 = np.array([[np.nan, np.nan, np.nan, np.nan, np.nan], + [600, 470, 170, 430, 300]]) + A = np.concatenate((X1, X2,), axis=0) X_means, X_variances, X_count = _incremental_mean_and_var( X1, np.array([0, 0, 0, 0, 0]), np.array([0, 0, 0, 0, 0]), - np.array([0, 0, 0, 0, 0])) + np.array([0, 0, 0, 0, 0]), ignore_nan=True) A_means = np.nanmean(A, axis=0) A_variances = np.nanvar(A, axis=0) - A_count = np.array([2, 1, 2, 2, 1]) + A_count = np.array([4, 3, 4, 4, 3]) final_means, final_variances, final_count = _incremental_mean_and_var( - X2, X_means, X_variances, X_count) + X2, X_means, X_variances, X_count, ignore_nan=True) assert_almost_equal(A_means, final_means) assert_almost_equal(A_variances, final_variances) assert_almost_equal(A_count, final_count) From e5246311b33a069f5d1f9e3b457b2051250ebd66 Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Sun, 11 Mar 2018 14:35:29 +0530 Subject: [PATCH 22/23] remove pep8 errors --- sklearn/utils/extmath.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/utils/extmath.py b/sklearn/utils/extmath.py index 07525087967e9..7b96e4d0a1f10 100644 --- a/sklearn/utils/extmath.py +++ b/sklearn/utils/extmath.py @@ -696,7 +696,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_sample_count = np.sum(~np.isnan(X), axis=0) updated_sample_count = last_sample_count + new_sample_count - with np.errstate(divide ="ignore"): # as division by 0 might happen + with np.errstate(divide="ignore"): # as division by 0 might happen updated_mean = (last_sum + new_sum) / updated_sample_count updated_mean[np.logical_not(updated_sample_count)] = 0 @@ -708,7 +708,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, new_unnormalized_variance[~np.isfinite(new_unnormalized_variance)] = 0 new_unnormalized_variance *= new_sample_count last_unnormalized_variance = last_variance * last_sample_count - with np.errstate(divide = "ignore"): + with np.errstate(divide="ignore"): last_over_new_count = last_sample_count / new_sample_count updated_unnormalized_variance = ( last_over_new_count / updated_sample_count * @@ -719,8 +719,9 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None, updated_unnormalized_variance += (last_unnormalized_variance + new_unnormalized_variance) - with np.errstate(divide = "ignore"): - updated_variance = updated_unnormalized_variance / updated_sample_count + with np.errstate(divide="ignore"): + updated_variance = (updated_unnormalized_variance / + updated_sample_count) # As division by Zero might happen 
updated_variance[np.logical_not(updated_sample_count)] = 0 From dc151a67e94cc613e638a06c9a2833dbc80fbe5a Mon Sep 17 00:00:00 2001 From: Pinaki Nath Chowdhury Date: Mon, 12 Mar 2018 01:10:16 +0530 Subject: [PATCH 23/23] corrected csr_mean_variance_axis0 and csc_mean_variance_axis0 --- sklearn/utils/sparsefuncs_fast.pyx | 28 ++++++++++++------------- sklearn/utils/tests/test_sparsefuncs.py | 18 +++++++++++++++- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx index a3fd379432d14..0bd2f5150da9e 100644 --- a/sklearn/utils/sparsefuncs_fast.pyx +++ b/sklearn/utils/sparsefuncs_fast.pyx @@ -121,12 +121,10 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, continue means[col_ind] += x_i - with warnings.catch_warnings(): + with np.errstate(divide="ignore"): # as division by 0 might happen - warnings.simplefilter('ignore') means /= n_samples_feat - means[np.isnan(means)] = 0 - means[np.isinf(means)] = 0 + means[np.logical_not(n_samples_feat)] = 0 for i in xrange(non_zero): col_ind = X_indices[i] @@ -138,12 +136,10 @@ def _csr_mean_variance_axis0(np.ndarray[floating, ndim=1, mode="c"] X_data, counts[col_ind] += 1 variances += (n_samples_feat - counts) * means ** 2 - with warnings.catch_warnings(): + with np.errstate(divide="ignore"): # as division by 0 might happen - warnings.simplefilter('ignore') variances /= n_samples_feat - variances[np.isnan(variances)] = 0 - variances[np.isinf(variances)] = 0 + variances[np.logical_not(n_samples_feat)] = 0 return means, variances @@ -209,23 +205,27 @@ def _csc_mean_variance_axis0(np.ndarray[floating, ndim=1] X_data, n_samples_feat = n_samples for j in xrange(startptr, endptr): - x_i = X_data[j] - if ignore_nan and isnan(x_i): + x_j = X_data[j] + if ignore_nan and isnan(x_j): n_samples_feat -= 1 continue - means[i] += x_i + means[i] += x_j + if n_samples_feat != 0: - means[i] /= n_samples + means[i] /= n_samples_feat else: means[i] = 0 for j in xrange(startptr, endptr): - diff = X_data[j] - means[i] + x_j = X_data[j] + if ignore_nan and isnan(x_j): + continue; + diff = x_j - means[i] variances[i] += diff * diff variances[i] += (n_samples - counts) * means[i] * means[i] if n_samples_feat != 0: - variances[i] /= n_samples + variances[i] /= n_samples_feat else: variances[i] = 0 diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py index f2b35e7459833..54516e797d19a 100644 --- a/sklearn/utils/tests/test_sparsefuncs.py +++ b/sklearn/utils/tests/test_sparsefuncs.py @@ -17,10 +17,26 @@ count_nonzero, csc_median_axis_0) from sklearn.utils.sparsefuncs_fast import (assign_rows_csr, inplace_csr_row_normalize_l1, - inplace_csr_row_normalize_l2) + inplace_csr_row_normalize_l2, + csr_mean_variance_axis0, + csc_mean_variance_axis0) from sklearn.utils.testing import assert_raises +def test_csr_csc_mean_axis0(): + X = np.array([[600, np.nan, 0, 0, np.nan], + [np.nan, 0, np.nan, np.nan, 0], + [600, np.nan, 0, 0, np.nan]]) + X_csr = sp.csr_matrix(X) + X_means, X_variance = csr_mean_variance_axis0(X_csr) + assert_array_almost_equal(X_means, np.array([600, 0, 0, 0, 0])) + assert_array_almost_equal(X_variance, np.array([0, 0, 0, 0, 0])) + X_csc = sp.csc_matrix(X) + X_means, X_variance = csc_mean_variance_axis0(X_csc) + assert_array_almost_equal(X_means, np.array([600, 0, 0, 0, 0])) + assert_array_almost_equal(X_variance, np.array([0, 0, 0, 0, 0])) + + def test_mean_variance_axis0(): X, _ = make_classification(5, 4, random_state=0) 
# Sparsify the array a little bit
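Taken together, the series converges on a per-feature, NaN-aware incremental update. Below is a self-contained sketch of the same technique in plain NumPy — it uses the textbook pairwise combination of batch statistics (Chan et al.) rather than sklearn's exact expressions, and nan_incremental_mean_var is a hypothetical name, not the library API:

import warnings
import numpy as np

def nan_incremental_mean_var(X, last_mean, last_var, last_count):
    # Per-feature count of observed (non-NaN) values in the new batch.
    new_count = np.sum(~np.isnan(X), axis=0)
    updated_count = last_count + new_count
    new_sum = np.nansum(X, axis=0)  # all-NaN columns contribute 0

    with warnings.catch_warnings():
        # np.nanvar warns on all-NaN columns; those columns are masked
        # to 0 below, so the warning can be silenced here.
        warnings.simplefilter('ignore', RuntimeWarning)
        new_mean = np.where(new_count > 0,
                            new_sum / np.maximum(new_count, 1), 0)
        new_m2 = np.where(new_count > 0,
                          np.nanvar(X, axis=0) * new_count, 0)

    # Pairwise combination of (mean, unnormalized variance, count);
    # np.maximum(..., 1) guards the divisions for empty features, and
    # np.where zeroes the statistics of features with no samples at all.
    delta = new_mean - last_mean
    m2 = (last_var * last_count + new_m2 +
          delta ** 2 * last_count * new_count /
          np.maximum(updated_count, 1))
    updated_mean = np.where(updated_count > 0,
                            (last_mean * last_count + new_sum) /
                            np.maximum(updated_count, 1), 0)
    updated_var = np.where(updated_count > 0,
                           m2 / np.maximum(updated_count, 1), 0)
    return updated_mean, updated_var, updated_count

# Mirrors the final test: two NaN-heavy batches whose combined statistics
# must match np.nanmean/np.nanvar computed over the full array at once.
A = np.array([[600., 470., 170., 430., np.nan],
              [600., np.nan, 170., 430., 300.],
              [np.nan] * 5,
              [np.nan] * 5])
m, v, n = nan_incremental_mean_var(A[:2], np.zeros(5), np.zeros(5),
                                   np.zeros(5))
m, v, n = nan_incremental_mean_var(A[2:], m, v, n)
assert np.allclose(m, np.nanmean(A, axis=0))
assert np.allclose(v, np.nanvar(A, axis=0))
assert np.array_equal(n, np.sum(~np.isnan(A), axis=0))

The masking-by-count approach (updated_count == 0 features forced to 0) is the same design choice the later patches settle on via np.logical_not(updated_sample_count), avoiding the earlier blanket warnings.filterwarnings('ignore') in favour of scoped suppression.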