increment_mean_and_var can now handle NaN values #10618
Changes from all commits (26 commits).
File: sklearn/preprocessing/data.py
```diff
@@ -619,13 +619,19 @@ def partial_fit(self, X, y=None):
             Ignored
         """
         X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
-                        warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES)
+                        warn_on_dtype=True, estimator=self,
+                        force_all_finite='allow-nan', dtype=FLOAT_DTYPES)

         # Even in the case of `with_mean=False`, we update the mean anyway
         # This is needed for the incremental computation of the var
         # See incr_mean_variance_axis and _incremental_mean_variance_axis

         if sparse.issparse(X):
+            # FIXME: remove this check statement
+            X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
+                            warn_on_dtype=True, estimator=self,
+                            dtype=FLOAT_DTYPES)
+
             if self.with_mean:
                 raise ValueError(
                     "Cannot center sparse matrices: pass `with_mean=False` "
```
```diff
@@ -646,6 +652,9 @@ def partial_fit(self, X, y=None):
                 self.mean_ = None
                 self.var_ = None
         else:
+            X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
+                            warn_on_dtype=True, estimator=self,
+                            dtype=FLOAT_DTYPES)
             # First pass
             if not hasattr(self, 'n_samples_seen_'):
                 self.mean_ = .0
```
```diff
@@ -656,8 +665,9 @@ def partial_fit(self, X, y=None):
                     self.var_ = None

-            self.mean_, self.var_, self.n_samples_seen_ = \
-                _incremental_mean_and_var(X, self.mean_, self.var_,
-                                          self.n_samples_seen_)
+            self.mean_, self.var_, self.n_samples_seen_ = \
+                _incremental_mean_and_var(
+                    X, self.mean_, self.var_,
+                    self.n_samples_seen_ * np.ones(X.shape[1]))

         if self.with_std:
             self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
```
Review discussion on this change:

- Reviewer: You need to keep `n_samples_seen_` backward compatible. For example, you might compress the updated count to a scalar if all of its entries are equal.
- Author: I don't get it. From the changes, I thought that it would be easier to deal only with an array from now on. Only …
- Author: @jnothman @glemaitre Sorry for being inactive for the past week. Shall I keep …?
- Reviewer: IMO it would be best for backwards compatibility to keep a scalar in the case when there are no NaNs or, for simplicity, in the case when all n_samples_seen are equal.
- Reviewer: Fair enough.
- Author: @jnothman @glemaitre I'll make them generalised, i.e. if all n_samples_seen are equal, it will return a scalar instead of a vector.
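A minimal sketch of the compromise the author agrees to above (the helper name `_maybe_compress_count` is mine, not from the PR): collapse the per-feature count back to a scalar whenever every feature saw the same number of samples, so `n_samples_seen_` stays an int in the NaN-free case.

```python
import numpy as np

def _maybe_compress_count(updated_sample_count):
    # Hypothetical helper: return the old, backward-compatible int form
    # when all per-feature counts are equal, else keep the array.
    updated_sample_count = np.asarray(updated_sample_count)
    if np.ptp(updated_sample_count) == 0:  # all entries equal
        return int(updated_sample_count[0])
    return updated_sample_count

assert _maybe_compress_count(np.array([5, 5, 5])) == 5           # scalar again
assert _maybe_compress_count(np.array([5, 4, 5])).shape == (3,)  # stays a vector
```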
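For context, a usage sketch of the behaviour this diff works toward. It assumes the NaN-tolerant `partial_fit` from this branch; released scikit-learn versions at the time would raise on the NaN entries.

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Two mini-batches with missing entries.
X1 = np.array([[1., np.nan],
               [2., 3.]])
X2 = np.array([[np.nan, 5.],
               [4., 7.]])

scaler = StandardScaler()
scaler.partial_fit(X1)
scaler.partial_fit(X2)

# The running statistics should match one-shot NaN-ignoring
# reductions over the concatenation of both batches.
X = np.vstack([X1, X2])
assert np.allclose(scaler.mean_, np.nanmean(X, axis=0))  # [2.33, 5.]
assert np.allclose(scaler.var_, np.nanvar(X, axis=0))    # [1.56, 2.67]
```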
File: sklearn/utils/estimator_checks.py
```diff
@@ -70,6 +70,7 @@
            'OrthogonalMatchingPursuit', 'PLSCanonical', 'PLSRegression',
            'RANSACRegressor', 'RadiusNeighborsRegressor',
            'RandomForestRegressor', 'Ridge', 'RidgeCV']
+ALLOW_NAN = ['StandardScaler']


 def _yield_non_meta_checks(name, estimator):
```
```diff
@@ -1024,6 +1025,8 @@ def check_estimators_nan_inf(name, estimator_orig):
     error_string_transform = ("Estimator doesn't check for NaN and inf in"
                               " transform.")
     for X_train in [X_train_nan, X_train_inf]:
+        if np.any(np.isnan(X_train)) and name in ALLOW_NAN:
+            continue
         # catch deprecation warnings
         with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
             estimator = clone(estimator_orig)
```

Review comment on the added guard:

- Reviewer: Same comment as: #10437 (comment)
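A self-contained sketch of what the guard above does (the predicate name `_nan_check_is_skipped` is mine, for illustration): only the NaN case is exempted, and only for whitelisted estimator names; inf inputs must still raise for every estimator.

```python
import numpy as np

ALLOW_NAN = ['StandardScaler']

def _nan_check_is_skipped(name, X_train):
    # Mirrors the `continue` in check_estimators_nan_inf.
    return bool(np.any(np.isnan(X_train))) and name in ALLOW_NAN

X_nan = np.array([[0., np.nan], [1., 2.]])
X_inf = np.array([[0., np.inf], [1., 2.]])
assert _nan_check_is_skipped('StandardScaler', X_nan)
assert not _nan_check_is_skipped('StandardScaler', X_inf)  # inf still tested
assert not _nan_check_is_skipped('Ridge', X_nan)           # not whitelisted
```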
File: sklearn/utils/extmath.py
```diff
@@ -643,7 +643,7 @@ def make_nonnegative(X, min_value=0):


 def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
-                              last_sample_count=0):
+                              last_sample_count=0, ignore_nan=False):
     """Calculate mean update and a Youngs and Cramer variance update.

     last_mean and last_variance are statistics computed at the last step by the
```
```diff
@@ -664,7 +664,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,

     last_variance : array-like, shape: (n_features,)

-    last_sample_count : int
+    last_sample_count : array-like, shape: (n_features,)

     Returns
     -------
```
```diff
@@ -673,7 +673,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
     updated_variance : array, shape (n_features,)
         If None, only mean is computed

-    updated_sample_count : int
+    updated_sample_count : array, shape (n_features,)

     References
     ----------
```
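For reference, the Youngs–Cramer-style combine step this function implements can be written as follows (notation mine, not from the docstring: the accumulated chunk A has m samples with per-feature sum S_A and unnormalized variance V_A; the new increment B likewise has n, S_B, V_B):

```latex
% Merging per-feature statistics of an accumulated chunk A and a new
% increment B; all operations are elementwise over features.
\begin{align*}
  \mu_{AB}      &= \frac{S_A + S_B}{m + n} \\
  V_{AB}        &= V_A + V_B + \frac{(n\,S_A - m\,S_B)^2}{m\,n\,(m + n)} \\
  \sigma^2_{AB} &= \frac{V_{AB}}{m + n}
\end{align*}
```

With NaN support, m and n become per-feature vectors, so the factors m/n and m+n in the code below can legitimately contain zeros; that is why the diff wraps the divisions in `np.errstate` and masks non-finite entries afterwards.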
```diff
@@ -689,28 +689,41 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
     # new = the current increment
     # updated = the aggregated stats
     last_sum = last_mean * last_sample_count
-    new_sum = X.sum(axis=0)
+    sum_func = np.nansum if ignore_nan else np.sum
```
Inline discussion on this line:

- Reviewer: I wonder if we should start with `if not isnan(X.sum()): ignore_nan = False`, and then use fast paths that don't involve triplicating the memory like …
- Author: Most of the lines in this function need to be modified to be able to ignore NaN values. Now, if we do not want to encounter …
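A sketch of the reviewer's proposed short-circuit (the function name is mine; the PR does not contain it): one cheap full reduction decides whether the slower NaN-aware path, with its extra temporaries, is needed at all.

```python
import numpy as np

def _resolve_ignore_nan(X, ignore_nan):
    # X.sum() is NaN iff X contains at least one NaN, so a single
    # reduction is enough to detect whether NaN handling is required.
    if ignore_nan and not np.isnan(X.sum()):
        return False  # fall back to the fast NaN-free path
    return ignore_nan

X = np.array([[1., 2.], [3., 4.]])
assert _resolve_ignore_nan(X, True) is False  # no NaN: fast path
X[0, 0] = np.nan
assert _resolve_ignore_nan(X, True) is True   # NaN present: slow path
```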
```diff
+    new_sum = sum_func(X, axis=0)
+    new_sum[np.isnan(new_sum)] = 0

-    new_sample_count = X.shape[0]
+    new_sample_count = np.sum(~np.isnan(X), axis=0)
     updated_sample_count = last_sample_count + new_sample_count

-    updated_mean = (last_sum + new_sum) / updated_sample_count
+    with np.errstate(divide="ignore"):  # as division by 0 might happen
+        updated_mean = (last_sum + new_sum) / updated_sample_count
+    updated_mean[np.logical_not(updated_sample_count)] = 0

     if last_variance is None:
         updated_variance = None
     else:
-        new_unnormalized_variance = X.var(axis=0) * new_sample_count
-        if last_sample_count == 0:  # Avoid division by 0
-            updated_unnormalized_variance = new_unnormalized_variance
-        else:
-            last_over_new_count = last_sample_count / new_sample_count
-            last_unnormalized_variance = last_variance * last_sample_count
-            updated_unnormalized_variance = (
-                last_unnormalized_variance +
-                new_unnormalized_variance +
-                last_over_new_count / updated_sample_count *
-                (last_sum / last_over_new_count - new_sum) ** 2)
-        updated_variance = updated_unnormalized_variance / updated_sample_count
+        var_func = np.nanvar if ignore_nan else np.var
+        new_unnormalized_variance = var_func(X, axis=0)
+        new_unnormalized_variance[~np.isfinite(new_unnormalized_variance)] = 0
+        new_unnormalized_variance *= new_sample_count
+        last_unnormalized_variance = last_variance * last_sample_count
+        with np.errstate(divide="ignore"):
+            last_over_new_count = last_sample_count / new_sample_count
+            updated_unnormalized_variance = (
+                last_over_new_count / updated_sample_count *
+                (last_sum / last_over_new_count - new_sum) ** 2)
+        # updated_unnormalized_variance can be both NaN or Inf
+        updated_unnormalized_variance[~np.isfinite(
+            updated_unnormalized_variance)] = 0
+        updated_unnormalized_variance += (last_unnormalized_variance +
+                                          new_unnormalized_variance)

+        with np.errstate(divide="ignore"):
+            updated_variance = (updated_unnormalized_variance /
+                                updated_sample_count)
+        # As division by zero might happen
+        updated_variance[np.logical_not(updated_sample_count)] = 0

     return updated_mean, updated_variance, updated_sample_count
```
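A quick sanity check of the updated helper (this assumes the branch in this PR; released versions have neither the `ignore_nan` flag nor array-valued sample counts):

```python
import numpy as np
from sklearn.utils.extmath import _incremental_mean_and_var

X1 = np.array([[1., np.nan],
               [2., 3.]])
X2 = np.array([[np.nan, 5.],
               [4., 7.]])

# Feed two batches, carrying per-feature counts between calls.
mean, var, count = _incremental_mean_and_var(
    X1, last_mean=np.zeros(2), last_variance=np.zeros(2),
    last_sample_count=np.zeros(2), ignore_nan=True)
mean, var, count = _incremental_mean_and_var(
    X2, mean, var, count, ignore_nan=True)

# The running statistics match the one-shot NaN-ignoring reductions.
X = np.vstack([X1, X2])
assert np.allclose(mean, np.nanmean(X, axis=0))
assert np.allclose(var, np.nanvar(X, axis=0))
assert np.array_equal(count, np.sum(~np.isnan(X), axis=0))  # [3, 3]
```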
Review comment on the PR:

- Reviewer: We should not be supporting NaNs here. I think maybe we should allow it still to pass in a scalar `n_samples_seen_`, and `_incremental_mean_and_var` can broadcast it to n_features wide if appropriate.
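A minimal sketch of the broadcast the reviewer suggests (helper name mine, for illustration): keep accepting the historical scalar count and widen it internally.

```python
import numpy as np

def _broadcast_sample_count(last_sample_count, n_features):
    # Hypothetical helper: accept the old scalar form and broadcast it
    # to one count per feature for the internal, per-feature math.
    if np.ndim(last_sample_count) == 0:
        return np.full(n_features, last_sample_count, dtype=np.float64)
    return np.asarray(last_sample_count, dtype=np.float64)

assert np.array_equal(_broadcast_sample_count(3, 4), [3., 3., 3., 3.])
assert np.array_equal(_broadcast_sample_count([1, 2], 2), [1., 2.])
```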