[MRG] Add KNN strategy for imputation #4844
Changes from all commits
@@ -2,7 +2,6 @@
# License: BSD 3 clause

import warnings

import numpy as np
import numpy.ma as ma
from scipy import sparse
@@ -14,7 +13,7 @@
from ..utils.fixes import astype
from ..utils.sparsefuncs import _get_median
from ..utils.validation import check_is_fitted

from ..utils import gen_batches
from ..externals import six

zip = six.moves.zip
@@ -61,6 +60,7 @@ def _most_frequent(array, extra_value, n_repeat):
        return extra_value


class Imputer(BaseEstimator, TransformerMixin):
    """Imputation transformer for completing missing values.

@@ -82,12 +82,15 @@ class Imputer(BaseEstimator, TransformerMixin):
          the axis.
        - If "most_frequent", then replace missing using the most frequent
          value along the axis.
        - If "knn", then replace missing using the mean of the k-nearest
          neighbors along the axis. Only samples with no missing values are
          considered as neighbors.

    axis : integer, optional (default=0)
        The axis along which to impute.

        - If `axis=0`, then impute along columns.
        - If `axis=1`, then impute along rows.
        - If ``axis=0``, then impute along columns.
        - If ``axis=1``, then impute along rows.

    verbose : integer, optional (default=0)
        Controls the verbosity of the imputer.

@@ -99,13 +102,18 @@ class Imputer(BaseEstimator, TransformerMixin):

        - If X is not an array of floating values;
        - If X is sparse and `missing_values=0`;
        - If `axis=0` and X is encoded as a CSR matrix;
        - If `axis=1` and X is encoded as a CSC matrix.
        - If ``axis=0`` and X is encoded as a CSR matrix;
        - If ``axis=1`` and X is encoded as a CSC matrix.

    n_neighbors : int, optional (default=1)
        Controls the number of nearest neighbors used to compute the mean
        along the axis. Only used when ``strategy="knn"``.

    Attributes
    ----------
    statistics_ : array of shape (n_features,)
        The imputation fill value for each feature if axis == 0.
        If ``strategy="knn"``, then it contains the samples that have no
        missing values.

    Notes
    -----
@@ -114,14 +122,16 @@ class Imputer(BaseEstimator, TransformerMixin):
    - When ``axis=1``, an exception is raised if there are rows for which it is
      not possible to fill in the missing values (e.g., because they only
      contain missing values).
    - The "knn" strategy does not currently support sparse matrices.
    """
    def __init__(self, missing_values="NaN", strategy="mean",
                 axis=0, verbose=0, copy=True):
                 axis=0, verbose=0, copy=True, n_neighbors=1):
        self.missing_values = missing_values
        self.strategy = strategy
        self.axis = axis
        self.verbose = verbose
        self.copy = copy
        self.n_neighbors = n_neighbors

    def fit(self, X, y=None):
        """Fit the imputer on X.

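For orientation, here is a minimal usage sketch of the API this PR proposes. The import path and the exact behaviour are assumptions based on the diff above (this branch is not part of a released scikit-learn), and the data is made up:

    import numpy as np
    from sklearn.preprocessing import Imputer  # assumed location of Imputer at the time

    X = np.array([[1.0,    2.0,  3.0],
                  [4.0,    5.0,  6.0],
                  [7.0,    8.0,  9.0],
                  [np.nan, 11.0, 12.0]])

    # Fill each incomplete row from the mean of its 2 nearest complete rows.
    imp = Imputer(missing_values="NaN", strategy="knn", n_neighbors=2, axis=0)
    X_imputed = imp.fit_transform(X)
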
@@ -138,7 +148,7 @@ def fit(self, X, y=None):
            Returns self.
        """
        # Check parameters
        allowed_strategies = ["mean", "median", "most_frequent"]
        allowed_strategies = ["mean", "median", "most_frequent", "knn"]
        if self.strategy not in allowed_strategies:
            raise ValueError("Can only use these strategies: {0} "
                             " got strategy={1}".format(allowed_strategies,
@@ -248,6 +258,12 @@ def _sparse_fit(self, X, strategy, missing_values, axis):

            return most_frequent

        # KNN
        elif strategy == "knn":
            raise ValueError("strategy='knn' does not support sparse "
                             "matrix input")

    def _dense_fit(self, X, strategy, missing_values, axis):
        """Fit the transformer on dense data."""
        X = check_array(X, force_all_finite=False)

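As a small aside, the sparse branch added above is meant to fail fast. A rough sketch of the expected behaviour, assuming this branch is installed and that a sparse matrix with a non-zero missing marker takes the sparse code path (as the class docstring above indicates):

    import numpy as np
    from scipy import sparse
    from sklearn.preprocessing import Imputer  # assumed import path for this branch

    # -1 marks missing entries; with a non-zero marker the input stays sparse.
    X_sparse = sparse.csc_matrix(np.array([[1.0, -1.0],
                                           [3.0,  2.0]]))
    imp = Imputer(missing_values=-1, strategy="knn")
    try:
        imp.fit(X_sparse)
    except ValueError as exc:
        print(exc)  # strategy='knn' does not support sparse matrix input
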
@@ -299,6 +315,27 @@ def _dense_fit(self, X, strategy, missing_values, axis):

            return most_frequent

        # KNN
        elif strategy == "knn":

            if axis == 1:
                X = X.transpose()
                mask = mask.transpose()

            # Get samples with complete features
            full_data = X[np.logical_not(mask.any(axis=1))]
            if full_data.size == 0:
                raise ValueError("There is no sample with complete data.")
            if full_data.shape[0] < self.n_neighbors:
                raise ValueError("There are only %d complete samples, "
                                 "but n_neighbors=%d."
                                 % (full_data.shape[0], self.n_neighbors))
            # Transpose back
            if axis == 1:
                full_data = full_data.transpose()

            return full_data

    def transform(self, X):
        """Impute all missing values in X.

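In plain NumPy, the fit step for the "knn" strategy boils down to keeping only the complete rows. A standalone sketch of that selection (variable names are illustrative, not taken from the diff):

    import numpy as np

    X = np.array([[1.0, 2.0, np.nan],
                  [4.0, 5.0, 6.0],
                  [7.0, np.nan, 9.0],
                  [10.0, 11.0, 12.0]])

    mask = np.isnan(X)                   # True where a value is missing
    full_data = X[~mask.any(axis=1)]     # rows with no missing entries
    # full_data is [[4., 5., 6.], [10., 11., 12.]]; these complete rows are
    # what the diff stores in ``statistics_`` when strategy="knn".
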
@@ -341,7 +378,9 @@ def transform(self, X):
        valid_mask = np.logical_not(invalid_mask)
        valid_statistics = statistics[valid_mask]
        valid_statistics_indexes = np.where(valid_mask)[0]
        missing = np.arange(X.shape[not self.axis])[invalid_mask]

        if self.strategy != "knn":
            missing = np.arange(X.shape[not self.axis])[invalid_mask]

        if self.axis == 0 and invalid_mask.any():
            if self.verbose:

@@ -366,13 +405,53 @@ def transform(self, X):

            mask = _get_mask(X, self.missing_values)
            n_missing = np.sum(mask, axis=self.axis)
            values = np.repeat(valid_statistics, n_missing)

            if self.axis == 0:
                coordinates = np.where(mask.transpose())[::-1]
            if self.strategy == 'knn':
                if self.axis == 1:
                    X = X.transpose()
                    mask = mask.transpose()
                    statistics = statistics.transpose()

                batch_size = 1  # set batch size for block query

[Review comment] You can probably simplify the code if we stay with batch-size=1, right?

                missing_index = np.where(mask.any(axis=1))[0]
                D2 = np.empty_like(np.zeros([batch_size, statistics.shape[0],
                                             statistics.shape[1]]))

                # Preallocate output array for np.multiply(test1, test1, out=D2)
                for sl in gen_batches(len(missing_index), batch_size):
                    X_sl = X[missing_index[sl]]
                    mask_sl = mask[missing_index[sl]]
                    X_sl[mask_sl] = np.nan
                    impute_dist = X_sl[:][:, np.newaxis, :] - statistics

                    # For the last slice, the length may not be the same
                    # as batch_size
                    if impute_dist.shape != D2.shape:

[Review comment] If we have batch-size=1 this doesn't happen, right?
[Reply] Yes.

                        D2 = np.empty_like(impute_dist)

                    np.multiply(impute_dist, impute_dist, out=D2)
                    D2[np.isnan(D2)] = 0
                    missing_row, missing_col = np.where(np.isnan(X_sl))
                    sqdist = D2.sum(axis=2)
                    target_index = np.argsort(sqdist, axis=1)[:, :self.n_neighbors]
                    means = np.mean(statistics[target_index], axis=1)
                    X_sl[missing_row, missing_col] = means[np.where(np.isnan(X_sl))[0],
                                                           missing_col]
                    X[missing_index[sl]] = X_sl

                if self.axis == 1:
                    X = X.transpose()

            else:
                coordinates = mask
                values = np.repeat(valid_statistics, n_missing)

            X[coordinates] = values
                if self.axis == 0:
                    coordinates = np.where(mask.transpose())[::-1]

[Review comment] Why is there a

                else:
                    coordinates = mask

                X[coordinates] = values

        return X
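To see the transform step in isolation: for each incomplete row, squared distances to the complete rows are computed over the observed entries only (missing coordinates contribute zero), the ``n_neighbors`` closest complete rows are selected, and their column-wise means fill the gaps. A standalone per-row sketch, i.e. the batch-size-1 simplification suggested in the review above (names are illustrative, not taken from the diff):

    import numpy as np

    def knn_impute_rows(X, statistics, n_neighbors=1):
        """Fill NaNs in X from the mean of the nearest complete rows."""
        X = X.copy()
        mask = np.isnan(X)
        for i in np.where(mask.any(axis=1))[0]:
            row = X[i]
            diff = row - statistics            # NaN wherever the row is missing
            d2 = diff * diff
            d2[np.isnan(d2)] = 0               # missing coordinates contribute zero
            sqdist = d2.sum(axis=1)            # squared distance to each complete row
            nearest = np.argsort(sqdist)[:n_neighbors]
            means = statistics[nearest].mean(axis=0)
            row[mask[i]] = means[mask[i]]      # overwrite only the missing entries
        return X

    statistics = np.array([[4.0, 5.0, 6.0],    # complete rows kept at fit time
                           [10.0, 11.0, 12.0]])
    X = np.array([[1.0, 2.0, np.nan],
                  [7.0, np.nan, 9.0]])
    print(knn_impute_rows(X, statistics, n_neighbors=2))
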
[Review comment] Any particular reason for this change?

[Reply] Now there is no missing_features, and every entry has a probability to be missing, which can be set at th. mask has the same shape as the dataset, and it has entry 'True' for those places that would be missing. So I think using X_missing[mask] is easier.

[Review comment] Right. Sorry, I meant from zero to np.nan.

[Reply] Oh I see. Because in the original dataset there might be 0 entries, and if I label missing entries as 0, those entries will also be imputed.

[Review comment] Makes sense.
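A tiny illustration of that last point: if missing entries were encoded as 0, genuine zeros in the data would be flagged (and imputed) as well, whereas np.nan is unambiguous. Purely illustrative data:

    import numpy as np

    # Missing encoded as 0: a real zero is indistinguishable from a missing value.
    X_zero_coded = np.array([[0.0, 2.0],   # this 0.0 is a genuine observation
                             [3.0, 0.0]])  # this 0.0 stands for "missing"
    print((X_zero_coded == 0).sum())       # 2 entries flagged, one of them wrongly

    # Missing encoded as np.nan: the mask picks out exactly the missing entry.
    X_nan_coded = np.array([[0.0, 2.0],
                            [3.0, np.nan]])
    print(np.isnan(X_nan_coded).sum())     # 1 entry flagged, as intended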