Skip to content

Commit cbb0b19

Browse files
committed
add class SamplingImputer
1 parent 3b7305e commit cbb0b19

File tree

1 file changed

+162
-0
lines changed

1 file changed

+162
-0
lines changed

sklearn/impute.py

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,3 +1002,165 @@ def fit(self, X, y=None):
10021002
"""
10031003
self.fit_transform(X)
10041004
return self
1005+
1006+
1007+
class SamplingImputer(BaseEstimator, TransformerMixin):
    """Imputation transformer that fills missing values by random sampling.

    Each missing entry of a feature is replaced by a value drawn at random
    from the values observed for that feature during `fit`. Every distinct
    observed value is drawn with a probability equal to its empirical
    frequency, which is equivalent to drawing uniformly from the observed
    entries.

    Read more in the :ref:`User Guide <impute>`.

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed.

    verbose : integer, optional (default=0)
        Controls the verbosity of the imputer. When non-zero, a warning is
        emitted at `transform` if features without any observed value are
        removed.

    copy : boolean, optional (default=True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, in the following cases,
        a new copy will always be made, even if `copy=False`:

        - If X is not an array of floating values;
        - If X is sparse and `missing_values=0`;
        - If X is encoded as a CSR matrix.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when drawing
        the imputed values. If int, random_state is the seed used by the
        random number generator; If RandomState instance, random_state is
        the random number generator; If None, the random number generator is
        the RandomState instance used by ``np.random``.

    Attributes
    ----------
    generators_ : array of shape (n_features,)
        One entry per feature seen at `fit`. Each entry is either a callable
        ``g(k)`` returning ``k`` values sampled from the non-missing values
        of that feature, or None when the feature had no observed value.

    random_state_ : RandomState instance
        The random number generator used for sampling. It is created on the
        first call to `fit` and reused by later calls.

    Notes
    -----
    Columns which only contained missing values at `fit` are discarded upon
    `transform`.

    Sparse input is only supported when `missing_values=0`, in which case
    the data is converted to a dense array. Any other sparse input raises
    ``NotImplementedError``.

    """

    def __init__(self, missing_values=np.nan,
                 verbose=0, copy=True, random_state=None):
        self.missing_values = missing_values
        self.verbose = verbose
        self.copy = copy
        self.random_state = random_state

    def _validate_input(self, X):
        """Validate X, tolerating NaN only when NaN is the missing marker."""
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"

        return check_array(X, accept_sparse='csc', dtype=None,
                           force_all_finite=force_all_finite, copy=self.copy)

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : ignored

        Returns
        -------
        self : SamplingImputer

        Raises
        ------
        NotImplementedError
            If X is sparse and `missing_values` is not 0.
        """
        # Reuse the generator created by a previous call to fit, if any.
        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))

        X = self._validate_input(X)

        if sparse.issparse(X) and self.missing_values != 0:
            self.generators_ = self._sparse_fit(X,
                                                self.missing_values,
                                                self.random_state_)
        else:
            # With missing_values == 0 the implicit zeros of a sparse matrix
            # are the missing entries, so the data is densified exactly as
            # `transform` does.
            if sparse.issparse(X):
                X = X.toarray()
            self.generators_ = self._dense_fit(X,
                                               self.missing_values,
                                               self.random_state_)

        return self

    def _sparse_fit(self, X, missing_values, random_state):
        """Fit the transformer on sparse data (not implemented yet)."""
        # Fail loudly instead of silently storing ``None`` in `generators_`,
        # which would only surface later as an obscure error in `transform`.
        raise NotImplementedError(
            "SamplingImputer does not support sparse input when "
            "missing_values != 0. Provide a dense array instead.")

    @staticmethod
    def _make_generator(uniques, probas, random_state):
        """Return a sampler ``g(k)`` bound to one feature's distribution."""
        # A dedicated factory gives every sampler its own `uniques` and
        # `probas`. A lambda created directly in the fitting loop would
        # look these names up at call time and therefore always use the
        # values of the last processed feature.
        def generator(k):
            return random_state.choice(uniques, k, p=probas)
        return generator

    def _dense_fit(self, X, missing_values, random_state):
        """Fit the transformer on dense data.

        Returns an object array of shape (n_features,) holding one sampler
        per feature, or None for features without any observed value.
        """
        observed = np.logical_not(_get_mask(X, missing_values))

        n_features = X.shape[1]
        generators = np.empty(n_features, dtype=object)

        for i in range(n_features):
            values = X[:, i][observed[:, i]]
            if values.size > 0:
                uniques, counts = np.unique(values, return_counts=True)
                # Divide by a float so the frequencies are not truncated to
                # zero where integer division applies.
                probas = counts / float(counts.sum())
                generators[i] = self._make_generator(uniques, probas,
                                                     random_state)
            else:
                generators[i] = None

        return generators

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The input data to complete.

        Returns
        -------
        X : array, shape (n_samples, n_valid_features)
            The imputed data, where ``n_valid_features`` is the number of
            features that had at least one observed value at `fit`.

        Raises
        ------
        ValueError
            If X does not have the number of features seen at `fit`.
        NotImplementedError
            If X is sparse and `missing_values` is not 0.
        """
        check_is_fitted(self, 'generators_')

        X = self._validate_input(X)

        generators = self.generators_

        if X.shape[1] != generators.shape[0]:
            raise ValueError("X has %d features per sample, expected %d"
                             % (X.shape[1], generators.shape[0]))

        # Delete the invalid columns, i.e. the features for which no value
        # was observed at fit time and hence no sampler exists.
        invalid_mask = np.array([g is None for g in generators], dtype=bool)
        valid_mask = np.logical_not(invalid_mask)
        valid_generators = generators[valid_mask]
        valid_generators_indexes = np.flatnonzero(valid_mask)

        if invalid_mask.any():
            missing = np.arange(X.shape[1])[invalid_mask]
            if self.verbose:
                warnings.warn("Deleting features without "
                              "observed values: %s" % missing)
            X = X[:, valid_generators_indexes]

        # Do actual imputation
        if sparse.issparse(X):
            if self.missing_values != 0:
                # Fail loudly rather than return the data unimputed.
                raise NotImplementedError(
                    "SamplingImputer does not support sparse input when "
                    "missing_values != 0. Provide a dense array instead.")
            X = X.toarray()

        mask = _get_mask(X, self.missing_values)
        # `valid_generators` is aligned with the remaining columns of X,
        # whereas `generators_` still contains the dropped features.
        for i, generator in enumerate(valid_generators):
            column_mask = mask[:, i]
            n_missing = column_mask.sum()
            if n_missing:
                X[column_mask, i] = generator(n_missing)

        return X

0 commit comments

Comments
 (0)