|
35 | 35 | 'predictor'])
|
36 | 36 |
|
37 | 37 | __all__ = [
|
| 38 | + 'MissingIndicator', |
38 | 39 | 'SimpleImputer',
|
39 | 40 | 'ChainedImputer',
|
40 | 41 | ]
|
@@ -975,3 +976,225 @@ def fit(self, X, y=None):
|
975 | 976 | """
|
976 | 977 | self.fit_transform(X)
|
977 | 978 | return self
|
| 979 | + |
| 980 | + |
class MissingIndicator(BaseEstimator, TransformerMixin):
    """Binary indicators for missing values.

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be flagged in the indicator mask.

    features : str, optional
        Whether the imputer mask should represent all or a subset of
        features.

        - If "missing-only" (default), the imputer mask will only represent
          features containing missing values during fit time.
        - If "all", the imputer mask will represent all features.

    sparse : boolean or "auto", optional
        Whether the imputer mask format should be sparse or dense.

        - If "auto" (default), the imputer mask will be of same type as
          input, except that a sparse input combined with
          ``missing_values=0`` is densified and yields a numpy array.
        - If True, the imputer mask will be a sparse matrix.
        - If False, the imputer mask will be a numpy array.

    error_on_new : boolean, optional
        If True (default), transform will raise an error when there are
        features with missing values in transform that have no missing values
        in fit. This is applicable only when ``features="missing-only"``.

    Attributes
    ----------
    features_ : ndarray, shape (n_missing_features,) or (n_features,)
        The features indices which will be returned when calling ``transform``.
        They are computed during ``fit``. For ``features='all'``, it is
        equal to ``range(n_features)``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.impute import MissingIndicator
    >>> X1 = np.array([[np.nan, 1, 3],
    ...                [4, 0, np.nan],
    ...                [8, 1, 0]])
    >>> X2 = np.array([[5, 1, np.nan],
    ...                [np.nan, 2, 3],
    ...                [2, 4, 0]])
    >>> indicator = MissingIndicator()
    >>> indicator.fit(X1)
    MissingIndicator(error_on_new=True, features='missing-only',
             missing_values=nan, sparse='auto')
    >>> X2_tr = indicator.transform(X2)
    >>> X2_tr
    array([[False,  True],
           [ True, False],
           [False, False]])

    """

    def __init__(self, missing_values=np.nan, features="missing-only",
                 sparse="auto", error_on_new=True):
        self.missing_values = missing_values
        self.features = features
        self.sparse = sparse
        self.error_on_new = error_on_new

    def _validate_input(self, X):
        """Validate ``X`` the same way for ``fit`` and ``transform``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The raw input data.

        Returns
        -------
        X : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The output of ``check_array``; sparse input is accepted in
            CSC or CSR format.
        """
        # NaN entries are only tolerated in X when NaN itself is the
        # placeholder for missing values.
        if is_scalar_nan(self.missing_values):
            force_all_finite = "allow-nan"
        else:
            force_all_finite = True
        X = check_array(X, accept_sparse=('csc', 'csr'),
                        force_all_finite=force_all_finite)
        _check_inputs_dtype(X, self.missing_values)
        return X

    def _get_missing_features_info(self, X):
        """Compute the imputer mask and the indices of the features
        containing missing values.

        Parameters
        ----------
        X : {ndarray or sparse matrix}, shape (n_samples, n_features)
            The input data with missing values. Note that ``X`` has been
            checked in ``fit`` and ``transform`` before calling this
            function.

        Returns
        -------
        imputer_mask : {ndarray or sparse matrix}, shape \
(n_samples, n_features)
            The imputer mask of the original data. It always covers every
            feature; selecting a subset of columns is left to the caller.

        features_with_missing : ndarray, shape (n_features_with_missing)
            The features containing missing values.

        """
        if sparse.issparse(X) and self.missing_values != 0:
            # Only the explicitly stored entries can be missing here, so
            # the mask is computed on X.data alone.
            mask = _get_mask(X.data, self.missing_values)

            # The imputer mask will be constructed with the same sparse format
            # as X.
            sparse_constructor = (sparse.csr_matrix if X.format == 'csr'
                                  else sparse.csc_matrix)
            imputer_mask = sparse_constructor(
                (mask, X.indices.copy(), X.indptr.copy()),
                shape=X.shape, dtype=bool)

            # Work on a copy with the stored False entries removed, so that
            # the remaining structure describes the missing values only.
            missing_values_mask = imputer_mask.copy()
            missing_values_mask.eliminate_zeros()
            if missing_values_mask.format == 'csc':
                # In CSC, a column holds entries iff its indptr span is
                # non-empty.
                features_with_missing = np.flatnonzero(
                    np.diff(missing_values_mask.indptr))
            else:
                # In CSR, ``indices`` stores the column of each entry.
                features_with_missing = np.unique(
                    missing_values_mask.indices)

            if self.sparse is False:
                imputer_mask = imputer_mask.toarray()
            elif imputer_mask.format == 'csr':
                imputer_mask = imputer_mask.tocsc()
        else:
            if sparse.issparse(X):
                # case of sparse matrix with 0 as missing values. Implicit and
                # explicit zeros are considered as missing values.
                X = X.toarray()
            imputer_mask = _get_mask(X, self.missing_values)
            features_with_missing = np.flatnonzero(imputer_mask.sum(axis=0))

            if self.sparse is True:
                imputer_mask = sparse.csc_matrix(imputer_mask)

        return imputer_mask, features_with_missing

    def fit(self, X, y=None):
        """Fit the transformer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        y : ignored
            Not used by this method.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``features`` is neither 'missing-only' nor 'all', or if
            ``sparse`` is neither a boolean nor 'auto'.
        """
        X = self._validate_input(X)

        self._n_features = X.shape[1]

        if self.features not in ('missing-only', 'all'):
            raise ValueError("'features' has to be either 'missing-only' or "
                             "'all'. Got {} instead.".format(self.features))

        if not ((isinstance(self.sparse, six.string_types) and
                 self.sparse == "auto") or isinstance(self.sparse, bool)):
            raise ValueError("'sparse' has to be a boolean or 'auto'. "
                             "Got {!r} instead.".format(self.sparse))

        if self.features == 'missing-only':
            self.features_ = self._get_missing_features_info(X)[1]
        else:
            self.features_ = np.arange(self._n_features)

        return self

    def transform(self, X):
        """Generate missing values indicator for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data for which the indicator is generated.

        Returns
        -------
        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
or (n_samples, n_features_with_missing)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean. With ``features="missing-only"`` only the
            columns listed in ``features_`` are kept.

        Raises
        ------
        ValueError
            If ``X`` does not have the number of features seen in ``fit``,
            or if ``error_on_new`` is True and, with
            ``features="missing-only"``, a feature has missing values here
            but had none in ``fit``.

        """
        check_is_fitted(self, "features_")

        X = self._validate_input(X)

        if X.shape[1] != self._n_features:
            raise ValueError("X has a different number of features "
                             "than during fitting.")

        imputer_mask, features = self._get_missing_features_info(X)

        if self.features == "missing-only":
            features_diff_fit_trans = np.setdiff1d(features, self.features_)
            if (self.error_on_new and features_diff_fit_trans.size > 0):
                raise ValueError("The features {} have missing values "
                                 "in transform but have no missing values "
                                 "in fit.".format(features_diff_fit_trans))

            # Select the fitted columns whenever they are a strict subset of
            # the features. This includes the case where no feature had
            # missing values during fit: the result must then have zero
            # columns rather than silently falling back to the full mask.
            if self.features_.size < self._n_features:
                imputer_mask = imputer_mask[:, self.features_]

        return imputer_mask

    def fit_transform(self, X, y=None):
        """Generate missing values indicator for X.

        Equivalent to calling ``fit(X, y)`` followed by ``transform(X)``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data for which the indicator is generated.

        y : ignored
            Passed through to ``fit``, which does not use it.

        Returns
        -------
        Xt : {ndarray or sparse matrix}, shape (n_samples, n_features) \
or (n_samples, n_features_with_missing)
            The missing indicator for input data. The data type of ``Xt``
            will be boolean.

        """
        return self.fit(X, y).transform(X)
0 commit comments