@@ -1002,3 +1002,165 @@ def fit(self, X, y=None):
1002
1002
"""
1003
1003
self .fit_transform (X )
1004
1004
return self
1005
+
1006
+
1007
class SamplingImputer(BaseEstimator, TransformerMixin):
    """Imputation transformer that fills missing values by sampling.

    Missing entries of each feature are replaced by values drawn at
    random from the empirical distribution of the non-missing values
    observed for that feature during ``fit``.

    Read more in the :ref:`User Guide <impute>`.

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed.

    verbose : integer, optional (default=0)
        Controls the verbosity of the imputer.

    copy : boolean, optional (default=True)
        If True, a copy of X will be created. If False, imputation will
        be done in-place whenever possible. Note that, in the following
        cases, a new copy will always be made, even if `copy=False`:

        - If X is not an array of floating values;
        - If X is sparse and `missing_values=0`;
        - If X is encoded as a CSR matrix.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by ``np.random``.

    Attributes
    ----------
    generators_ : array of shape (n_features,)
        Per-feature sampling callables; ``generators_[i](k)`` draws ``k``
        values from the empirical distribution of the non-missing values
        of feature ``i``. Entries are ``None`` for features that contained
        only missing values at ``fit``.

    Notes
    -----
    Columns which only contained missing values at `fit` are discarded upon
    `transform`.

    """

    def __init__(self, missing_values=np.nan,
                 verbose=0, copy=True, random_state=None):
        self.missing_values = missing_values
        self.verbose = verbose
        self.copy = copy
        self.random_state = random_state

    def _validate_input(self, X):
        """Validate X, allowing NaN only when NaN is the placeholder."""
        # When the placeholder is NaN, NaN entries are expected and must
        # pass validation; otherwise any NaN in X is a genuine error.
        if not is_scalar_nan(self.missing_values):
            force_all_finite = True
        else:
            force_all_finite = "allow-nan"

        return check_array(X, accept_sparse='csc', dtype=None,
                           force_all_finite=force_all_finite, copy=self.copy)

    def fit(self, X, y=None):
        """Fit the imputer on X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Input data, where ``n_samples`` is the number of samples and
            ``n_features`` is the number of features.

        Returns
        -------
        self : SamplingImputer
        """
        # Reuse an existing RandomState across repeated fits so that the
        # random stream is not restarted on every call.
        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))

        X = self._validate_input(X)

        if sparse.issparse(X):
            self.generators_ = self._sparse_fit(X,
                                                self.missing_values,
                                                self.random_state_)
        else:
            self.generators_ = self._dense_fit(X,
                                               self.missing_values,
                                               self.random_state_)

        return self

    def _sparse_fit(self, X, missing_values, random_state):
        """Fit the transformer on sparse data."""
        # BUG FIX: the previous stub (``todo = True``) silently returned
        # None, which made ``transform`` fail later with an unrelated
        # AttributeError. Fail fast until sparse support is implemented.
        raise NotImplementedError(
            "SamplingImputer does not support sparse input yet")

    def _dense_fit(self, X, missing_values, random_state):
        """Fit the transformer on dense data."""
        non_missing = np.logical_not(_get_mask(X, missing_values))

        n_features = X.shape[1]
        generators = np.empty(n_features, dtype=object)

        for i in range(n_features):
            observed = X[:, i][non_missing[:, i]]
            if observed.size > 0:
                uniques, counts = np.unique(observed, return_counts=True)
                probas = counts / counts.sum()
                # BUG FIX: bind this column's data as default arguments.
                # A plain closure captures ``uniques``/``probas`` by
                # reference, so every generator would end up sampling
                # from the LAST column's distribution.
                generators[i] = (
                    lambda k, u=uniques, p=probas:
                        random_state.choice(u, k, p=p))
            else:
                # No observed values: the column is dropped at transform.
                generators[i] = None

        return generators

    def transform(self, X):
        """Impute all missing values in X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The input data to complete.

        Returns
        -------
        X : ndarray, shape (n_samples, n_valid_features)
            The imputed data; columns with no observed values at ``fit``
            are removed.
        """
        check_is_fitted(self, 'generators_')

        X = self._validate_input(X)

        generators = self.generators_

        if X.shape[1] != generators.shape[0]:
            # BUG FIX: the message previously read ``self.statistics_``,
            # which this class never sets, turning the intended
            # ValueError into an AttributeError.
            raise ValueError("X has %d features per sample, expected %d"
                             % (X.shape[1], generators.shape[0]))

        # Delete the columns that had no observed values at fit time.
        invalid_mask = _get_mask(generators, None)
        valid_mask = np.logical_not(invalid_mask)
        valid_generators = generators[valid_mask]
        valid_generators_indexes = np.flatnonzero(valid_mask)

        if invalid_mask.any():
            missing = np.arange(X.shape[1])[invalid_mask]
            if self.verbose:
                warnings.warn("Deleting features without "
                              "observed values: %s" % missing)
            X = X[:, valid_generators_indexes]

        # Do actual imputation
        if sparse.issparse(X) and self.missing_values != 0:
            # BUG FIX: this branch was a silent stub that returned X
            # unimputed; raise explicitly until sparse support lands.
            raise NotImplementedError(
                "SamplingImputer does not support sparse input with "
                "non-zero missing_values yet")
        else:
            if sparse.issparse(X):
                X = X.toarray()

            mask = _get_mask(X, self.missing_values)
            n_missing = np.sum(mask, axis=0)
            for i in range(n_missing.shape[0]):
                if n_missing[i] == 0:
                    continue
                # BUG FIX: after invalid columns were deleted above,
                # column i of X corresponds to ``valid_generators[i]``,
                # not ``generators[i]``.
                values = valid_generators[i](n_missing[i])
                # flatnonzero gives a plain index array; ``np.nonzero``
                # returns a tuple, and ``X[tuple, i]`` relies on
                # deprecated fancy-indexing behaviour.
                coordinates = np.flatnonzero(mask[:, i])
                X[coordinates, i] = values

        return X
0 commit comments