@@ -225,13 +225,13 @@ class OneHotEncoder(_BaseEncoder):
225
225
(if any).
226
226
227
227
drop_idx_ : array of shape (n_features,)
228
- ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category to
229
- be dropped for each feature.
230
- ``drop_idx_[i] = -1 `` if no category is to be dropped from the feature
231
- with index ``i``, e.g. when `drop='if_binary'` and the feature isn't
232
- binary
233
-
234
- ``drop_idx_ = None`` if all the transformed features will be retained.
228
+ - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
229
+ to be dropped for each feature.
230
+ - ``drop_idx_[i] = None`` if no category is to be dropped from the
231
+ feature with index ``i``, e.g. when `drop='if_binary'` and the
232
+ feature isn't binary.
233
+ - ``drop_idx_ = None`` if all the transformed features will be
234
+ retained.
235
235
236
236
See Also
237
237
--------
@@ -316,10 +316,10 @@ def _compute_drop_idx(self):
316
316
return None
317
317
elif isinstance (self .drop , str ):
318
318
if self .drop == 'first' :
319
- return np .zeros (len (self .categories_ ), dtype = np .int_ )
319
+ return np .zeros (len (self .categories_ ), dtype = np .object )
320
320
elif self .drop == 'if_binary' :
321
- return np .array ([0 if len (cats ) == 2 else - 1
322
- for cats in self .categories_ ], dtype = np .int_ )
321
+ return np .array ([0 if len (cats ) == 2 else None
322
+ for cats in self .categories_ ], dtype = np .object )
323
323
else :
324
324
msg = (
325
325
"Wrong input for parameter `drop`. Expected "
@@ -354,7 +354,8 @@ def _compute_drop_idx(self):
354
354
raise ValueError (msg )
355
355
return np .array ([np .where (cat_list == val )[0 ][0 ]
356
356
for (val , cat_list ) in
357
- zip (self .drop , self .categories_ )], dtype = np .int_ )
357
+ zip (self .drop , self .categories_ )],
358
+ dtype = np .object )
358
359
359
360
def fit (self , X , y = None ):
360
361
"""
@@ -421,7 +422,7 @@ def transform(self, X):
421
422
422
423
n_samples , n_features = X_int .shape
423
424
424
- if self .drop is not None :
425
+ if self .drop_idx_ is not None :
425
426
to_drop = self .drop_idx_ .copy ()
426
427
# We remove all the dropped categories from mask, and decrement all
427
428
# categories that occur after them to avoid an empty column.
@@ -431,7 +432,7 @@ def transform(self, X):
431
432
n_cats = len (cats )
432
433
433
434
# drop='if_binary' but feature isn't binary
434
- if to_drop [i ] == - 1 :
435
+ if to_drop [i ] is None :
435
436
# set to cardinality to not drop from X_int
436
437
to_drop [i ] = n_cats
437
438
n_values .append (n_cats )
@@ -484,16 +485,14 @@ def inverse_transform(self, X):
484
485
485
486
n_samples , _ = X .shape
486
487
n_features = len (self .categories_ )
487
- if self .drop is None :
488
+ if self .drop_idx_ is None :
488
489
n_transformed_features = sum (len (cats )
489
490
for cats in self .categories_ )
490
- elif isinstance (self .drop , str ) and self .drop == 'if_binary' :
491
- n_transformed_features = sum (1 if len (cats ) == 2
492
- else len (cats )
493
- for cats in self .categories_ )
494
491
else :
495
- n_transformed_features = sum (len (cats ) - 1
496
- for cats in self .categories_ )
492
+ n_transformed_features = sum (
493
+ len (cats ) - 1 if to_drop is not None else len (cats )
494
+ for cats , to_drop in zip (self .categories_ , self .drop_idx_ )
495
+ )
497
496
498
497
# validate shape of passed X
499
498
msg = ("Shape of the passed X data is not correct. Expected {0} "
@@ -509,7 +508,7 @@ def inverse_transform(self, X):
509
508
found_unknown = {}
510
509
511
510
for i in range (n_features ):
512
- if self .drop is None :
511
+ if self .drop_idx_ is None or self . drop_idx_ [ i ] is None :
513
512
cats = self .categories_ [i ]
514
513
else :
515
514
cats = np .delete (self .categories_ [i ], self .drop_idx_ [i ])
@@ -532,9 +531,9 @@ def inverse_transform(self, X):
532
531
if unknown .any ():
533
532
found_unknown [i ] = unknown
534
533
# drop will either be None or handle_unknown will be error. If
535
- # self.drop is not None, then we can safely assume that all of
534
+ # self.drop_idx_ is not None, then we can safely assume that all of
536
535
# the nulls in each column are the dropped value
537
- elif self .drop is not None :
536
+ elif self .drop_idx_ is not None :
538
537
dropped = np .asarray (sub .sum (axis = 1 ) == 0 ).flatten ()
539
538
if dropped .any ():
540
539
X_tr [dropped , i ] = self .categories_ [i ][self .drop_idx_ [i ]]
@@ -581,7 +580,7 @@ def get_feature_names(self, input_features=None):
581
580
for i in range (len (cats )):
582
581
names = [
583
582
input_features [i ] + '_' + str (t ) for t in cats [i ]]
584
- if self .drop is not None :
583
+ if self .drop_idx_ is not None and self . drop_idx_ [ i ] is not None :
585
584
names .pop (self .drop_idx_ [i ])
586
585
feature_names .extend (names )
587
586
0 commit comments