@@ -73,6 +73,7 @@ def __init__(self,
73
73
min_weight_fraction_leaf ,
74
74
max_features ,
75
75
max_leaf_nodes ,
76
+ categorical_features ,
76
77
random_state ):
77
78
self .criterion = criterion
78
79
self .splitter = splitter
@@ -83,11 +84,13 @@ def __init__(self,
83
84
self .max_features = max_features
84
85
self .random_state = random_state
85
86
self .max_leaf_nodes = max_leaf_nodes
87
+ self .categorical_features = categorical_features
86
88
87
89
self .n_features_ = None
88
90
self .n_outputs_ = None
89
91
self .classes_ = None
90
92
self .n_classes_ = None
93
+ self .categorical_dicts = None
91
94
92
95
self .tree_ = None
93
96
self .max_features_ = None
@@ -237,6 +240,43 @@ def fit(self, X, y, sample_mask=None, X_argsorted=None, check_input=True,
237
240
"number of samples=%d" %
238
241
(len (sample_weight ), n_samples ))
239
242
243
+ n_features = X .shape [1 ]
244
+ # We parse the argument categorical_features to a mask
245
+ if not self .categorical_features or self .categorical_features == "None" :
246
+ categorical_mask = np .zeros (n_features , dtype = bool )
247
+ has_categorical = True
248
+ elif self .categorical_features == "all" :
249
+ categorical_mask = np .ones (n_features , dtype = bool )
250
+ has_categorical = True
251
+ else :
252
+ try :
253
+ self .categorical_features = list (self .categorical_features )
254
+ except TypeError :
255
+ raise ValueError ("categorical_features not recognized. Must "
256
+ "be 'None', 'all', a mask or a list" )
257
+ if len (self .categorical_features ) == n_features :
258
+ categorical_mask = self .categorical_features
259
+ has_categorical = sum (self .categorical_features ) > 0
260
+ else :
261
+ categorical_mask = np .zeros (n_features , dtype = bool )
262
+ categorical_mask [np .asarray (self .categorical_features )] = True
263
+ has_categorical = len (categorical_mask ) > 0
264
+ # We transform the categorical features to 0...n
265
+ self .categorical_dicts = [
266
+ dict ((e , i ) for (i , e ) in enumerate (set (X [:, feature ])))
267
+ if categorical_mask [feature ] else None
268
+ for feature in xrange (n_features ) ]
269
+ if has_categorical :
270
+ X = np .copy (X )
271
+ for feature in xrange (n_features ):
272
+ if categorical_mask [feature ]:
273
+ hashing = self .categorical_dicts [feature ]
274
+ if len (hashing ) > 32 :
275
+ raise ValueError (
276
+ "Too many factors for feature {}. 32 maximum, "
277
+ "found {}" .format (feature , len (hashing )))
278
+ X [:, feature ] = [hashing [e ] for e in X [:, feature ]]
279
+
240
280
# Set min_weight_leaf from min_weight_fraction_leaf
241
281
if self .min_weight_fraction_leaf != 0. and sample_weight is not None :
242
282
min_weight_leaf = (self .min_weight_fraction_leaf *
@@ -418,6 +458,13 @@ class DecisionTreeClassifier(BaseDecisionTree, ClassifierMixin):
418
458
If None then unlimited number of leaf nodes.
419
459
If not None then ``max_depth`` will be ignored.
420
460
461
+ categorical_features : array of indices or mask, optional (default=None)
462
+ Specify which features are treated as categorical.
463
+ - None (default): no features are treated as categorical.
464
+ - 'all': all features are treated as categorical.
465
+ - array of indices: array of categorical feature indices.
466
+ - mask: array of length n_features with dtype=bool.
467
+
421
468
random_state : int, RandomState instance or None, optional (default=None)
422
469
If int, random_state is the seed used by the random number generator;
423
470
If RandomState instance, random_state is the random number generator;
@@ -489,7 +536,8 @@ def __init__(self,
489
536
random_state = None ,
490
537
min_density = None ,
491
538
compute_importances = None ,
492
- max_leaf_nodes = None ):
539
+ max_leaf_nodes = None ,
540
+ categorical_features = None ):
493
541
super (DecisionTreeClassifier , self ).__init__ (
494
542
criterion = criterion ,
495
543
splitter = splitter ,
@@ -499,7 +547,8 @@ def __init__(self,
499
547
min_weight_fraction_leaf = min_weight_fraction_leaf ,
500
548
max_features = max_features ,
501
549
max_leaf_nodes = max_leaf_nodes ,
502
- random_state = random_state )
550
+ random_state = random_state ,
551
+ categorical_features = categorical_features )
503
552
504
553
if min_density is not None :
505
554
warn ("The min_density parameter is deprecated as of version 0.14 "
@@ -641,6 +690,13 @@ class DecisionTreeRegressor(BaseDecisionTree, RegressorMixin):
641
690
If None then unlimited number of leaf nodes.
642
691
If not None then ``max_depth`` will be ignored.
643
692
693
+ categorical_features : array of indices or mask, optional (default=None)
694
+ Specify which features are treated as categorical.
695
+ - None (default): no features are treated as categorical.
696
+ - 'all': all features are treated as categorical.
697
+ - array of indices: array of categorical feature indices.
698
+ - mask: array of length n_features with dtype=bool.
699
+
644
700
random_state : int, RandomState instance or None, optional (default=None)
645
701
If int, random_state is the seed used by the random number generator;
646
702
If RandomState instance, random_state is the random number generator;
@@ -704,7 +760,8 @@ def __init__(self,
704
760
random_state = None ,
705
761
min_density = None ,
706
762
compute_importances = None ,
707
- max_leaf_nodes = None ):
763
+ max_leaf_nodes = None ,
764
+ categorical_features = None ):
708
765
super (DecisionTreeRegressor , self ).__init__ (
709
766
criterion = criterion ,
710
767
splitter = splitter ,
@@ -714,7 +771,8 @@ def __init__(self,
714
771
min_weight_fraction_leaf = min_weight_fraction_leaf ,
715
772
max_features = max_features ,
716
773
max_leaf_nodes = max_leaf_nodes ,
717
- random_state = random_state )
774
+ random_state = random_state ,
775
+ categorical_features = categorical_features )
718
776
719
777
if min_density is not None :
720
778
warn ("The min_density parameter is deprecated as of version 0.14 "
0 commit comments