From 09f02c09b24ca3d06e76f5972268ae9e1ae57618 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 6 Oct 2020 22:54:05 -0400 Subject: [PATCH 01/53] implemented perm imp in oob_score for class and reg --- sklearn/ensemble/_forest.py | 89 +++++++++++++++++++++++++++++++------ 1 file changed, 75 insertions(+), 14 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 81fc319fdfadb..ef1aba40a7c02 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -395,7 +395,7 @@ def fit(self, X, y, sample_weight=None): # Collect newly grown trees self.estimators_.extend(trees) - if self.oob_score: + if self.oob_score or (self.importance_type == 'permutation'): self._set_oob_score(X, y) # Decapsulate classes_ attributes @@ -444,6 +444,9 @@ def feature_importances_(self): """ check_is_fitted(self) + if self.importance_type == 'permutation': + return self._permutation_importance + all_importances = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))( delayed(getattr)(tree, 'feature_importances_') @@ -493,7 +496,8 @@ def __init__(self, verbose=0, warm_start=False, class_weight=None, - max_samples=None): + max_samples=None, + importance_type='impurity'): super().__init__( base_estimator, n_estimators=n_estimators, @@ -524,18 +528,38 @@ def _set_oob_score(self, X, y): n_samples, self.max_samples ) - for estimator in self.estimators_: + all_imp = np.zeros((self.n_estimators, X.shape[1])) + for i, estimator in enumerate(self.estimators_): unsampled_indices = _generate_unsampled_indices( estimator.random_state, n_samples, n_samples_bootstrap) p_estimator = estimator.predict_proba(X[unsampled_indices, :], - check_input=False) + check_input=False) if self.n_outputs_ == 1: p_estimator = [p_estimator] + if self.importance_type == 'permutation': + baseline = 0 + for k in range(self.n_outputs_): + baseline += np.mean(y[unsampled_indices, k] == np.argmax(p_estimator[k], axis=1), axis=0) / self.n_outputs_ + for col 
in range(X.shape[1]): + X_permuted = X.copy() + np.random.shuffle(X_permuted[:, col]) + p_permuted = estimator.predict(X_permuted[unsampled_indices, :], + check_input=False) + if self.n_outputs_ == 1: + p_permuted = p_permuted[:, np.newaxis] + curr_acc = 0 + for k in range(self.n_outputs_): + curr_acc += np.mean(y[unsampled_indices, k] == p_permuted[:, k], axis=0) / self.n_outputs_ + all_imp[i, col] = baseline - curr_acc + for k in range(self.n_outputs_): predictions[k][unsampled_indices, :] += p_estimator[k] + if self.importance_type == 'permutation': + self._permutation_importance = all_imp.mean(axis=0) + for k in range(self.n_outputs_): if (predictions[k].sum(axis=1) == 0).any(): warn("Some inputs do not have OOB scores. " @@ -747,7 +771,8 @@ def __init__(self, random_state=None, verbose=0, warm_start=False, - max_samples=None): + max_samples=None, + importance_type='impurity'): super().__init__( base_estimator, n_estimators=n_estimators, @@ -817,18 +842,42 @@ def _set_oob_score(self, X, y): n_samples, self.max_samples ) - for estimator in self.estimators_: + all_imp = np.zeros((self.n_estimators, X.shape[1])) + for i, estimator in enumerate(self.estimators_): unsampled_indices = _generate_unsampled_indices( estimator.random_state, n_samples, n_samples_bootstrap) p_estimator = estimator.predict( X[unsampled_indices, :], check_input=False) + if self.n_outputs_ == 1: p_estimator = p_estimator[:, np.newaxis] + if self.importance_type == 'permutation': + baseline = 0 + for k in range(self.n_outputs_): + baseline += np.mean((y[unsampled_indices, k] - p_estimator[:, k]) ** 2) / self.n_outputs_ + # baseline += r2_score(y[unsampled_indices, k], p_estimator[:, k]) / self.n_outputs_ + for col in range(X.shape[1]): + X_permuted = X.copy() + np.random.shuffle(X_permuted[:, col]) + p_permuted = estimator.predict(X_permuted[unsampled_indices, :], + check_input=False) + if self.n_outputs_ == 1: + p_permuted = p_permuted[:, np.newaxis] + curr_acc = 0 + for k in 
range(self.n_outputs_): + curr_acc += np.mean((y[unsampled_indices, k] - p_permuted[:, k]) ** 2) / self.n_outputs_ + # curr_acc += r2_score(y[unsampled_indices, k], p_permuted[:, k]) / self.n_outputs_ + all_imp[i, col] = curr_acc - baseline + # all_imp[i, col] = baseline - curr_acc + predictions[unsampled_indices, :] += p_estimator n_predictions[unsampled_indices, :] += 1 + if self.importance_type == 'permutation': + self._permutation_importance = all_imp.mean(axis=0) + if (n_predictions == 0).any(): warn("Some inputs do not have OOB scores. " "This probably means too few trees were used " @@ -1170,7 +1219,8 @@ def __init__(self, warm_start=False, class_weight=None, ccp_alpha=0.0, - max_samples=None): + max_samples=None, + importance_type='impurity'): super().__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, @@ -1186,7 +1236,8 @@ def __init__(self, verbose=verbose, warm_start=warm_start, class_weight=class_weight, - max_samples=max_samples) + max_samples=max_samples, + importance_type=importance_type) self.criterion = criterion self.max_depth = max_depth @@ -1198,6 +1249,7 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.ccp_alpha = ccp_alpha + self.importance_type = importance_type class RandomForestRegressor(ForestRegressor): @@ -1460,7 +1512,8 @@ def __init__(self, verbose=0, warm_start=False, ccp_alpha=0.0, - max_samples=None): + max_samples=None, + importance_type='impurity'): super().__init__( base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, @@ -1475,7 +1528,8 @@ def __init__(self, random_state=random_state, verbose=verbose, warm_start=warm_start, - max_samples=max_samples) + max_samples=max_samples, + importance_type=importance_type) self.criterion = criterion self.max_depth = max_depth @@ -1487,6 +1541,7 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.ccp_alpha = 
ccp_alpha + self.importance_type = importance_type class ExtraTreesClassifier(ForestClassifier): @@ -1772,7 +1827,8 @@ def __init__(self, warm_start=False, class_weight=None, ccp_alpha=0.0, - max_samples=None): + max_samples=None, + importance_type='impurity'): super().__init__( base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, @@ -1788,7 +1844,8 @@ def __init__(self, verbose=verbose, warm_start=warm_start, class_weight=class_weight, - max_samples=max_samples) + max_samples=max_samples, + importance_type=importance_type) self.criterion = criterion self.max_depth = max_depth @@ -1800,6 +1857,7 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.ccp_alpha = ccp_alpha + self.importance_type = importance_type class ExtraTreesRegressor(ForestRegressor): @@ -2052,7 +2110,8 @@ def __init__(self, verbose=0, warm_start=False, ccp_alpha=0.0, - max_samples=None): + max_samples=None, + importance_type='impurity'): super().__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, @@ -2067,7 +2126,8 @@ def __init__(self, random_state=random_state, verbose=verbose, warm_start=warm_start, - max_samples=max_samples) + max_samples=max_samples, + importance_type=importance_type) self.criterion = criterion self.max_depth = max_depth @@ -2079,6 +2139,7 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.ccp_alpha = ccp_alpha + self.importance_type = importance_type class RandomTreesEmbedding(BaseForest): From a4984a956e1cf9fa7fdbae23cb13d7a55b93b44c Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 6 Oct 2020 23:32:58 -0400 Subject: [PATCH 02/53] Fixed RandomTreesEmbedding compatability error --- sklearn/ensemble/_forest.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index ef1aba40a7c02..45581a0ffdf49 100644 --- 
a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -193,7 +193,8 @@ def __init__(self, verbose=0, warm_start=False, class_weight=None, - max_samples=None): + max_samples=None, + importance_type='impurity'): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, @@ -509,7 +510,8 @@ def __init__(self, verbose=verbose, warm_start=warm_start, class_weight=class_weight, - max_samples=max_samples) + max_samples=max_samples, + importance_type=importance_type) def _set_oob_score(self, X, y): """ @@ -783,7 +785,8 @@ def __init__(self, random_state=random_state, verbose=verbose, warm_start=warm_start, - max_samples=max_samples) + max_samples=max_samples, + importance_type=importance_type) def predict(self, X): """ @@ -2346,6 +2349,7 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.sparse_output = sparse_output + self.importance_type = 'impurity' def _set_oob_score(self, X, y): raise NotImplementedError("OOB score not supported by tree embedding") From e46d3e27ef15402ac1f2f6295751d1b8d9c0c489 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Thu, 8 Oct 2020 16:12:47 -0400 Subject: [PATCH 03/53] Put permutation importance in its own method --- sklearn/ensemble/_forest.py | 160 +++++++++++++++++++++++++----------- 1 file changed, 111 insertions(+), 49 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 45581a0ffdf49..7e4cd7dcede02 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -396,9 +396,12 @@ def fit(self, X, y, sample_weight=None): # Collect newly grown trees self.estimators_.extend(trees) - if self.oob_score or (self.importance_type == 'permutation'): + if self.oob_score: self._set_oob_score(X, y) + if self.importance_type == "permutation": + self._set_oob_permutation_importance(X, y) + # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: self.n_classes_ = 
self.n_classes_[0] @@ -406,6 +409,11 @@ def fit(self, X, y, sample_weight=None): return self + @abstractmethod + def _set_oob_permutation_importance(self, X, y): + """ + Calculate out of bag predictions and score.""" + @abstractmethod def _set_oob_score(self, X, y): """ @@ -446,7 +454,7 @@ def feature_importances_(self): check_is_fitted(self) if self.importance_type == 'permutation': - return self._permutation_importance + return self._oob_permutation_importance all_importances = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))( @@ -513,6 +521,53 @@ def __init__(self, max_samples=max_samples, importance_type=importance_type) + def _set_oob_permutation_importance(self, X, y): + """ + Compute out-of-bag permutation importance.""" + X = check_array(X, dtype=DTYPE, accept_sparse='csr') + + n_samples = y.shape[0] + + n_predictions = np.zeros(n_samples) + + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples, self.max_samples + ) + + all_imp = np.zeros((self.n_estimators, X.shape[1])) + for i, estimator in enumerate(self.estimators_): + unsampled_indices = _generate_unsampled_indices( + estimator.random_state, n_samples, n_samples_bootstrap) + p_estimator = estimator.predict(X[unsampled_indices, :], + check_input=False) + + if self.n_outputs_ == 1: + p_estimator = p_estimator[:, np.newaxis] + + baseline = 0 + for k in range(self.n_outputs_): + baseline += np.mean(y[unsampled_indices, k] == p_estimator[:, k], axis=0) / self.n_outputs_ + for col in range(X.shape[1]): + X_permuted = X.copy() + np.random.shuffle(X_permuted[:, col]) + p_permuted = estimator.predict(X_permuted[unsampled_indices, :], + check_input=False) + if self.n_outputs_ == 1: + p_permuted = p_permuted[:, np.newaxis] + curr_acc = 0 + for k in range(self.n_outputs_): + curr_acc += np.mean(y[unsampled_indices, k] == p_permuted[:, k], axis=0) / self.n_outputs_ + all_imp[i, col] = baseline - curr_acc + + n_predictions[unsampled_indices] += 1 + + self._oob_permutation_importance = 
all_imp.mean(axis=0) + + if (n_predictions == 0).any(): + warn("Some inputs do not have OOB scores. " + "This probably means too few trees were used " + "to compute any reliable oob estimates.") + def _set_oob_score(self, X, y): """ Compute out-of-bag score.""" @@ -530,38 +585,18 @@ def _set_oob_score(self, X, y): n_samples, self.max_samples ) - all_imp = np.zeros((self.n_estimators, X.shape[1])) - for i, estimator in enumerate(self.estimators_): + for estimator in self.estimators_: unsampled_indices = _generate_unsampled_indices( estimator.random_state, n_samples, n_samples_bootstrap) p_estimator = estimator.predict_proba(X[unsampled_indices, :], - check_input=False) + check_input=False) if self.n_outputs_ == 1: p_estimator = [p_estimator] - if self.importance_type == 'permutation': - baseline = 0 - for k in range(self.n_outputs_): - baseline += np.mean(y[unsampled_indices, k] == np.argmax(p_estimator[k], axis=1), axis=0) / self.n_outputs_ - for col in range(X.shape[1]): - X_permuted = X.copy() - np.random.shuffle(X_permuted[:, col]) - p_permuted = estimator.predict(X_permuted[unsampled_indices, :], - check_input=False) - if self.n_outputs_ == 1: - p_permuted = p_permuted[:, np.newaxis] - curr_acc = 0 - for k in range(self.n_outputs_): - curr_acc += np.mean(y[unsampled_indices, k] == p_permuted[:, k], axis=0) / self.n_outputs_ - all_imp[i, col] = baseline - curr_acc - for k in range(self.n_outputs_): predictions[k][unsampled_indices, :] += p_estimator[k] - if self.importance_type == 'permutation': - self._permutation_importance = all_imp.mean(axis=0) - for k in range(self.n_outputs_): if (predictions[k].sum(axis=1) == 0).any(): warn("Some inputs do not have OOB scores. 
" @@ -831,15 +866,14 @@ def predict(self, X): return y_hat - def _set_oob_score(self, X, y): + def _set_oob_permutation_importance(self, X, y): """ - Compute out-of-bag scores.""" + Compute out-of-bag permutation importances.""" X = check_array(X, dtype=DTYPE, accept_sparse='csr') n_samples = y.shape[0] - predictions = np.zeros((n_samples, self.n_outputs_)) - n_predictions = np.zeros((n_samples, self.n_outputs_)) + n_predictions = np.zeros(n_samples) n_samples_bootstrap = _get_n_samples_bootstrap( n_samples, self.max_samples @@ -852,35 +886,59 @@ def _set_oob_score(self, X, y): p_estimator = estimator.predict( X[unsampled_indices, :], check_input=False) - if self.n_outputs_ == 1: p_estimator = p_estimator[:, np.newaxis] - if self.importance_type == 'permutation': - baseline = 0 + baseline = 0 + for k in range(self.n_outputs_): + baseline += r2_score(y[unsampled_indices, k], p_estimator[:, k]) / self.n_outputs_ + for col in range(X.shape[1]): + X_permuted = X.copy() + np.random.shuffle(X_permuted[:, col]) + p_permuted = estimator.predict(X_permuted[unsampled_indices, :], + check_input=False) + if self.n_outputs_ == 1: + p_permuted = p_permuted[:, np.newaxis] + curr_acc = 0 for k in range(self.n_outputs_): - baseline += np.mean((y[unsampled_indices, k] - p_estimator[:, k]) ** 2) / self.n_outputs_ - # baseline += r2_score(y[unsampled_indices, k], p_estimator[:, k]) / self.n_outputs_ - for col in range(X.shape[1]): - X_permuted = X.copy() - np.random.shuffle(X_permuted[:, col]) - p_permuted = estimator.predict(X_permuted[unsampled_indices, :], - check_input=False) - if self.n_outputs_ == 1: - p_permuted = p_permuted[:, np.newaxis] - curr_acc = 0 - for k in range(self.n_outputs_): - curr_acc += np.mean((y[unsampled_indices, k] - p_permuted[:, k]) ** 2) / self.n_outputs_ - # curr_acc += r2_score(y[unsampled_indices, k], p_permuted[:, k]) / self.n_outputs_ - all_imp[i, col] = curr_acc - baseline - # all_imp[i, col] = baseline - curr_acc + curr_acc += 
r2_score(y[unsampled_indices, k], p_permuted[:, k]) / self.n_outputs_ + all_imp[i, col] = baseline - curr_acc + + n_predictions[unsampled_indices] += 1 + + self._oob_permutation_importance = all_imp.mean(axis=0) + + if (n_predictions == 0).any(): + warn("Some inputs do not have OOB scores. " + "This probably means too few trees were used " + "to compute any reliable oob estimates.") + + def _set_oob_score(self, X, y): + """ + Compute out-of-bag scores.""" + X = check_array(X, dtype=DTYPE, accept_sparse='csr') + + n_samples = y.shape[0] + + predictions = np.zeros((n_samples, self.n_outputs_)) + n_predictions = np.zeros((n_samples, self.n_outputs_)) + + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples, self.max_samples + ) + + for estimator in self.estimators_: + unsampled_indices = _generate_unsampled_indices( + estimator.random_state, n_samples, n_samples_bootstrap) + p_estimator = estimator.predict( + X[unsampled_indices, :], check_input=False) + + if self.n_outputs_ == 1: + p_estimator = p_estimator[:, np.newaxis] predictions[unsampled_indices, :] += p_estimator n_predictions[unsampled_indices, :] += 1 - if self.importance_type == 'permutation': - self._permutation_importance = all_imp.mean(axis=0) - if (n_predictions == 0).any(): warn("Some inputs do not have OOB scores. 
" "This probably means too few trees were used " @@ -2351,6 +2409,10 @@ def __init__(self, self.sparse_output = sparse_output self.importance_type = 'impurity' + + def _set_oob_permutation_importance(self, X, y): + raise NotImplementedError("OOB permutation importance not supported by tree embedding") + def _set_oob_score(self, X, y): raise NotImplementedError("OOB score not supported by tree embedding") From 965e02a531cdf870ecd96f158b631d57e058deaa Mon Sep 17 00:00:00 2001 From: robert-robison Date: Fri, 9 Oct 2020 11:48:53 -0400 Subject: [PATCH 04/53] added tests and fixed random_state and formatting --- sklearn/ensemble/_forest.py | 86 +++++++++++++++++++-------- sklearn/ensemble/tests/test_forest.py | 85 ++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 26 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7e4cd7dcede02..0e1bb2d4b89f4 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -353,6 +353,10 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Out of bag estimation only available" " if bootstrap=True") + if not self.bootstrap and (self.importance_type == 'permutation'): + raise ValueError("Out of bag estimation only available" + " if bootstrap=True") + random_state = check_random_state(self.random_state) if not self.warm_start or not hasattr(self, "estimators_"): @@ -524,7 +528,7 @@ def __init__(self, def _set_oob_permutation_importance(self, X, y): """ Compute out-of-bag permutation importance.""" - X = check_array(X, dtype=DTYPE, accept_sparse='csr') + X = check_array(X, dtype=DTYPE, accept_sparse=False) n_samples = y.shape[0] @@ -536,37 +540,54 @@ def _set_oob_permutation_importance(self, X, y): all_imp = np.zeros((self.n_estimators, X.shape[1])) for i, estimator in enumerate(self.estimators_): + random_state = check_random_state(estimator.random_state) + unsampled_indices = _generate_unsampled_indices( - estimator.random_state, n_samples, n_samples_bootstrap) - 
p_estimator = estimator.predict(X[unsampled_indices, :], - check_input=False) + estimator.random_state, n_samples, n_samples_bootstrap + ) + p_estimator = estimator.predict( + X[unsampled_indices, :], check_input=False + ) if self.n_outputs_ == 1: p_estimator = p_estimator[:, np.newaxis] baseline = 0 for k in range(self.n_outputs_): - baseline += np.mean(y[unsampled_indices, k] == p_estimator[:, k], axis=0) / self.n_outputs_ + baseline += ( + np.mean( + y[unsampled_indices, k] == p_estimator[:, k], axis=0 + ) + / self.n_outputs_ + ) for col in range(X.shape[1]): X_permuted = X.copy() - np.random.shuffle(X_permuted[:, col]) - p_permuted = estimator.predict(X_permuted[unsampled_indices, :], - check_input=False) + random_state.shuffle(X_permuted[:, col]) + p_permuted = estimator.predict( + X_permuted[unsampled_indices, :], check_input=False + ) if self.n_outputs_ == 1: p_permuted = p_permuted[:, np.newaxis] curr_acc = 0 for k in range(self.n_outputs_): - curr_acc += np.mean(y[unsampled_indices, k] == p_permuted[:, k], axis=0) / self.n_outputs_ + curr_acc += ( + np.mean( + y[unsampled_indices, k] == p_permuted[:, k], axis=0 + ) + / self.n_outputs_ + ) all_imp[i, col] = baseline - curr_acc n_predictions[unsampled_indices] += 1 - self._oob_permutation_importance = all_imp.mean(axis=0) + self._oob_permutation_importance = all_imp.mean(axis=0) if (n_predictions == 0).any(): - warn("Some inputs do not have OOB scores. " - "This probably means too few trees were used " - "to compute any reliable oob estimates.") + warn( + "Some inputs do not have OOB scores. " + "This probably means too few trees were used " + "to compute any reliable oob estimates." 
+ ) def _set_oob_score(self, X, y): """ @@ -869,7 +890,7 @@ def predict(self, X): def _set_oob_permutation_importance(self, X, y): """ Compute out-of-bag permutation importances.""" - X = check_array(X, dtype=DTYPE, accept_sparse='csr') + X = check_array(X, dtype=DTYPE, accept_sparse=False) n_samples = y.shape[0] @@ -881,27 +902,38 @@ def _set_oob_permutation_importance(self, X, y): all_imp = np.zeros((self.n_estimators, X.shape[1])) for i, estimator in enumerate(self.estimators_): + random_state = check_random_state(estimator.random_state) + unsampled_indices = _generate_unsampled_indices( - estimator.random_state, n_samples, n_samples_bootstrap) + estimator.random_state, n_samples, n_samples_bootstrap + ) p_estimator = estimator.predict( - X[unsampled_indices, :], check_input=False) + X[unsampled_indices, :], check_input=False + ) if self.n_outputs_ == 1: p_estimator = p_estimator[:, np.newaxis] baseline = 0 for k in range(self.n_outputs_): - baseline += r2_score(y[unsampled_indices, k], p_estimator[:, k]) / self.n_outputs_ + baseline += ( + r2_score(y[unsampled_indices, k], p_estimator[:, k]) + / self.n_outputs_ + ) for col in range(X.shape[1]): X_permuted = X.copy() - np.random.shuffle(X_permuted[:, col]) - p_permuted = estimator.predict(X_permuted[unsampled_indices, :], - check_input=False) + random_state.shuffle(X_permuted[:, col]) + p_permuted = estimator.predict( + X_permuted[unsampled_indices, :], check_input=False + ) if self.n_outputs_ == 1: p_permuted = p_permuted[:, np.newaxis] curr_acc = 0 for k in range(self.n_outputs_): - curr_acc += r2_score(y[unsampled_indices, k], p_permuted[:, k]) / self.n_outputs_ + curr_acc += ( + r2_score(y[unsampled_indices, k], p_permuted[:, k]) + / self.n_outputs_ + ) all_imp[i, col] = baseline - curr_acc n_predictions[unsampled_indices] += 1 @@ -909,9 +941,11 @@ def _set_oob_permutation_importance(self, X, y): self._oob_permutation_importance = all_imp.mean(axis=0) if (n_predictions == 0).any(): - warn("Some inputs do not 
have OOB scores. " - "This probably means too few trees were used " - "to compute any reliable oob estimates.") + warn( + "Some inputs do not have OOB scores. " + "This probably means too few trees were used " + "to compute any reliable oob estimates." + ) def _set_oob_score(self, X, y): """ @@ -2409,9 +2443,9 @@ def __init__(self, self.sparse_output = sparse_output self.importance_type = 'impurity' - def _set_oob_permutation_importance(self, X, y): - raise NotImplementedError("OOB permutation importance not supported by tree embedding") + raise NotImplementedError("OOB permutation importance not supported " + "by tree embedding") def _set_oob_score(self, X, y): raise NotImplementedError("OOB score not supported by tree embedding") diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 65350f4d602d9..088d452530171 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -64,6 +64,11 @@ n_samples=500, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) +# Larger multiclass classification sample used for testing feature importances +X_large_multiclass, y_large_multiclass = datasets.make_classification( + n_samples=500, n_features=10, n_informative=3, n_redundant=0, + n_repeated=0, n_classes=3, shuffle=False, random_state=42) + # also load the iris dataset # and randomly permute it iris = datasets.load_iris() @@ -371,6 +376,86 @@ def test_unfitted_feature_importances(name): getattr(FOREST_ESTIMATORS[name](), 'feature_importances_') +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("dtype", (np.float64, np.float32)) +def test_oob_importances(name, dtype): + + X = X_large.astype(dtype=dtype, copy=False) + y = y_large.astype(dtype=dtype, copy=False) + + ForestEstimator = FOREST_ESTIMATORS[name] + + clf = ForestEstimator( + n_estimators=10, + random_state=0, + importance_type="permutation", + bootstrap=True, + 
) + clf.fit(X, y) + importances = clf.feature_importances_ + if name in FOREST_CLASSIFIERS: + imp_level = 0.025 + else: + imp_level = 0.1 + + n_important = np.sum(importances > imp_level) + assert importances.shape[0] == 10 + assert n_important == 3 + assert np.all(importances[:3] > imp_level) + + +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("dtype", (np.float64, np.float32)) +def test_oob_importances_multi_class(name, dtype): + + X = X_large_multiclass.astype(dtype=dtype, copy=False) + y = y_large_multiclass.astype(dtype=dtype, copy=False) + + ForestEstimator = FOREST_ESTIMATORS[name] + + clf = ForestEstimator( + n_estimators=10, + random_state=0, + importance_type="permutation", + bootstrap=True, + ) + clf.fit(X, y) + importances = clf.feature_importances_ + + n_important = np.sum(importances > 0.05) + assert importances.shape[0] == 10 + assert n_important == 3 + assert np.all(importances[:3] > 0.05) + + +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +def test_oob_importances_raise_error(name): + ForestEstimator = FOREST_ESTIMATORS[name] + + if name in FOREST_TRANSFORMERS: + assert_raises( + TypeError, ForestEstimator, importance_type="permutation" + ) + + assert_raises( + NotImplementedError, + ForestEstimator()._set_oob_permutation_importance, + X, + y, + ) + + else: + # No bootstrap + assert_raises( + ValueError, + ForestEstimator( + importance_type="permutation", bootstrap=False + ).fit, + X, + y, + ) + + def check_oob_score(name, X, y, n_estimators=20): # Check that oob prediction is a good estimation of the generalization # error. 
From 87fbeaedcc692724de138aea9329e681f5e095bd Mon Sep 17 00:00:00 2001 From: robert-robison Date: Sun, 10 Jan 2021 18:20:00 -0500 Subject: [PATCH 05/53] Add scorer, update example, changed param name --- .../inspection/plot_permutation_importance.py | 30 ++ sklearn/ensemble/_forest.py | 312 +++++++++--------- sklearn/ensemble/tests/test_forest.py | 8 +- 3 files changed, 183 insertions(+), 167 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index d708aa0fd6756..e87730ee4356d 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -176,3 +176,33 @@ ax.set_title("Permutation Importances (train set)") fig.tight_layout() plt.show() + +# %% +# Finally, the permutation importance can also be calculated using the +# out-of-bag data by setting ``feature_importances_type="permutation"`` and +# re-running the pipeline. This confirms that ``sex`` is most important +# and that the random features have low importances. 
+rf = Pipeline([ + ('preprocess', preprocessing), + ('classifier', RandomForestClassifier(random_state=42, feature_importances_type="permutation")) +]) +rf.fit(X_train, y_train) + +ohe = (rf.named_steps['preprocess'] + .named_transformers_['cat'] + .named_steps['onehot']) +feature_names = ohe.get_feature_names(input_features=categorical_columns) +feature_names = np.r_[feature_names, numerical_columns] + +tree_feature_importances = ( + rf.named_steps['classifier'].feature_importances_) +sorted_idx = tree_feature_importances.argsort() + +y_ticks = np.arange(0, len(feature_names)) +fig, ax = plt.subplots() +ax.barh(y_ticks, tree_feature_importances[sorted_idx]) +ax.set_yticklabels(feature_names[sorted_idx]) +ax.set_yticks(y_ticks) +ax.set_title("Random Forest Feature Importances (MDI)") +fig.tight_layout() +plt.show() diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 0e1bb2d4b89f4..d5832945851b5 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -51,7 +51,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from joblib import Parallel from ..base import ClassifierMixin, RegressorMixin, MultiOutputMixin -from ..metrics import r2_score +from ..metrics import check_scoring, r2_score from ..preprocessing import OneHotEncoder from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor) @@ -194,7 +194,8 @@ def __init__(self, warm_start=False, class_weight=None, max_samples=None, - importance_type='impurity'): + feature_importances_type='impurity', + scoring=None): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, @@ -208,6 +209,8 @@ def __init__(self, self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples + self.feature_importances_type = feature_importances_type + self.scoring = scoring def apply(self, X): """ @@ -353,7 +356,7 @@ def fit(self, X, y, sample_weight=None): raise 
ValueError("Out of bag estimation only available" " if bootstrap=True") - if not self.bootstrap and (self.importance_type == 'permutation'): + if not self.bootstrap and (self.feature_importances_type == 'permutation'): raise ValueError("Out of bag estimation only available" " if bootstrap=True") @@ -403,7 +406,7 @@ def fit(self, X, y, sample_weight=None): if self.oob_score: self._set_oob_score(X, y) - if self.importance_type == "permutation": + if self.feature_importances_type == "permutation": self._set_oob_permutation_importance(X, y) # Decapsulate classes_ attributes @@ -413,10 +416,46 @@ def fit(self, X, y, sample_weight=None): return self - @abstractmethod def _set_oob_permutation_importance(self, X, y): """ Calculate out of bag predictions and score.""" + X = check_array(X, dtype=DTYPE, accept_sparse=False) + + n_samples = y.shape[0] + + n_predictions = np.zeros(n_samples) + + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples, self.max_samples + ) + + all_imp = np.zeros((self.n_estimators, X.shape[1])) + for i, estimator in enumerate(self.estimators_): + random_state = check_random_state(estimator.random_state) + + unsampled_indices = _generate_unsampled_indices( + estimator.random_state, n_samples, n_samples_bootstrap + ) + + scorer = check_scoring(estimator, scoring=self.scoring) + baseline = scorer(estimator, X[unsampled_indices, :], y[unsampled_indices, :]) + + for col in range(X.shape[1]): + X_permuted = X.copy() + random_state.shuffle(X_permuted[:, col]) + curr_perf = scorer(estimator, X_permuted[unsampled_indices, :], y[unsampled_indices, :]) + all_imp[i, col] = baseline - curr_perf + + n_predictions[unsampled_indices] += 1 + + self._oob_permutation_importance = all_imp.mean(axis=0) + + if (n_predictions == 0).any(): + warn( + "Some inputs do not have OOB scores. " + "This probably means too few trees were used " + "to compute any reliable oob estimates." 
+ ) @abstractmethod def _set_oob_score(self, X, y): @@ -457,7 +496,7 @@ def feature_importances_(self): """ check_is_fitted(self) - if self.importance_type == 'permutation': + if self.feature_importances_type == 'permutation': return self._oob_permutation_importance all_importances = Parallel(n_jobs=self.n_jobs, @@ -510,7 +549,8 @@ def __init__(self, warm_start=False, class_weight=None, max_samples=None, - importance_type='impurity'): + feature_importances_type='impurity', + scoring=None): super().__init__( base_estimator, n_estimators=n_estimators, @@ -523,71 +563,8 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - importance_type=importance_type) - - def _set_oob_permutation_importance(self, X, y): - """ - Compute out-of-bag permutation importance.""" - X = check_array(X, dtype=DTYPE, accept_sparse=False) - - n_samples = y.shape[0] - - n_predictions = np.zeros(n_samples) - - n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples, self.max_samples - ) - - all_imp = np.zeros((self.n_estimators, X.shape[1])) - for i, estimator in enumerate(self.estimators_): - random_state = check_random_state(estimator.random_state) - - unsampled_indices = _generate_unsampled_indices( - estimator.random_state, n_samples, n_samples_bootstrap - ) - p_estimator = estimator.predict( - X[unsampled_indices, :], check_input=False - ) - - if self.n_outputs_ == 1: - p_estimator = p_estimator[:, np.newaxis] - - baseline = 0 - for k in range(self.n_outputs_): - baseline += ( - np.mean( - y[unsampled_indices, k] == p_estimator[:, k], axis=0 - ) - / self.n_outputs_ - ) - for col in range(X.shape[1]): - X_permuted = X.copy() - random_state.shuffle(X_permuted[:, col]) - p_permuted = estimator.predict( - X_permuted[unsampled_indices, :], check_input=False - ) - if self.n_outputs_ == 1: - p_permuted = p_permuted[:, np.newaxis] - curr_acc = 0 - for k in range(self.n_outputs_): - curr_acc += ( - np.mean( - y[unsampled_indices, k] == p_permuted[:, 
k], axis=0 - ) - / self.n_outputs_ - ) - all_imp[i, col] = baseline - curr_acc - - n_predictions[unsampled_indices] += 1 - - self._oob_permutation_importance = all_imp.mean(axis=0) - - if (n_predictions == 0).any(): - warn( - "Some inputs do not have OOB scores. " - "This probably means too few trees were used " - "to compute any reliable oob estimates." - ) + feature_importances_type=feature_importances_type, + scoring=scoring) def _set_oob_score(self, X, y): """ @@ -830,7 +807,8 @@ def __init__(self, verbose=0, warm_start=False, max_samples=None, - importance_type='impurity'): + feature_importances_type='impurity', + scoring=None): super().__init__( base_estimator, n_estimators=n_estimators, @@ -842,7 +820,8 @@ def __init__(self, verbose=verbose, warm_start=warm_start, max_samples=max_samples, - importance_type=importance_type) + feature_importances_type=feature_importances_type, + scoring=scoring) def predict(self, X): """ @@ -887,66 +866,6 @@ def predict(self, X): return y_hat - def _set_oob_permutation_importance(self, X, y): - """ - Compute out-of-bag permutation importances.""" - X = check_array(X, dtype=DTYPE, accept_sparse=False) - - n_samples = y.shape[0] - - n_predictions = np.zeros(n_samples) - - n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples, self.max_samples - ) - - all_imp = np.zeros((self.n_estimators, X.shape[1])) - for i, estimator in enumerate(self.estimators_): - random_state = check_random_state(estimator.random_state) - - unsampled_indices = _generate_unsampled_indices( - estimator.random_state, n_samples, n_samples_bootstrap - ) - p_estimator = estimator.predict( - X[unsampled_indices, :], check_input=False - ) - - if self.n_outputs_ == 1: - p_estimator = p_estimator[:, np.newaxis] - - baseline = 0 - for k in range(self.n_outputs_): - baseline += ( - r2_score(y[unsampled_indices, k], p_estimator[:, k]) - / self.n_outputs_ - ) - for col in range(X.shape[1]): - X_permuted = X.copy() - random_state.shuffle(X_permuted[:, col]) - 
p_permuted = estimator.predict( - X_permuted[unsampled_indices, :], check_input=False - ) - if self.n_outputs_ == 1: - p_permuted = p_permuted[:, np.newaxis] - curr_acc = 0 - for k in range(self.n_outputs_): - curr_acc += ( - r2_score(y[unsampled_indices, k], p_permuted[:, k]) - / self.n_outputs_ - ) - all_imp[i, col] = baseline - curr_acc - - n_predictions[unsampled_indices] += 1 - - self._oob_permutation_importance = all_imp.mean(axis=0) - - if (n_predictions == 0).any(): - warn( - "Some inputs do not have OOB scores. " - "This probably means too few trees were used " - "to compute any reliable oob estimates." - ) - def _set_oob_score(self, X, y): """ Compute out-of-bag scores.""" @@ -1213,6 +1132,23 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 + feature_importances_type : {"impurity", "permutation"}, default="impurity" + The type of feature importance to calculate: + + - If "impurity", then gets impurity-based feature importance. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + - If "permutation", then gets out-of-bag permutation importance. + The importance corresponds with the average decrease in r2_score + across all tree when a feature is permuted or shuffled. + + scoring : string, callable, or None, default=None + Model evaluation used if feature_importance_type = "permutation". + A string (see model evaluation documentation) or a scorer callable + object / function with signature `scorer(estimator, X, y)`. + Defaults to accuracy for classification and `r2_score` for regression. + Attributes ---------- base_estimator_ : DecisionTreeClassifier @@ -1239,12 +1175,11 @@ class labels (multi-output problem). feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. The higher, the more important the feature. 
- The importance of a feature is computed as the (normalized) - total reduction of the criterion brought by that feature. It is also - known as the Gini importance. + See ``feature_importances_type`` for how these are calculated. Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). See + high cardinality features (many unique values). + Consider setting ``feature_importances_type="permutation"`` or using :func:`sklearn.inspection.permutation_importance` as an alternative. oob_score_ : float @@ -1315,7 +1250,8 @@ def __init__(self, class_weight=None, ccp_alpha=0.0, max_samples=None, - importance_type='impurity'): + feature_importances_type='impurity', + scoring=None): super().__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, @@ -1332,7 +1268,8 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - importance_type=importance_type) + feature_importances_type=feature_importances_type, + scoring=scoring) self.criterion = criterion self.max_depth = max_depth @@ -1344,7 +1281,6 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.ccp_alpha = ccp_alpha - self.importance_type = importance_type class RandomForestRegressor(ForestRegressor): @@ -1512,6 +1448,23 @@ class RandomForestRegressor(ForestRegressor): .. versionadded:: 0.22 + feature_importances_type : {"impurity", "permutation"}, default="impurity" + The type of feature importance to calculate: + + - If "impurity", then gets impurity-based feature importance. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + - If "permutation", then gets out-of-bag permutation importance. + The importance corresponds with the average decrease in r2_score + across all tree when a feature is permuted or shuffled. 
+ + scoring : string, callable, or None, default=None + Model evaluation used if feature_importance_type = "permutation". + A string (see model evaluation documentation) or a scorer callable + object / function with signature `scorer(estimator, X, y)`. + Defaults to accuracy for classification and `r2_score` for regression. + Attributes ---------- base_estimator_ : DecisionTreeRegressor @@ -1524,12 +1477,11 @@ class RandomForestRegressor(ForestRegressor): feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. The higher, the more important the feature. - The importance of a feature is computed as the (normalized) - total reduction of the criterion brought by that feature. It is also - known as the Gini importance. + See ``feature_importances_type`` for how these are calculated. Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). See + high cardinality features (many unique values). + Consider setting ``feature_importances_type="permutation"`` or using :func:`sklearn.inspection.permutation_importance` as an alternative. 
n_features_ : int @@ -1608,7 +1560,8 @@ def __init__(self, warm_start=False, ccp_alpha=0.0, max_samples=None, - importance_type='impurity'): + feature_importances_type='impurity', + scoring=None): super().__init__( base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, @@ -1624,7 +1577,8 @@ def __init__(self, verbose=verbose, warm_start=warm_start, max_samples=max_samples, - importance_type=importance_type) + feature_importances_type=feature_importances_type, + scoring=scoring) self.criterion = criterion self.max_depth = max_depth @@ -1636,7 +1590,6 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.ccp_alpha = ccp_alpha - self.importance_type = importance_type class ExtraTreesClassifier(ForestClassifier): @@ -1827,6 +1780,23 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 + feature_importances_type : {"impurity", "permutation"}, default="impurity" + The type of feature importance to calculate: + + - If "impurity", then gets impurity-based feature importance. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + - If "permutation", then gets out-of-bag permutation importance. + The importance corresponds with the average decrease in r2_score + across all tree when a feature is permuted or shuffled. + + scoring : string, callable, or None, default=None + Model evaluation used if feature_importance_type = "permutation". + A string (see model evaluation documentation) or a scorer callable + object / function with signature `scorer(estimator, X, y)`. + Defaults to accuracy for classification and `r2_score` for regression. + Attributes ---------- base_estimator_ : ExtraTreesClassifier @@ -1847,12 +1817,11 @@ class labels (multi-output problem). feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. 
The higher, the more important the feature. - The importance of a feature is computed as the (normalized) - total reduction of the criterion brought by that feature. It is also - known as the Gini importance. + See ``feature_importances_type`` for how these are calculated. Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). See + high cardinality features (many unique values). + Consider setting ``feature_importances_type="permutation"`` or using :func:`sklearn.inspection.permutation_importance` as an alternative. n_features_ : int @@ -1923,7 +1892,8 @@ def __init__(self, class_weight=None, ccp_alpha=0.0, max_samples=None, - importance_type='impurity'): + feature_importances_type='impurity', + scoring=None): super().__init__( base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, @@ -1940,7 +1910,8 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - importance_type=importance_type) + feature_importances_type=feature_importances_type, + scoring=scoring) self.criterion = criterion self.max_depth = max_depth @@ -1952,7 +1923,6 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.ccp_alpha = ccp_alpha - self.importance_type = importance_type class ExtraTreesRegressor(ForestRegressor): @@ -2121,6 +2091,23 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 + feature_importances_type : {"impurity", "permutation"}, default="impurity" + The type of feature importance to calculate: + + - If "impurity", then gets impurity-based feature importance. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance. + - If "permutation", then gets out-of-bag permutation importance. 
+ The importance corresponds with the average decrease in performance + across all tree when a feature is permuted or shuffled. + + scoring : string, callable, or None, default=None + Model evaluation used if feature_importance_type = "permutation". + A string (see model evaluation documentation) or a scorer callable + object / function with signature `scorer(estimator, X, y)`. + Defaults to accuracy for classification and `r2_score` for regression. + Attributes ---------- base_estimator_ : ExtraTreeRegressor @@ -2133,12 +2120,11 @@ class ExtraTreesRegressor(ForestRegressor): feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. The higher, the more important the feature. - The importance of a feature is computed as the (normalized) - total reduction of the criterion brought by that feature. It is also - known as the Gini importance. + See ``feature_importances_type`` for how these are calculated. Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). See + high cardinality features (many unique values). + Consider setting ``feature_importances_type="permutation"`` or using :func:`sklearn.inspection.permutation_importance` as an alternative. 
n_features_ : int @@ -2206,7 +2192,8 @@ def __init__(self, warm_start=False, ccp_alpha=0.0, max_samples=None, - importance_type='impurity'): + feature_importances_type='impurity', + scoring=None): super().__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, @@ -2222,7 +2209,8 @@ def __init__(self, verbose=verbose, warm_start=warm_start, max_samples=max_samples, - importance_type=importance_type) + feature_importances_type=feature_importances_type, + scoring=scoring) self.criterion = criterion self.max_depth = max_depth @@ -2234,7 +2222,6 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.ccp_alpha = ccp_alpha - self.importance_type = importance_type class RandomTreesEmbedding(BaseForest): @@ -2441,7 +2428,6 @@ def __init__(self, self.min_impurity_decrease = min_impurity_decrease self.min_impurity_split = min_impurity_split self.sparse_output = sparse_output - self.importance_type = 'impurity' def _set_oob_permutation_importance(self, X, y): raise NotImplementedError("OOB permutation importance not supported " diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 088d452530171..5781c3fdb1517 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -388,7 +388,7 @@ def test_oob_importances(name, dtype): clf = ForestEstimator( n_estimators=10, random_state=0, - importance_type="permutation", + feature_importances_type="permutation", bootstrap=True, ) clf.fit(X, y) @@ -416,7 +416,7 @@ def test_oob_importances_multi_class(name, dtype): clf = ForestEstimator( n_estimators=10, random_state=0, - importance_type="permutation", + feature_importances_type="permutation", bootstrap=True, ) clf.fit(X, y) @@ -434,7 +434,7 @@ def test_oob_importances_raise_error(name): if name in FOREST_TRANSFORMERS: assert_raises( - TypeError, ForestEstimator, importance_type="permutation" + TypeError, ForestEstimator, 
feature_importances_type="permutation" ) assert_raises( @@ -449,7 +449,7 @@ def test_oob_importances_raise_error(name): assert_raises( ValueError, ForestEstimator( - importance_type="permutation", bootstrap=False + feature_importances_type="permutation", bootstrap=False ).fit, X, y, From 98a94a637bdba5cd6d04f5863af19b201626495c Mon Sep 17 00:00:00 2001 From: robert-robison Date: Sun, 10 Jan 2021 22:11:26 -0500 Subject: [PATCH 06/53] formatting --- .../inspection/plot_permutation_importance.py | 15 +++++++++++---- sklearn/ensemble/_forest.py | 14 +++++++++++--- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 0ac796d44674d..ce6af2a09e7c2 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -178,10 +178,17 @@ # out-of-bag data by setting ``feature_importances_type="permutation"`` and # re-running the pipeline. This confirms that ``sex``` is most important # and that the random features have low importances. 
-rf = Pipeline([ - ('preprocess', preprocessing), - ('classifier', RandomForestClassifier(random_state=42, feature_importances_type="permutation")) -]) +rf = Pipeline( + [ + ("preprocess", preprocessing), + ( + "classifier", + RandomForestClassifier( + random_state=42, feature_importances_type="permutation" + ), + ), + ] +) rf.fit(X_train, y_train) ohe = (rf.named_steps['preprocess'] diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 746ff16312c8f..2d34851e34177 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -356,7 +356,9 @@ def fit(self, X, y, sample_weight=None): raise ValueError("Out of bag estimation only available" " if bootstrap=True") - if not self.bootstrap and (self.feature_importances_type == 'permutation'): + if not self.bootstrap and ( + self.feature_importances_type == "permutation" + ): raise ValueError("Out of bag estimation only available" " if bootstrap=True") @@ -438,12 +440,18 @@ def _set_oob_permutation_importance(self, X, y): ) scorer = check_scoring(estimator, scoring=self.scoring) - baseline = scorer(estimator, X[unsampled_indices, :], y[unsampled_indices, :]) + baseline = scorer( + estimator, X[unsampled_indices, :], y[unsampled_indices, :] + ) for col in range(X.shape[1]): X_permuted = X.copy() random_state.shuffle(X_permuted[:, col]) - curr_perf = scorer(estimator, X_permuted[unsampled_indices, :], y[unsampled_indices, :]) + curr_perf = scorer( + estimator, + X_permuted[unsampled_indices, :], + y[unsampled_indices, :], + ) all_imp[i, col] = baseline - curr_perf n_predictions[unsampled_indices] += 1 From 93470c93c2da5de760d5095c7638a8317f3fd5e3 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Mon, 11 Jan 2021 18:21:16 -0500 Subject: [PATCH 07/53] fixed bug in example --- examples/inspection/plot_permutation_importance.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py 
b/examples/inspection/plot_permutation_importance.py index ce6af2a09e7c2..d1641469a944a 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -192,8 +192,7 @@ rf.fit(X_train, y_train) ohe = (rf.named_steps['preprocess'] - .named_transformers_['cat'] - .named_steps['onehot']) + .named_transformers_['cat']) feature_names = ohe.get_feature_names(input_features=categorical_columns) feature_names = np.r_[feature_names, numerical_columns] @@ -206,6 +205,6 @@ ax.barh(y_ticks, tree_feature_importances[sorted_idx]) ax.set_yticklabels(feature_names[sorted_idx]) ax.set_yticks(y_ticks) -ax.set_title("Random Forest Feature Importances (MDI)") +ax.set_title("Random Forest Feature Importances (OOB Permutation)") fig.tight_layout() plt.show() From 4b1f394997aab78f3e0160f2851ab0c7b31c1a90 Mon Sep 17 00:00:00 2001 From: robert-robison <69172120+robert-robison@users.noreply.github.com> Date: Tue, 12 Jan 2021 10:35:54 -0500 Subject: [PATCH 08/53] Apply suggestions from code review Co-authored-by: Guillaume Lemaitre --- .../inspection/plot_permutation_importance.py | 16 +++++----------- sklearn/ensemble/_forest.py | 16 +++++++++------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index d1641469a944a..2a4605c99d6e7 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -178,17 +178,11 @@ # out-of-bag data by setting ``feature_importances_type="permutation"`` and # re-running the pipeline. This confirms that ``sex``` is most important # and that the random features have low importances. 
-rf = Pipeline( - [ - ("preprocess", preprocessing), - ( - "classifier", - RandomForestClassifier( - random_state=42, feature_importances_type="permutation" - ), - ), - ] -) +rf = Pipeline(steps=[ + ("preprocess", preprocessing), + ("classifier", RandomForestClassifier( + random_state=42, feature_importances_type="permutation)) +]) rf.fit(X_train, y_train) ohe = (rf.named_steps['preprocess'] diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 2d34851e34177..f4c75f03bbf1a 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -359,8 +359,9 @@ def fit(self, X, y, sample_weight=None): if not self.bootstrap and ( self.feature_importances_type == "permutation" ): - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + raise ValueError( + "Out of bag estimation only available if bootstrap=True" + ) random_state = check_random_state(self.random_state) @@ -419,8 +420,7 @@ def fit(self, X, y, sample_weight=None): return self def _set_oob_permutation_importance(self, X, y): - """ - Calculate out of bag predictions and score.""" + """Compute feature importances from the out-of-bag samples.""" X = check_array(X, dtype=DTYPE, accept_sparse=False) n_samples = y.shape[0] @@ -1141,15 +1141,17 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 feature_importances_type : {"impurity", "permutation"}, default="impurity" - The type of feature importance to calculate: + The type of feature importance to compute: - If "impurity", then gets impurity-based feature importance. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. - If "permutation", then gets out-of-bag permutation importance. - The importance corresponds with the average decrease in r2_score - across all tree when a feature is permuted or shuffled. 
+ The importance corresponds with the average decrease in R2 + across all trees when a feature is permuted or shuffled. + + .. versionadded: 1.0 scoring : string, callable, or None, default=None Model evaluation used if feature_importance_type = "permutation". From dfeaf52fcc326a67b1cd2e63da33c8e2e7ed2e0a Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 12 Jan 2021 16:47:40 -0500 Subject: [PATCH 09/53] parallelized, removed scoring, fixed tests --- .../inspection/plot_permutation_importance.py | 2 +- sklearn/ensemble/_forest.py | 147 +++++++----------- sklearn/ensemble/tests/test_forest.py | 53 +++---- 3 files changed, 76 insertions(+), 126 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 2a4605c99d6e7..56303b2593344 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -181,7 +181,7 @@ rf = Pipeline(steps=[ ("preprocess", preprocessing), ("classifier", RandomForestClassifier( - random_state=42, feature_importances_type="permutation)) + random_state=42, feature_importances_type="permutation")) ]) rf.fit(X_train, y_train) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f4c75f03bbf1a..b5c35fb535d2c 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -51,12 +51,13 @@ class calls the ``fit`` method of each sub-estimator on random samples from joblib import Parallel from ..base import ClassifierMixin, RegressorMixin, MultiOutputMixin -from ..metrics import check_scoring, r2_score +from ..metrics import r2_score from ..preprocessing import OneHotEncoder from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, ExtraTreeClassifier, ExtraTreeRegressor) from ..tree._tree import DTYPE, DOUBLE from ..utils import check_random_state, check_array, compute_sample_weight +from ..utils import Bunch from ..exceptions import DataConversionWarning from ._base 
import BaseEnsemble, _partition_estimators from ..utils.fixes import delayed @@ -194,8 +195,7 @@ def __init__(self, warm_start=False, class_weight=None, max_samples=None, - feature_importances_type='impurity', - scoring=None): + feature_importances_type='impurity'): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, @@ -210,7 +210,6 @@ def __init__(self, self.class_weight = class_weight self.max_samples = max_samples self.feature_importances_type = feature_importances_type - self.scoring = scoring def apply(self, X): """ @@ -410,7 +409,7 @@ def fit(self, X, y, sample_weight=None): self._set_oob_score(X, y) if self.feature_importances_type == "permutation": - self._set_oob_permutation_importance(X, y) + self._set_oob_permutation_importance(X, y, sample_weight) # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: @@ -419,51 +418,53 @@ def fit(self, X, y, sample_weight=None): return self - def _set_oob_permutation_importance(self, X, y): - """Compute feature importances from the out-of-bag samples.""" - X = check_array(X, dtype=DTYPE, accept_sparse=False) + def _get_tree_oob_performance(self, estimator, X, y, n_samples, + n_samples_bootstrap, sample_weight): + """Get out-of-bag performance for a single tree""" + random_state = check_random_state(estimator.random_state) - n_samples = y.shape[0] + unsampled_indices = _generate_unsampled_indices( + estimator.random_state, n_samples, n_samples_bootstrap + ) - n_predictions = np.zeros(n_samples) + from ..inspection._permutation_importance \ + import permutation_importance - n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples, self.max_samples + result = permutation_importance( + estimator, X[unsampled_indices, :], y[unsampled_indices, :], + n_repeats=1, n_jobs=self.n_jobs, random_state=random_state, + sample_weight=sample_weight[unsampled_indices] ) - all_imp = np.zeros((self.n_estimators, X.shape[1])) - for i, estimator in enumerate(self.estimators_): - 
random_state = check_random_state(estimator.random_state) + return result.importances_mean - unsampled_indices = _generate_unsampled_indices( - estimator.random_state, n_samples, n_samples_bootstrap - ) + def _set_oob_permutation_importance(self, X, y, sample_weight): + """Compute feature importances from the out-of-bag samples.""" + X = check_array(X, dtype=DTYPE, accept_sparse='csr') - scorer = check_scoring(estimator, scoring=self.scoring) - baseline = scorer( - estimator, X[unsampled_indices, :], y[unsampled_indices, :] - ) + n_samples = y.shape[0] - for col in range(X.shape[1]): - X_permuted = X.copy() - random_state.shuffle(X_permuted[:, col]) - curr_perf = scorer( - estimator, - X_permuted[unsampled_indices, :], - y[unsampled_indices, :], - ) - all_imp[i, col] = baseline - curr_perf + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples, self.max_samples + ) - n_predictions[unsampled_indices] += 1 + all_imp = np.array(Parallel(n_jobs=self.n_jobs)\ + (delayed(self._get_tree_oob_performance)( + estimator, X, y, n_samples, n_samples_bootstrap, sample_weight + ) for estimator in self.estimators_)) - self._oob_permutation_importance = all_imp.mean(axis=0) + self._oob_permutation_importance = Bunch( + importances_mean=np.mean(all_imp, axis=0), + importances_std=np.std(all_imp, axis=0), + importances=all_imp + ) - if (n_predictions == 0).any(): - warn( - "Some inputs do not have OOB scores. " - "This probably means too few trees were used " - "to compute any reliable oob estimates." - ) + # if (n_predictions == 0).any(): + # warn( + # "Some inputs do not have OOB scores. " + # "This probably means too few trees were used " + # "to compute any reliable oob estimates." 
+ # ) @abstractmethod def _set_oob_score(self, X, y): @@ -505,7 +506,7 @@ def feature_importances_(self): check_is_fitted(self) if self.feature_importances_type == 'permutation': - return self._oob_permutation_importance + return self._oob_permutation_importance.importances_mean all_importances = Parallel(n_jobs=self.n_jobs, **_joblib_parallel_args(prefer='threads'))( @@ -557,8 +558,7 @@ def __init__(self, warm_start=False, class_weight=None, max_samples=None, - feature_importances_type='impurity', - scoring=None): + feature_importances_type='impurity'): super().__init__( base_estimator, n_estimators=n_estimators, @@ -571,8 +571,7 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - feature_importances_type=feature_importances_type, - scoring=scoring) + feature_importances_type=feature_importances_type) def _set_oob_score(self, X, y): """ @@ -815,8 +814,7 @@ def __init__(self, verbose=0, warm_start=False, max_samples=None, - feature_importances_type='impurity', - scoring=None): + feature_importances_type='impurity'): super().__init__( base_estimator, n_estimators=n_estimators, @@ -828,8 +826,7 @@ def __init__(self, verbose=verbose, warm_start=warm_start, max_samples=max_samples, - feature_importances_type=feature_importances_type, - scoring=scoring) + feature_importances_type=feature_importances_type) def predict(self, X): """ @@ -1150,14 +1147,8 @@ class RandomForestClassifier(ForestClassifier): - If "permutation", then gets out-of-bag permutation importance. The importance corresponds with the average decrease in R2 across all trees when a feature is permuted or shuffled. - - .. versionadded: 1.0 - scoring : string, callable, or None, default=None - Model evaluation used if feature_importance_type = "permutation". - A string (see model evaluation documentation) or a scorer callable - object / function with signature `scorer(estimator, X, y)`. - Defaults to accuracy for classification and `r2_score` for regression. 
+ .. versionadded: 1.0 Attributes ---------- @@ -1260,8 +1251,7 @@ def __init__(self, class_weight=None, ccp_alpha=0.0, max_samples=None, - feature_importances_type='impurity', - scoring=None): + feature_importances_type='impurity'): super().__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, @@ -1278,8 +1268,7 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - feature_importances_type=feature_importances_type, - scoring=scoring) + feature_importances_type=feature_importances_type) self.criterion = criterion self.max_depth = max_depth @@ -1470,12 +1459,6 @@ class RandomForestRegressor(ForestRegressor): The importance corresponds with the average decrease in r2_score across all tree when a feature is permuted or shuffled. - scoring : string, callable, or None, default=None - Model evaluation used if feature_importance_type = "permutation". - A string (see model evaluation documentation) or a scorer callable - object / function with signature `scorer(estimator, X, y)`. - Defaults to accuracy for classification and `r2_score` for regression. - Attributes ---------- base_estimator_ : DecisionTreeRegressor @@ -1571,8 +1554,7 @@ def __init__(self, warm_start=False, ccp_alpha=0.0, max_samples=None, - feature_importances_type='impurity', - scoring=None): + feature_importances_type='impurity'): super().__init__( base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, @@ -1588,8 +1570,7 @@ def __init__(self, verbose=verbose, warm_start=warm_start, max_samples=max_samples, - feature_importances_type=feature_importances_type, - scoring=scoring) + feature_importances_type=feature_importances_type) self.criterion = criterion self.max_depth = max_depth @@ -1803,12 +1784,6 @@ class ExtraTreesClassifier(ForestClassifier): The importance corresponds with the average decrease in r2_score across all tree when a feature is permuted or shuffled. 
- scoring : string, callable, or None, default=None - Model evaluation used if feature_importance_type = "permutation". - A string (see model evaluation documentation) or a scorer callable - object / function with signature `scorer(estimator, X, y)`. - Defaults to accuracy for classification and `r2_score` for regression. - Attributes ---------- base_estimator_ : ExtraTreesClassifier @@ -1904,8 +1879,7 @@ def __init__(self, class_weight=None, ccp_alpha=0.0, max_samples=None, - feature_importances_type='impurity', - scoring=None): + feature_importances_type='impurity'): super().__init__( base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, @@ -1922,8 +1896,7 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - feature_importances_type=feature_importances_type, - scoring=scoring) + feature_importances_type=feature_importances_type) self.criterion = criterion self.max_depth = max_depth @@ -2115,12 +2088,6 @@ class ExtraTreesRegressor(ForestRegressor): The importance corresponds with the average decrease in performance across all tree when a feature is permuted or shuffled. - scoring : string, callable, or None, default=None - Model evaluation used if feature_importance_type = "permutation". - A string (see model evaluation documentation) or a scorer callable - object / function with signature `scorer(estimator, X, y)`. - Defaults to accuracy for classification and `r2_score` for regression. 
- Attributes ---------- base_estimator_ : ExtraTreeRegressor @@ -2205,8 +2172,7 @@ def __init__(self, warm_start=False, ccp_alpha=0.0, max_samples=None, - feature_importances_type='impurity', - scoring=None): + feature_importances_type='impurity'): super().__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, @@ -2222,8 +2188,7 @@ def __init__(self, verbose=verbose, warm_start=warm_start, max_samples=max_samples, - feature_importances_type=feature_importances_type, - scoring=scoring) + feature_importances_type=feature_importances_type) self.criterion = criterion self.max_depth = max_depth @@ -2443,10 +2408,6 @@ def __init__(self, self.min_impurity_split = min_impurity_split self.sparse_output = sparse_output - def _set_oob_permutation_importance(self, X, y): - raise NotImplementedError("OOB permutation importance not supported " - "by tree embedding") - def _set_oob_score(self, X, y): raise NotImplementedError("OOB score not supported by tree embedding") diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 5781c3fdb1517..f6b17fbdcd354 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -377,11 +377,9 @@ def test_unfitted_feature_importances(name): @pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) -@pytest.mark.parametrize("dtype", (np.float64, np.float32)) -def test_oob_importances(name, dtype): - - X = X_large.astype(dtype=dtype, copy=False) - y = y_large.astype(dtype=dtype, copy=False) +def test_oob_importances(name): + # Check that oob permutation importances correctly identify that + # there are 3 important features ForestEstimator = FOREST_ESTIMATORS[name] @@ -391,7 +389,7 @@ def test_oob_importances(name, dtype): feature_importances_type="permutation", bootstrap=True, ) - clf.fit(X, y) + clf.fit(X_large, y_large) importances = clf.feature_importances_ if name in FOREST_CLASSIFIERS: imp_level = 0.025 @@ -405,11 +403,9 @@ def 
test_oob_importances(name, dtype): @pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) -@pytest.mark.parametrize("dtype", (np.float64, np.float32)) -def test_oob_importances_multi_class(name, dtype): - - X = X_large_multiclass.astype(dtype=dtype, copy=False) - y = y_large_multiclass.astype(dtype=dtype, copy=False) +def test_oob_importances_multi_class(name): + # Check that oob permutation importances correctly identify that + # there are 3 important features in a multi-class setting ForestEstimator = FOREST_ESTIMATORS[name] @@ -419,13 +415,17 @@ def test_oob_importances_multi_class(name, dtype): feature_importances_type="permutation", bootstrap=True, ) - clf.fit(X, y) + clf.fit(X_large_multiclass, y_large_multiclass) importances = clf.feature_importances_ + if name in FOREST_CLASSIFIERS: + imp_level = 0.025 + else: + imp_level = 0.1 - n_important = np.sum(importances > 0.05) + n_important = np.sum(importances > imp_level) assert importances.shape[0] == 10 assert n_important == 3 - assert np.all(importances[:3] > 0.05) + assert np.all(importances[:3] > imp_level) @pytest.mark.parametrize("name", FOREST_ESTIMATORS) @@ -433,27 +433,16 @@ def test_oob_importances_raise_error(name): ForestEstimator = FOREST_ESTIMATORS[name] if name in FOREST_TRANSFORMERS: - assert_raises( - TypeError, ForestEstimator, feature_importances_type="permutation" - ) - - assert_raises( - NotImplementedError, - ForestEstimator()._set_oob_permutation_importance, - X, - y, - ) + err_msg = "unexpected keyword argument 'feature_importances_type'" + with pytest.raises(TypeError, match=err_msg): + ForestEstimator(feature_importances_type="permutation") else: # No bootstrap - assert_raises( - ValueError, - ForestEstimator( - feature_importances_type="permutation", bootstrap=False - ).fit, - X, - y, - ) + err_msg = "Out of bag estimation only available if bootstrap=True" + with pytest.raises(ValueError, match=err_msg): + ForestEstimator(feature_importances_type="permutation", + 
bootstrap=False).fit(X, y) def check_oob_score(name, X, y, n_estimators=20): From d842f149ee0a592d37964deb8c0c5e2b41409402 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 12 Jan 2021 17:40:47 -0500 Subject: [PATCH 10/53] formatting --- sklearn/ensemble/_forest.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b5c35fb535d2c..1853898d3b8ed 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -440,7 +440,7 @@ def _get_tree_oob_performance(self, estimator, X, y, n_samples, def _set_oob_permutation_importance(self, X, y, sample_weight): """Compute feature importances from the out-of-bag samples.""" - X = check_array(X, dtype=DTYPE, accept_sparse='csr') + X = check_array(X, dtype=DTYPE, accept_sparse=False) n_samples = y.shape[0] @@ -448,8 +448,11 @@ def _set_oob_permutation_importance(self, X, y, sample_weight): n_samples, self.max_samples ) - all_imp = np.array(Parallel(n_jobs=self.n_jobs)\ - (delayed(self._get_tree_oob_performance)( + if sample_weight is None: + sample_weight = np.ones(n_samples) + + all_imp = np.array(Parallel(n_jobs=self.n_jobs)( + delayed(self._get_tree_oob_performance)( estimator, X, y, n_samples, n_samples_bootstrap, sample_weight ) for estimator in self.estimators_)) @@ -1148,7 +1151,7 @@ class RandomForestClassifier(ForestClassifier): The importance corresponds with the average decrease in R2 across all trees when a feature is permuted or shuffled. - .. versionadded: 1.0 + .. 
versionadded: 1.0 Attributes ---------- From 99414a59165fc513cf3a24a9d1e4644296b29d88 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 12 Jan 2021 18:06:43 -0500 Subject: [PATCH 11/53] Add random feature test --- sklearn/ensemble/tests/test_forest.py | 48 +++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index f6b17fbdcd354..f8b62d54be474 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -428,6 +428,54 @@ def test_oob_importances_multi_class(name): assert np.all(importances[:3] > imp_level) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +def test_oob_importance_ignores_random(name): + # Testing that a random feature with high cardinality registers as + # important using impurity-based feature importance but not out-of-bag + # permutation importance + + X = X_large.copy() + y = y_large.copy() + + X[:, :-1] = (X[:, :-1] > 0).astype(int) + ForestEstimator = FOREST_ESTIMATORS[name] + + # Get oob importances + clf_oob = ForestEstimator( + n_estimators=10, + random_state=0, + feature_importances_type="permutation", + bootstrap=True, + ) + clf_oob.fit(X, y) + oob_importances = clf_oob.feature_importances_ + + # Get impurity-based importances + clf_impurity = ForestEstimator( + n_estimators=10, + random_state=0, + feature_importances_type="impurity", + bootstrap=True, + ) + clf_impurity.fit(X, y) + impurity_importances = clf_impurity.feature_importances_ + + # Test importance levels + imp_level = 0.1 + if name in FOREST_CLASSIFIERS: + oob_imp_level = 0.025 + else: + oob_imp_level = 0.1 + oob_important = np.sum(oob_importances > oob_imp_level) + impurity_important = np.sum(impurity_importances > imp_level) + + assert oob_important == 3 + assert np.all(oob_importances[:3] > oob_imp_level) + assert oob_importances[-1] < oob_imp_level + assert impurity_important == 4 + assert 
np.all(impurity_importances[:3] > imp_level) + assert impurity_importances[-1] > imp_level + @pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_oob_importances_raise_error(name): ForestEstimator = FOREST_ESTIMATORS[name] From 2d33c43523dbe9554adcbc2e54f4b3452f9d1478 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 12 Jan 2021 18:10:05 -0500 Subject: [PATCH 12/53] formatting --- sklearn/ensemble/tests/test_forest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index f8b62d54be474..d7a5ba0e08eaa 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -476,6 +476,7 @@ def test_oob_importance_ignores_random(name): assert np.all(impurity_importances[:3] > imp_level) assert impurity_importances[-1] > imp_level + @pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_oob_importances_raise_error(name): ForestEstimator = FOREST_ESTIMATORS[name] From fa4cb3bc6dede30a1fac3c41baf77c14d38f55ff Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 26 Jan 2021 18:21:55 -0500 Subject: [PATCH 13/53] remove inspection dependency --- sklearn/ensemble/_forest.py | 38 +++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 2551a35726240..0dd6ec5cb1c94 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -436,24 +436,37 @@ def _get_tree_oob_performance(self, estimator, X, y, n_samples, """Get out-of-bag performance for a single tree""" random_state = check_random_state(estimator.random_state) + scores = np.zeros(X.shape[1]) + unsampled_indices = _generate_unsampled_indices( estimator.random_state, n_samples, n_samples_bootstrap ) - from ..inspection._permutation_importance \ - import permutation_importance + shuffling_idx = np.arange(X.shape[0]) - result = permutation_importance( - estimator, 
X[unsampled_indices, :], y[unsampled_indices, :], - n_repeats=1, n_jobs=self.n_jobs, random_state=random_state, - sample_weight=sample_weight[unsampled_indices] - ) + baseline = estimator.score(X[unsampled_indices, :], + y[unsampled_indices]) + + for col_idx in range(X.shape[1]): + X_permuted = X.copy() + random_state.shuffle(shuffling_idx) + if hasattr(X_permuted, "iloc"): + col = X_permuted.iloc[shuffling_idx, col_idx] + col.index = X_permuted.index + X_permuted.iloc[:, col_idx] = col + else: + X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx] - return result.importances_mean + scores[col_idx] = estimator.score( + X_permuted[unsampled_indices, :], y[unsampled_indices], + sample_weight[unsampled_indices] + ) + + return baseline - scores def _set_oob_permutation_importance(self, X, y, sample_weight): """Compute feature importances from the out-of-bag samples.""" - X = check_array(X, dtype=DTYPE, accept_sparse=False) + X = check_array(X, dtype=DTYPE, accept_sparse='csr') n_samples = y.shape[0] @@ -475,13 +488,6 @@ def _set_oob_permutation_importance(self, X, y, sample_weight): importances=all_imp ) - # if (n_predictions == 0).any(): - # warn( - # "Some inputs do not have OOB scores. " - # "This probably means too few trees were used " - # "to compute any reliable oob estimates." - # ) - @abstractmethod def _set_oob_score_and_attributes(self, X, y): """Compute and set the OOB score and attributes. 
From f379d00c7db7c740ab92c1f11fbf7616e039bfb7 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Wed, 27 Jan 2021 22:23:16 -0500 Subject: [PATCH 14/53] integrate permutation imp with oob score --- sklearn/ensemble/_forest.py | 150 ++++++++++++-------------- sklearn/ensemble/tests/test_forest.py | 2 +- 2 files changed, 72 insertions(+), 80 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 0dd6ec5cb1c94..7bbfdcf320108 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -406,7 +406,7 @@ def fit(self, X, y, sample_weight=None): # Collect newly grown trees self.estimators_.extend(trees) - if self.oob_score: + if self.oob_score or (self.feature_importances_type == "permutation"): y_type = type_of_target(y) if y_type in ("multiclass-multioutput", "unknown"): # FIXME: we could consider to support multiclass-multioutput if @@ -419,10 +419,7 @@ def fit(self, X, y, sample_weight=None): f"supported: continuous, continuous-multioutput, binary, " f"multiclass, multilabel-indicator." 
) - self._set_oob_score_and_attributes(X, y) - - if self.feature_importances_type == "permutation": - self._set_oob_permutation_importance(X, y, sample_weight) + self._set_oob_score_and_attributes(X, y, sample_weight) # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: @@ -431,65 +428,8 @@ def fit(self, X, y, sample_weight=None): return self - def _get_tree_oob_performance(self, estimator, X, y, n_samples, - n_samples_bootstrap, sample_weight): - """Get out-of-bag performance for a single tree""" - random_state = check_random_state(estimator.random_state) - - scores = np.zeros(X.shape[1]) - - unsampled_indices = _generate_unsampled_indices( - estimator.random_state, n_samples, n_samples_bootstrap - ) - - shuffling_idx = np.arange(X.shape[0]) - - baseline = estimator.score(X[unsampled_indices, :], - y[unsampled_indices]) - - for col_idx in range(X.shape[1]): - X_permuted = X.copy() - random_state.shuffle(shuffling_idx) - if hasattr(X_permuted, "iloc"): - col = X_permuted.iloc[shuffling_idx, col_idx] - col.index = X_permuted.index - X_permuted.iloc[:, col_idx] = col - else: - X_permuted[:, col_idx] = X_permuted[shuffling_idx, col_idx] - - scores[col_idx] = estimator.score( - X_permuted[unsampled_indices, :], y[unsampled_indices], - sample_weight[unsampled_indices] - ) - - return baseline - scores - - def _set_oob_permutation_importance(self, X, y, sample_weight): - """Compute feature importances from the out-of-bag samples.""" - X = check_array(X, dtype=DTYPE, accept_sparse='csr') - - n_samples = y.shape[0] - - n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples, self.max_samples - ) - - if sample_weight is None: - sample_weight = np.ones(n_samples) - - all_imp = np.array(Parallel(n_jobs=self.n_jobs)( - delayed(self._get_tree_oob_performance)( - estimator, X, y, n_samples, n_samples_bootstrap, sample_weight - ) for estimator in self.estimators_)) - - self._oob_permutation_importance = Bunch( - 
importances_mean=np.mean(all_imp, axis=0), - importances_std=np.std(all_imp, axis=0), - importances=all_imp - ) - @abstractmethod - def _set_oob_score_and_attributes(self, X, y): + def _set_oob_score_and_attributes(self, X, y, sample_weight): """Compute and set the OOB score and attributes. Parameters @@ -498,9 +438,11 @@ def _set_oob_score_and_attributes(self, X, y): The data matrix. y : ndarray of shape (n_samples, n_outputs) The target matrix. + sample_weight : ndarray of shape (n_samples,) + Sample weights. """ - def _compute_oob_predictions(self, X, y): + def _compute_oob_predictions_and_importances(self, X, y, sample_weight): """Compute and set the OOB score. Parameters @@ -509,6 +451,8 @@ def _compute_oob_predictions(self, X, y): The data matrix. y : ndarray of shape (n_samples, n_outputs) The target matrix. + sample_weight : ndarray of shape (n_samples,) + Sample weights. Returns ------- @@ -517,8 +461,9 @@ def _compute_oob_predictions(self, X, y): The OOB predictions. """ X = check_array(X, dtype=DTYPE, accept_sparse='csr') + random_state = check_random_state(self.random_state) - n_samples = y.shape[0] + n_samples, n_features = X.shape n_outputs = self.n_outputs_ if is_classifier(self) and hasattr(self, "n_classes_"): # n_classes_ is a ndarray at this stage @@ -531,23 +476,43 @@ def _compute_oob_predictions(self, X, y): # the array operations compatible with the 2 settings oob_pred_shape = (n_samples, 1, n_outputs) + if self.feature_importances_type == "permutation": + oob_importances = np.zeros(shape=(n_features, self.n_estimators)) + else: + oob_importances = None + oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64) n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64) n_samples_bootstrap = _get_n_samples_bootstrap( n_samples, self.max_samples, ) - for estimator in self.estimators_: + for idx, estimator in enumerate(self.estimators_): unsampled_indices = _generate_unsampled_indices( estimator.random_state, n_samples, n_samples_bootstrap, 
) + X_oob, y_oob = X[unsampled_indices, :], y[unsampled_indices] - y_pred = self._get_oob_predictions( - estimator, X[unsampled_indices, :] - ) - oob_pred[unsampled_indices, ...] += y_pred + y_oob_pred = self._get_oob_predictions(estimator, X_oob) + oob_pred[unsampled_indices, ...] += y_oob_pred n_oob_pred[unsampled_indices, :] += 1 + if self.feature_importances_type == "permutation": + # avoid circular dependence + from ..inspection import permutation_importance + + result_importances = permutation_importance( + estimator, + X_oob, + y_oob, + scoring=None, + n_repeats=1, + n_jobs=self.n_jobs, + random_state=random_state, + sample_weight=sample_weight, + ) + oob_importances[:, idx] = result_importances.importances[:, 0] + for k in range(n_outputs): if (n_oob_pred == 0).any(): warn( @@ -558,7 +523,7 @@ def _compute_oob_predictions(self, X, y): n_oob_pred[n_oob_pred == 0] = 1 oob_pred[..., k] /= n_oob_pred[..., [k]] - return oob_pred + return oob_pred, oob_importances def _validate_y_class_weight(self, y): # Default implementation @@ -690,7 +655,7 @@ def _get_oob_predictions(tree, X): y_pred = np.rollaxis(y_pred, axis=0, start=3) return y_pred - def _set_oob_score_and_attributes(self, X, y): + def _set_oob_score_and_attributes(self, X, y, sample_weight): """Compute and set the OOB score and attributes. Parameters @@ -699,17 +664,31 @@ def _set_oob_score_and_attributes(self, X, y): The data matrix. y : ndarray of shape (n_samples, n_outputs) The target matrix. + sample_weight : ndarray of shape (n_samples,) + Sample weights. 
""" - self.oob_decision_function_ = super()._compute_oob_predictions(X, y) + self.oob_decision_function_, oob_importances_ = ( + super() + ._compute_oob_predictions_and_importances(X, y, sample_weight) + ) + if self.oob_decision_function_.shape[-1] == 1: # drop the n_outputs axis if there is a single output self.oob_decision_function_ = self.oob_decision_function_.squeeze( axis=-1 ) self.oob_score_ = accuracy_score( - y, np.argmax(self.oob_decision_function_, axis=1) + y, np.argmax(self.oob_decision_function_, axis=1), + sample_weight=sample_weight ) + if self.feature_importances_type == "permutation": + self._oob_permutation_importance = Bunch( + importances_mean=np.mean(oob_importances_, axis=1), + importances_std=np.std(oob_importances_, axis=1), + importances=oob_importances_ + ) + def _validate_y_class_weight(self, y): check_classification_targets(y) @@ -983,7 +962,7 @@ def _get_oob_predictions(tree, X): y_pred = y_pred[:, np.newaxis, :] return y_pred - def _set_oob_score_and_attributes(self, X, y): + def _set_oob_score_and_attributes(self, X, y, sample_weight): """Compute and set the OOB score and attributes. Parameters @@ -992,14 +971,27 @@ def _set_oob_score_and_attributes(self, X, y): The data matrix. y : ndarray of shape (n_samples, n_outputs) The target matrix. + sample_weight : ndarray of shape (n_samples,) + Sample weights. 
""" - self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze( - axis=1 + self.oob_prediction_, oob_importances_ = ( + super() + ._compute_oob_predictions_and_importances(X, y, sample_weight) ) + self.oob_prediction_ = self.oob_prediction_.squeeze(axis=1) if self.oob_prediction_.shape[-1] == 1: # drop the n_outputs axis if there is a single output self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1) - self.oob_score_ = r2_score(y, self.oob_prediction_) + self.oob_score_ = r2_score( + y, self.oob_prediction_, sample_weight=sample_weight + ) + + if self.feature_importances_type == "permutation": + self._oob_permutation_importance = Bunch( + importances_mean=np.mean(oob_importances_, axis=1), + importances_std=np.std(oob_importances_, axis=1), + importances=oob_importances_ + ) def _compute_partial_dependence_recursion(self, grid, target_features): """Fast partial dependence computation. @@ -2491,7 +2483,7 @@ def __init__(self, self.min_impurity_split = min_impurity_split self.sparse_output = sparse_output - def _set_oob_score_and_attributes(self, X, y): + def _set_oob_score_and_attributes(self, X, y, sample_weight): raise NotImplementedError("OOB score not supported by tree embedding") def fit(self, X, y=None, sample_weight=None): diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 1b90cf5ce4140..e28293307adf3 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -646,7 +646,7 @@ def test_random_trees_embedding_raise_error_oob(oob_score): with pytest.raises(TypeError, match="got an unexpected keyword argument"): RandomTreesEmbedding(oob_score=oob_score) with pytest.raises(NotImplementedError, match="OOB score not supported"): - RandomTreesEmbedding()._set_oob_score_and_attributes(X, y) + RandomTreesEmbedding()._set_oob_score_and_attributes(X, y, None) def check_gridsearch(name): From a29fa773d0a2ad8c037d35993eb332c4d966990e Mon Sep 17 00:00:00 2001 From: 
Guillaume Lemaitre Date: Thu, 28 Jan 2021 21:57:55 +0100 Subject: [PATCH 15/53] MNT refactoring based on further multiprocessing --- sklearn/ensemble/_forest.py | 520 +++++++++++++++----------- sklearn/ensemble/tests/test_forest.py | 14 +- 2 files changed, 300 insertions(+), 234 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 7bbfdcf320108..000e0ad127980 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -78,8 +78,7 @@ class calls the ``fit`` method of each sub-estimator on random samples def _get_n_samples_bootstrap(n_samples, max_samples): - """ - Get the number of samples in a bootstrap sample. + """Get the number of samples in a bootstrap sample. Parameters ---------- @@ -117,8 +116,7 @@ def _get_n_samples_bootstrap(n_samples, max_samples): def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): - """ - Private function used to _parallel_build_trees function.""" + """Private function used to _parallel_build_trees function.""" random_instance = check_random_state(random_state) sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap) @@ -127,8 +125,13 @@ def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap): def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap): + """Generate the indices of the OOB sample indices for an estimator. + + Instead of storing the OOB sample indices in the forest, it is more memory + efficient to rebuild the indices given the random state used to create the + bootstrap. This operation can be neglected in terms of computation time + compared to other processes when it is used (e.g. scoring). 
""" - Private function used to forest._set_oob_score function.""" sample_indices = _generate_sample_indices(random_state, n_samples, n_samples_bootstrap) sample_counts = np.bincount(sample_indices, minlength=n_samples) @@ -142,8 +145,7 @@ def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap): def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose=0, class_weight=None, n_samples_bootstrap=None): - """ - Private function used to fit a single tree in parallel.""" + """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) @@ -184,19 +186,22 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, - base_estimator, - n_estimators=100, *, - estimator_params=tuple(), - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - max_samples=None, - feature_importances_type='impurity'): + def __init__( + self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, @@ -210,7 +215,7 @@ def __init__(self, self.warm_start = warm_start self.class_weight = class_weight self.max_samples = max_samples - self.feature_importances_type = feature_importances_type + self.feature_importances = feature_importances def apply(self, X): """ @@ -352,12 +357,19 @@ def fit(self, X, y, sample_weight=None): # Check parameters self._validate_estimator() - if not self.bootstrap and self.oob_score: - raise ValueError("Out of bag estimation only available" - " if bootstrap=True") + if self.feature_importances not in ("impurity", "permutation_oob"): + raise ValueError( + 
f"feature_importances should be 'impurity' or " + f"'permutation_oob'. Got {self.feature_importances} instead." + ) - if not self.bootstrap and ( - self.feature_importances_type == "permutation" + if not self.bootstrap and self.oob_score: + raise ValueError( + "Out of bag estimation only available if bootstrap=True" + ) + if ( + not self.bootstrap + and self.feature_importances == "permutation_oob" ): raise ValueError( "Out of bag estimation only available if bootstrap=True" @@ -406,7 +418,7 @@ def fit(self, X, y, sample_weight=None): # Collect newly grown trees self.estimators_.extend(trees) - if self.oob_score or (self.feature_importances_type == "permutation"): + if self.oob_score or (self.feature_importances == "permutation_oob"): y_type = type_of_target(y) if y_type in ("multiclass-multioutput", "unknown"): # FIXME: we could consider to support multiclass-multioutput if @@ -419,7 +431,10 @@ def fit(self, X, y, sample_weight=None): f"supported: continuous, continuous-multioutput, binary, " f"multiclass, multilabel-indicator." ) - self._set_oob_score_and_attributes(X, y, sample_weight) + if self.oob_score: + self._set_oob_score_and_attributes(X, y, sample_weight) + if self.feature_importances == "permutation_oob": + self._set_oob_importances(X, y, sample_weight) # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: @@ -436,21 +451,80 @@ def _set_oob_score_and_attributes(self, X, y, sample_weight): ---------- X : array-like of shape (n_samples, n_features) The data matrix. + y : ndarray of shape (n_samples, n_outputs) The target matrix. + sample_weight : ndarray of shape (n_samples,) Sample weights. """ - def _compute_oob_predictions_and_importances(self, X, y, sample_weight): - """Compute and set the OOB score. + def _set_oob_importances(self, X, y, sample_weight): + """Compute and set importances by permuting features using OOB samples. Parameters ---------- X : array-like of shape (n_samples, n_features) The data matrix. 
+ y : ndarray of shape (n_samples, n_outputs) The target matrix. + + sample_weight : ndarray of shape (n_samples,) + Sample weights. + + Returns + ------- + oob_importance : ndarray of shape (n_features, n_estimators) + Feature importances using OOB samples. + """ + # avoid circular dependence + from ..inspection import permutation_importance + + X = check_array(X, dtype=DTYPE, accept_sparse="csr") + random_state = check_random_state(self.random_state) + + n_samples, n_features = X.shape + oob_importances = np.zeros(shape=(n_features, self.n_estimators)) + + n_samples_bootstrap = _get_n_samples_bootstrap( + n_samples, + self.max_samples, + ) + for idx, estimator in enumerate(self.estimators_): + unsampled_indices = _generate_unsampled_indices( + estimator.random_state, + n_samples, + n_samples_bootstrap, + ) + X_oob, y_oob = X[unsampled_indices, :], y[unsampled_indices] + + result_importances = permutation_importance( + estimator, + X_oob, + y_oob, + scoring=None, + n_repeats=1, + n_jobs=1, + random_state=random_state, + sample_weight=sample_weight, + ) + oob_importances[:, idx] = result_importances.importances[:, 0] + + self._oob_permutation_importance = Bunch( + importances_mean=np.mean(oob_importances, axis=1), + importances_std=np.std(oob_importances, axis=1), + importances=oob_importances, + ) + + def _compute_oob_predictions(self, X): + """Compute and accumulate predictions of OOB samples. + + Parameters + ---------- + X : array-like of shape (n_samples, n_features) + The data matrix. + sample_weight : ndarray of shape (n_samples,) Sample weights. @@ -461,9 +535,8 @@ def _compute_oob_predictions_and_importances(self, X, y, sample_weight): The OOB predictions. 
""" X = check_array(X, dtype=DTYPE, accept_sparse='csr') - random_state = check_random_state(self.random_state) - n_samples, n_features = X.shape + n_samples = X.shape[0] n_outputs = self.n_outputs_ if is_classifier(self) and hasattr(self, "n_classes_"): # n_classes_ is a ndarray at this stage @@ -476,11 +549,6 @@ def _compute_oob_predictions_and_importances(self, X, y, sample_weight): # the array operations compatible with the 2 settings oob_pred_shape = (n_samples, 1, n_outputs) - if self.feature_importances_type == "permutation": - oob_importances = np.zeros(shape=(n_features, self.n_estimators)) - else: - oob_importances = None - oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64) n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64) @@ -491,28 +559,12 @@ def _compute_oob_predictions_and_importances(self, X, y, sample_weight): unsampled_indices = _generate_unsampled_indices( estimator.random_state, n_samples, n_samples_bootstrap, ) - X_oob, y_oob = X[unsampled_indices, :], y[unsampled_indices] + X_oob = X[unsampled_indices, :] y_oob_pred = self._get_oob_predictions(estimator, X_oob) oob_pred[unsampled_indices, ...] 
+= y_oob_pred n_oob_pred[unsampled_indices, :] += 1 - if self.feature_importances_type == "permutation": - # avoid circular dependence - from ..inspection import permutation_importance - - result_importances = permutation_importance( - estimator, - X_oob, - y_oob, - scoring=None, - n_repeats=1, - n_jobs=self.n_jobs, - random_state=random_state, - sample_weight=sample_weight, - ) - oob_importances[:, idx] = result_importances.importances[:, 0] - for k in range(n_outputs): if (n_oob_pred == 0).any(): warn( @@ -523,7 +575,7 @@ def _compute_oob_predictions_and_importances(self, X, y, sample_weight): n_oob_pred[n_oob_pred == 0] = 1 oob_pred[..., k] /= n_oob_pred[..., [k]] - return oob_pred, oob_importances + return oob_pred def _validate_y_class_weight(self, y): # Default implementation @@ -559,20 +611,29 @@ def feature_importances_(self): """ check_is_fitted(self) - if self.feature_importances_type == 'permutation': - return self._oob_permutation_importance.importances_mean - - all_importances = Parallel(n_jobs=self.n_jobs, - **_joblib_parallel_args(prefer='threads'))( - delayed(getattr)(tree, 'feature_importances_') - for tree in self.estimators_ if tree.tree_.node_count > 1) - - if not all_importances: - return np.zeros(self.n_features_, dtype=np.float64) + if self.feature_importances == "permutation_oob": + feature_importances_ = \ + self._oob_permutation_importance.importances_mean + else: # impurity-based feature importance + parallel_args = { + **_joblib_parallel_args(prefer="threads"), + "n_jobs": self.n_jobs + } + all_importances = Parallel(**parallel_args)( + delayed(getattr)(tree, 'feature_importances_') + for tree in self.estimators_ if tree.tree_.node_count > 1 + ) - all_importances = np.mean(all_importances, - axis=0, dtype=np.float64) - return all_importances / np.sum(all_importances) + if not all_importances: + feature_importances_ = np.zeros( + self.n_features_, dtype=np.float64 + ) + else: + feature_importances_ = np.mean( + all_importances, axis=0, 
dtype=np.float64 + ) + feature_importances_ /= np.sum(feature_importances_) + return feature_importances_ def _accumulate_prediction(predict, X, out, lock): @@ -600,19 +661,22 @@ class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, - base_estimator, - n_estimators=100, *, - estimator_params=tuple(), - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - max_samples=None, - feature_importances_type='impurity'): + def __init__( + self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator, n_estimators=n_estimators, @@ -625,7 +689,8 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - feature_importances_type=feature_importances_type) + feature_importances=feature_importances, + ) @staticmethod def _get_oob_predictions(tree, X): @@ -662,33 +727,24 @@ def _set_oob_score_and_attributes(self, X, y, sample_weight): ---------- X : array-like of shape (n_samples, n_features) The data matrix. + y : ndarray of shape (n_samples, n_outputs) The target matrix. + sample_weight : ndarray of shape (n_samples,) Sample weights. 
""" - self.oob_decision_function_, oob_importances_ = ( - super() - ._compute_oob_predictions_and_importances(X, y, sample_weight) - ) - - if self.oob_decision_function_.shape[-1] == 1: + oob_predictions = super()._compute_oob_predictions(X) + if oob_predictions.shape[-1] == 1: # drop the n_outputs axis if there is a single output - self.oob_decision_function_ = self.oob_decision_function_.squeeze( - axis=-1 - ) + oob_predictions = oob_predictions.squeeze(axis=-1) + + self.oob_decision_function_ = oob_predictions self.oob_score_ = accuracy_score( y, np.argmax(self.oob_decision_function_, axis=1), sample_weight=sample_weight ) - if self.feature_importances_type == "permutation": - self._oob_permutation_importance = Bunch( - importances_mean=np.mean(oob_importances_, axis=1), - importances_std=np.std(oob_importances_, axis=1), - importances=oob_importances_ - ) - def _validate_y_class_weight(self, y): check_classification_targets(y) @@ -869,18 +925,21 @@ class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, - base_estimator, - n_estimators=100, *, - estimator_params=tuple(), - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - max_samples=None, - feature_importances_type='impurity'): + def __init__( + self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator, n_estimators=n_estimators, @@ -892,7 +951,8 @@ def __init__(self, verbose=verbose, warm_start=warm_start, max_samples=max_samples, - feature_importances_type=feature_importances_type) + feature_importances=feature_importances, + ) def predict(self, X): """ @@ -974,25 +1034,15 @@ def _set_oob_score_and_attributes(self, X, y, sample_weight): sample_weight : ndarray of shape (n_samples,) 
Sample weights. """ - self.oob_prediction_, oob_importances_ = ( - super() - ._compute_oob_predictions_and_importances(X, y, sample_weight) - ) - self.oob_prediction_ = self.oob_prediction_.squeeze(axis=1) - if self.oob_prediction_.shape[-1] == 1: + oob_predictions = super()._compute_oob_predictions(X).squeeze(axis=1) + if oob_predictions.shape[-1] == 1: # drop the n_outputs axis if there is a single output - self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1) + oob_predictions = oob_predictions.squeeze(axis=-1) + self.oob_prediction_ = oob_predictions self.oob_score_ = r2_score( y, self.oob_prediction_, sample_weight=sample_weight ) - if self.feature_importances_type == "permutation": - self._oob_permutation_importance = Bunch( - importances_mean=np.mean(oob_importances_, axis=1), - importances_std=np.std(oob_importances_, axis=1), - importances=oob_importances_ - ) - def _compute_partial_dependence_recursion(self, grid, target_features): """Fast partial dependence computation. @@ -1212,14 +1262,14 @@ class RandomForestClassifier(ForestClassifier): .. versionadded:: 0.22 - feature_importances_type : {"impurity", "permutation"}, default="impurity" + feature_importances : {"impurity", "permutation_oob"}, default="impurity" The type of feature importance to compute: - If "impurity", then gets impurity-based feature importance. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. - - If "permutation", then gets out-of-bag permutation importance. + - If "permutation_oob", then gets out-of-bag permutation importance. The importance corresponds with the average decrease in R2 across all trees when a feature is permuted or shuffled. @@ -1251,11 +1301,11 @@ class labels (multi-output problem). feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. The higher, the more important the feature. 
- See ``feature_importances_type`` for how these are calculated. + See ``feature_importances`` for how these are calculated. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). - Consider setting ``feature_importances_type="permutation"`` or using + Consider setting ``feature_importances="permutation_oob"`` or using :func:`sklearn.inspection.permutation_importance` as an alternative. oob_score_ : float @@ -1307,27 +1357,30 @@ class labels (multi-output problem). [1] """ @_deprecate_positional_args - def __init__(self, - n_estimators=100, *, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - feature_importances_type='impurity'): + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, @@ -1344,7 +1397,8 @@ def __init__(self, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - feature_importances_type=feature_importances_type) + feature_importances=feature_importances, + ) self.criterion = criterion self.max_depth = max_depth @@ -1523,14 +1577,14 @@ class RandomForestRegressor(ForestRegressor): .. 
versionadded:: 0.22 - feature_importances_type : {"impurity", "permutation"}, default="impurity" + feature_importances : {"impurity", "permutation_oob"}, default="impurity" The type of feature importance to calculate: - If "impurity", then gets impurity-based feature importance. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. - - If "permutation", then gets out-of-bag permutation importance. + - If "permutation_oob", then gets out-of-bag permutation importance. The importance corresponds with the average decrease in r2_score across all tree when a feature is permuted or shuffled. @@ -1546,11 +1600,11 @@ class RandomForestRegressor(ForestRegressor): feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. The higher, the more important the feature. - See ``feature_importances_type`` for how these are calculated. + See ``feature_importances`` for how these are calculated. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). - Consider setting ``feature_importances_type="permutation"`` or using + Consider setting ``feature_importances="permutation_oob"`` or using :func:`sklearn.inspection.permutation_importance` as an alternative. 
n_features_ : int @@ -1610,26 +1664,29 @@ class RandomForestRegressor(ForestRegressor): [-8.32987858] """ @_deprecate_positional_args - def __init__(self, - n_estimators=100, *, - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - feature_importances_type='impurity'): + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, @@ -1645,7 +1702,8 @@ def __init__(self, verbose=verbose, warm_start=warm_start, max_samples=max_samples, - feature_importances_type=feature_importances_type) + feature_importances=feature_importances, + ) self.criterion = criterion self.max_depth = max_depth @@ -1847,14 +1905,14 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 - feature_importances_type : {"impurity", "permutation"}, default="impurity" + feature_importances : {"impurity", "permutation_oob"}, default="impurity" The type of feature importance to calculate: - If "impurity", then gets impurity-based feature importance. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. - - If "permutation", then gets out-of-bag permutation importance. 
+ - If "permutation_oob", then gets out-of-bag permutation importance. The importance corresponds with the average decrease in r2_score across all tree when a feature is permuted or shuffled. @@ -1878,11 +1936,11 @@ class labels (multi-output problem). feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. The higher, the more important the feature. - See ``feature_importances_type`` for how these are calculated. + See ``feature_importances`` for how these are calculated. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). - Consider setting ``feature_importances_type="permutation"`` or using + Consider setting ``feature_importances="permutation_oob"`` or using :func:`sklearn.inspection.permutation_importance` as an alternative. n_features_ : int @@ -1934,27 +1992,30 @@ class labels (multi-output problem). array([1]) """ @_deprecate_positional_args - def __init__(self, - n_estimators=100, *, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - feature_importances_type='impurity'): + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, @@ -1971,7 +2032,8 
@@ def __init__(self, warm_start=warm_start, class_weight=class_weight, max_samples=max_samples, - feature_importances_type=feature_importances_type) + feature_importances=feature_importances, + ) self.criterion = criterion self.max_depth = max_depth @@ -2152,14 +2214,14 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 - feature_importances_type : {"impurity", "permutation"}, default="impurity" + feature_importances : {"impurity", "permutation_oob"}, default="impurity" The type of feature importance to calculate: - If "impurity", then gets impurity-based feature importance. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance. - - If "permutation", then gets out-of-bag permutation importance. + - If "permutation_oob", then gets out-of-bag permutation importance. The importance corresponds with the average decrease in performance across all tree when a feature is permuted or shuffled. @@ -2175,11 +2237,11 @@ class ExtraTreesRegressor(ForestRegressor): feature_importances_ : ndarray of shape (n_features,) The impurity-based feature importances. The higher, the more important the feature. - See ``feature_importances_type`` for how these are calculated. + See ``feature_importances`` for how these are calculated. Warning: impurity-based feature importances can be misleading for high cardinality features (many unique values). - Consider setting ``feature_importances_type="permutation"`` or using + Consider setting ``feature_importances="permutation_oob"`` or using :func:`sklearn.inspection.permutation_importance` as an alternative. n_features_ : int @@ -2228,26 +2290,29 @@ class ExtraTreesRegressor(ForestRegressor): 0.2708... 
""" @_deprecate_positional_args - def __init__(self, - n_estimators=100, *, - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - feature_importances_type='impurity'): + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, @@ -2263,7 +2328,8 @@ def __init__(self, verbose=verbose, warm_start=warm_start, max_samples=max_samples, - feature_importances_type=feature_importances_type) + feature_importances=feature_importances, + ) self.criterion = criterion self.max_depth = max_depth diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index e28293307adf3..09e8339b1fce6 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -388,7 +388,7 @@ def test_oob_importances(name): clf = ForestEstimator( n_estimators=10, random_state=0, - feature_importances_type="permutation", + feature_importances="permutation_oob", bootstrap=True, ) clf.fit(X_large, y_large) @@ -414,7 +414,7 @@ def test_oob_importances_multi_class(name): clf = ForestEstimator( n_estimators=10, random_state=0, - feature_importances_type="permutation", + feature_importances="permutation_oob", bootstrap=True, ) clf.fit(X_large_multiclass, y_large_multiclass) @@ -446,7 
+446,7 @@ def test_oob_importance_ignores_random(name): clf_oob = ForestEstimator( n_estimators=10, random_state=0, - feature_importances_type="permutation", + feature_importances="permutation_oob", bootstrap=True, ) clf_oob.fit(X, y) @@ -456,7 +456,7 @@ def test_oob_importance_ignores_random(name): clf_impurity = ForestEstimator( n_estimators=10, random_state=0, - feature_importances_type="impurity", + feature_importances="impurity", bootstrap=True, ) clf_impurity.fit(X, y) @@ -484,15 +484,15 @@ def test_oob_importances_raise_error(name): ForestEstimator = FOREST_ESTIMATORS[name] if name in FOREST_TRANSFORMERS: - err_msg = "unexpected keyword argument 'feature_importances_type'" + err_msg = "unexpected keyword argument 'feature_importances'" with pytest.raises(TypeError, match=err_msg): - ForestEstimator(feature_importances_type="permutation") + ForestEstimator(feature_importances="permutation_oob") else: # No bootstrap err_msg = "Out of bag estimation only available if bootstrap=True" with pytest.raises(ValueError, match=err_msg): - ForestEstimator(feature_importances_type="permutation", + ForestEstimator(feature_importances="permutation_oob", bootstrap=False).fit(X, y) From 3143f732e960f111014e8b50b534c6ceeaf2c5cd Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 28 Jan 2021 22:21:52 +0100 Subject: [PATCH 16/53] doc --- examples/inspection/plot_permutation_importance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 56303b2593344..ccad921746da9 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -175,13 +175,13 @@ # %% # Finally, the permutation importance can also be calculated using the -# out-of-bag data by setting ``feature_importances_type="permutation"`` and +# out-of-bag data by setting ``feature_importances="permutation_oob"`` and # 
re-running the pipeline. This confirms that ``sex``` is most important # and that the random features have low importances. rf = Pipeline(steps=[ ("preprocess", preprocessing), ("classifier", RandomForestClassifier( - random_state=42, feature_importances_type="permutation")) + random_state=42, feature_importances="permutation_oob")) ]) rf.fit(X_train, y_train) From e274b390529fff1a0354adf0cdb3cf2875b5447c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 28 Jan 2021 23:01:09 +0100 Subject: [PATCH 17/53] ENH parallelize --- sklearn/ensemble/_forest.py | 77 ++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 27 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 000e0ad127980..820a25732e244 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -50,6 +50,7 @@ class calls the ``fit`` method of each sub-estimator on random samples from scipy.sparse import hstack as sparse_hstack from joblib import Parallel +from .. 
import config_context from ..base import is_classifier from ..base import ClassifierMixin, RegressorMixin, MultiOutputMixin from ..metrics import accuracy_score, r2_score @@ -177,6 +178,36 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, return tree +def _permutation_importances_oob( + estimator, + X, + y, + sample_weight, + n_samples, + n_samples_bootstrap, + random_state, +): + """Compute the feature permutation importance given a tree.""" + # avoid circular dependence + from ..inspection import permutation_importance + + unsampled_indices = _generate_unsampled_indices( + estimator.random_state, + n_samples, + n_samples_bootstrap, + ) + return permutation_importance( + estimator, + X[unsampled_indices, :], + y[unsampled_indices], + scoring=None, + n_repeats=1, + n_jobs=1, + random_state=random_state, + sample_weight=sample_weight, + ).importances[:, 0] + + class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): """ Base class for forests of trees. @@ -478,42 +509,34 @@ def _set_oob_importances(self, X, y, sample_weight): oob_importance : ndarray of shape (n_features, n_estimators) Feature importances using OOB samples. 
""" - # avoid circular dependence - from ..inspection import permutation_importance - X = check_array(X, dtype=DTYPE, accept_sparse="csr") random_state = check_random_state(self.random_state) - n_samples, n_features = X.shape - oob_importances = np.zeros(shape=(n_features, self.n_estimators)) - + n_samples = X.shape[0] n_samples_bootstrap = _get_n_samples_bootstrap( n_samples, self.max_samples, ) - for idx, estimator in enumerate(self.estimators_): - unsampled_indices = _generate_unsampled_indices( - estimator.random_state, - n_samples, - n_samples_bootstrap, - ) - X_oob, y_oob = X[unsampled_indices, :], y[unsampled_indices] - - result_importances = permutation_importance( - estimator, - X_oob, - y_oob, - scoring=None, - n_repeats=1, - n_jobs=1, - random_state=random_state, - sample_weight=sample_weight, - ) - oob_importances[:, idx] = result_importances.importances[:, 0] + + with config_context(assume_finite=True): + # avoid redundant checking performed on X in the permutation + # importance function. 
+ oob_importances = np.transpose(Parallel(n_jobs=self.n_jobs)( + delayed(_permutation_importances_oob)( + estimator, + X, + y, + sample_weight, + n_samples, + n_samples_bootstrap, + random_state, + ) + for estimator in self.estimators_ + )) self._oob_permutation_importance = Bunch( - importances_mean=np.mean(oob_importances, axis=1), - importances_std=np.std(oob_importances, axis=1), + importances_mean=oob_importances.mean(axis=1), + importances_std=oob_importances.std(axis=1), importances=oob_importances, ) From 45ac1b51453adb8c8b7fd3f2405d211b82cfe494 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 28 Jan 2021 23:02:41 +0100 Subject: [PATCH 18/53] doc --- sklearn/ensemble/_forest.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 820a25732e244..f4b158c4d68f5 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -503,11 +503,6 @@ def _set_oob_importances(self, X, y, sample_weight): sample_weight : ndarray of shape (n_samples,) Sample weights. - - Returns - ------- - oob_importance : ndarray of shape (n_features, n_estimators) - Feature importances using OOB samples. """ X = check_array(X, dtype=DTYPE, accept_sparse="csr") random_state = check_random_state(self.random_state) @@ -556,7 +551,7 @@ def _compute_oob_predictions(self, X): oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \ (n_samples, 1, n_outputs) The OOB predictions. 
- """ + """ X = check_array(X, dtype=DTYPE, accept_sparse='csr') n_samples = X.shape[0] From a3108628a3a1c20b5936f837ab13e6df9ab883b7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 28 Jan 2021 23:19:21 +0100 Subject: [PATCH 19/53] less diff --- sklearn/ensemble/_forest.py | 268 +++++++++++++++++------------------- 1 file changed, 127 insertions(+), 141 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f4b158c4d68f5..9bcd57b171ac2 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -217,22 +217,20 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): """ @abstractmethod - def __init__( - self, - base_estimator, - n_estimators=100, - *, - estimator_params=tuple(), - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - max_samples=None, - feature_importances="impurity", - ): + def __init__(self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + feature_importances="impurity"): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, @@ -679,22 +677,20 @@ class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ @abstractmethod - def __init__( - self, - base_estimator, - n_estimators=100, - *, - estimator_params=tuple(), - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - max_samples=None, - feature_importances="impurity", - ): + def __init__(self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + feature_importances="impurity"): super().__init__( 
base_estimator, n_estimators=n_estimators, @@ -943,21 +939,19 @@ class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): """ @abstractmethod - def __init__( - self, - base_estimator, - n_estimators=100, - *, - estimator_params=tuple(), - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - max_samples=None, - feature_importances="impurity", - ): + def __init__(self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + max_samples=None, + feature_importances="impurity"): super().__init__( base_estimator, n_estimators=n_estimators, @@ -1375,30 +1369,28 @@ class labels (multi-output problem). [1] """ @_deprecate_positional_args - def __init__( - self, - n_estimators=100, - *, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0.0, - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0.0, - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - feature_importances="impurity", - ): + def __init__(self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity"): super().__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, @@ -1682,29 +1674,27 @@ class RandomForestRegressor(ForestRegressor): [-8.32987858] """ @_deprecate_positional_args - def __init__( - self, - 
n_estimators=100, - *, - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0.0, - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0.0, - min_impurity_split=None, - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - feature_importances="impurity", - ): + def __init__(self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity"): super().__init__( base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, @@ -2010,30 +2000,28 @@ class labels (multi-output problem). 
array([1]) """ @_deprecate_positional_args - def __init__( - self, - n_estimators=100, - *, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0.0, - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0.0, - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - feature_importances="impurity", - ): + def __init__(self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity"): super().__init__( base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, @@ -2308,29 +2296,27 @@ class ExtraTreesRegressor(ForestRegressor): 0.2708... 
""" @_deprecate_positional_args - def __init__( - self, - n_estimators=100, - *, - criterion="mse", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0.0, - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0.0, - min_impurity_split=None, - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - feature_importances="impurity", - ): + def __init__(self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity"): super().__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, From dec94569102feab98daaebe42df4e533245b42da Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Thu, 28 Jan 2021 23:43:48 +0100 Subject: [PATCH 20/53] improve doc --- sklearn/ensemble/_forest.py | 156 ++++++++++++++++++++++++------------ 1 file changed, 103 insertions(+), 53 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 9bcd57b171ac2..bde5981f42092 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -1277,15 +1277,17 @@ class RandomForestClassifier(ForestClassifier): feature_importances : {"impurity", "permutation_oob"}, default="impurity" The type of feature importance to compute: - - If "impurity", then gets impurity-based feature importance. + - if `"impurity"`, the mean decrease impurity (MDI) is computed. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also - known as the Gini importance. 
- - If "permutation_oob", then gets out-of-bag permutation importance. - The importance corresponds with the average decrease in R2 - across all trees when a feature is permuted or shuffled. + known as the Gini importance; + - if `"permutation_oob"`, the permutation feature importance is + computed using the out-of-bag samples. The importance is computed as + the average decrease of the accuracy score across all trees when a + feature is shuffled. - .. versionadded: 1.0 + .. versionadded:: 1.0 + The `"permutation_oob"` strategy was added in 1.0. Attributes ---------- @@ -1311,14 +1313,23 @@ class labels (multi-output problem). The number of outputs when ``fit`` is performed. feature_importances_ : ndarray of shape (n_features,) - The impurity-based feature importances. - The higher, the more important the feature. - See ``feature_importances`` for how these are calculated. + The feature importances computed as specified by the strategy defined + by `feature_importances` parameter. The higher, the more important the + feature. - Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). - Consider setting ``feature_importances="permutation_oob"`` or using - :func:`sklearn.inspection.permutation_importance` as an alternative. + .. warning:: + Impurity-based feature importances can be misleading for two + reasons: + + - it is biased towards high cardinality features (many unique + values); + - it is computed from the training set and could be misleading + when the estimator overfits. + + For these reasons, consider setting + `feature_importances="permutation_oob"` to use feature permutation + importances computed on the out-of-bag samples or use the function + :func:`sklearn.inspection.permutation_importance` as an alternative. oob_score_ : float Score of the training dataset obtained using an out-of-bag estimate. @@ -1588,15 +1599,19 @@ class RandomForestRegressor(ForestRegressor): .. 
versionadded:: 0.22 feature_importances : {"impurity", "permutation_oob"}, default="impurity" - The type of feature importance to calculate: + The type of feature importance to compute: - - If "impurity", then gets impurity-based feature importance. + - if `"impurity"`, the mean decrease impurity (MDI) is computed. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also - known as the Gini importance. - - If "permutation_oob", then gets out-of-bag permutation importance. - The importance corresponds with the average decrease in r2_score - across all tree when a feature is permuted or shuffled. + known as the Gini importance; + - if `"permutation_oob"`, the permutation feature importance is + computed using the out-of-bag samples. The importance is computed as + the average decrease of the :math:`R^2` score across all trees when a + feature is shuffled. + + .. versionadded:: 1.0 + The `"permutation_oob"` strategy was added in 1.0. Attributes ---------- @@ -1608,14 +1623,23 @@ class RandomForestRegressor(ForestRegressor): The collection of fitted sub-estimators. feature_importances_ : ndarray of shape (n_features,) - The impurity-based feature importances. - The higher, the more important the feature. - See ``feature_importances`` for how these are calculated. + The feature importances computed as specified by the strategy defined + by `feature_importances` parameter. The higher, the more important the + feature. - Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). - Consider setting ``feature_importances="permutation_oob"`` or using - :func:`sklearn.inspection.permutation_importance` as an alternative. + .. 
warning:: + Impurity-based feature importances can be misleading for two + reasons: + + - it is biased towards high cardinality features (many unique + values); + - it is computed from the training set and could be misleading + when the estimator overfits. + + For these reasons, consider setting + `feature_importances="permutation_oob"` to use feature permutation + importances computed on the out-of-bag samples or use the function + :func:`sklearn.inspection.permutation_importance` as an alternative. n_features_ : int The number of features when ``fit`` is performed. @@ -1914,15 +1938,19 @@ class ExtraTreesClassifier(ForestClassifier): .. versionadded:: 0.22 feature_importances : {"impurity", "permutation_oob"}, default="impurity" - The type of feature importance to calculate: + The type of feature importance to compute: - - If "impurity", then gets impurity-based feature importance. + - if `"impurity"`, the mean decrease impurity (MDI) is computed. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also - known as the Gini importance. - - If "permutation_oob", then gets out-of-bag permutation importance. - The importance corresponds with the average decrease in r2_score - across all tree when a feature is permuted or shuffled. + known as the Gini importance; + - if `"permutation_oob"`, the permutation feature importance is + computed using the out-of-bag samples. The importance is computed as + the average decrease of the accuracy score across all trees when a + feature is shuffled. + + .. versionadded:: 1.0 + The `"permutation_oob"` strategy was added in 1.0. Attributes ---------- @@ -1942,14 +1970,23 @@ class labels (multi-output problem). number of classes for each output (multi-output problem). feature_importances_ : ndarray of shape (n_features,) - The impurity-based feature importances. - The higher, the more important the feature. 
- See ``feature_importances`` for how these are calculated. + The feature importances computed as specified by the strategy defined + by `feature_importances` parameter. The higher, the more important the + feature. - Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). - Consider setting ``feature_importances="permutation_oob"`` or using - :func:`sklearn.inspection.permutation_importance` as an alternative. + .. warning:: + Impurity-based feature importances can be misleading for two + reasons: + + - it is biased towards high cardinality features (many unique + values); + - it is computed from the training set and could be misleading + when the estimator overfits. + + For these reasons, consider setting + `feature_importances="permutation_oob"` to use feature permutation + importances computed on the out-of-bag samples or use the function + :func:`sklearn.inspection.permutation_importance` as an alternative. n_features_ : int The number of features when ``fit`` is performed. @@ -2221,15 +2258,19 @@ class ExtraTreesRegressor(ForestRegressor): .. versionadded:: 0.22 feature_importances : {"impurity", "permutation_oob"}, default="impurity" - The type of feature importance to calculate: + The type of feature importance to compute: - - If "impurity", then gets impurity-based feature importance. + - if `"impurity"`, the mean decrease impurity (MDI) is computed. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also - known as the Gini importance. - - If "permutation_oob", then gets out-of-bag permutation importance. - The importance corresponds with the average decrease in performance - across all tree when a feature is permuted or shuffled. + known as the Gini importance; + - if `"permutation_oob"`, the permutation feature importance is + computed using the out-of-bag samples. 
The importance is computed as + the average decrease of the :math:`R^2` score across all trees when a + feature is shuffled. + + .. versionadded:: 1.0 + The `"permutation_oob"` strategy was added in 1.0. Attributes ---------- @@ -2241,14 +2282,23 @@ class ExtraTreesRegressor(ForestRegressor): The collection of fitted sub-estimators. feature_importances_ : ndarray of shape (n_features,) - The impurity-based feature importances. - The higher, the more important the feature. - See ``feature_importances`` for how these are calculated. - - Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). - Consider setting ``feature_importances="permutation_oob"`` or using - :func:`sklearn.inspection.permutation_importance` as an alternative. + The feature importances computed as specified by the strategy defined + by `feature_importances` parameter. The higher, the more important the + feature. + + .. warning:: + Impurity-based feature importances can be misleading for two + reasons: + + - it is biased towards high cardinality features (many unique + values); + - it is computed from the training set and could be misleading + when the estimator overfits. + + For these reasons, consider setting + `feature_importances="permutation_oob"` to use feature permutation + importances computed on the out-of-bag samples or use the function + :func:`sklearn.inspection.permutation_importance` as an alternative. n_features_ : int The number of features. 
From 92069089e61abc2d69693e53db5f96b63cd897a7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 00:10:26 +0100 Subject: [PATCH 21/53] TST check for features importances raised error --- sklearn/ensemble/_forest.py | 3 ++- sklearn/ensemble/tests/test_forest.py | 39 +++++++++++++++++---------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index bde5981f42092..b018a3dee66fd 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -401,7 +401,8 @@ def fit(self, X, y, sample_weight=None): and self.feature_importances == "permutation_oob" ): raise ValueError( - "Out of bag estimation only available if bootstrap=True" + "Estimating feature importance on out of bag samples only " + "available if bootstrap=True" ) random_state = check_random_state(self.random_state) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 09e8339b1fce6..42774ba0fd102 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -479,22 +479,33 @@ def test_oob_importance_ignores_random(name): assert impurity_importances[-1] > imp_level -@pytest.mark.parametrize("name", FOREST_ESTIMATORS) -def test_oob_importances_raise_error(name): - ForestEstimator = FOREST_ESTIMATORS[name] - - if name in FOREST_TRANSFORMERS: - err_msg = "unexpected keyword argument 'feature_importances'" - with pytest.raises(TypeError, match=err_msg): - ForestEstimator(feature_importances="permutation_oob") +@pytest.mark.parametrize( + "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() +) +@pytest.mark.parametrize( + "params, err_msg", + [ + ({"feature_importances": "xxx"}, + "feature_importances should be 'impurity' or 'permutation_oob'"), + ({"feature_importances": "permutation_oob", "bootstrap": False}, + "Estimating feature importance on out of bag samples only") + ] +) +def 
test_forest_oob_importances_error(ForestEstimator, params, err_msg): + # check that proper error messages are raised for feature_importances + # validation + estimator = ForestEstimator(**params) + with pytest.raises(ValueError, match=err_msg): + estimator.fit(X, y) - else: - # No bootstrap - err_msg = "Out of bag estimation only available if bootstrap=True" - with pytest.raises(ValueError, match=err_msg): - ForestEstimator(feature_importances="permutation_oob", - bootstrap=False).fit(X, y) +@pytest.mark.parametrize("ForestTransformer", FOREST_TRANSFORMERS.values()) +def test_forest_transformer_no_oob_importance(ForestTransformer): + # check that the forest transformer does not expose the feature_importances + # parameter + err_msg = "unexpected keyword argument 'feature_importances'" + with pytest.raises(TypeError, match=err_msg): + ForestTransformer(feature_importances="permutation_oob") @pytest.mark.parametrize("ForestClassifier", FOREST_CLASSIFIERS.values()) From 7a44e251b3ed71135aee997a8201e3062410ed42 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Fri, 29 Jan 2021 00:26:24 -0500 Subject: [PATCH 22/53] reformat tests, update example --- doc/whats_new/v1.0.rst | 5 ++ .../inspection/plot_permutation_importance.py | 71 +++++++++------- sklearn/ensemble/_forest.py | 6 +- sklearn/ensemble/tests/test_forest.py | 85 ++++++++++++++----- 4 files changed, 113 insertions(+), 54 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a00523ec2223b..ac569be5b0cb9 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -69,6 +69,11 @@ Changelog target. Additional private refactoring was performed. :pr:`19162` by :user:`Guillaume Lemaitre `. +- |Feature| Implement out-of-bag permutation importances in all classifiers + and regressors that descend from :class:`ensemble.BaseForest`. Accessible + by setting `feature_importances="permutation"`. + :pr:`18603` by :user:`Robert Robison `. 
+ :mod:`sklearn.feature_extraction` ................................. diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index ccad921746da9..78f710b95718a 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -136,10 +136,49 @@ fig.tight_layout() plt.show() +# %% +# Alternatively, the permutation importances can be calculated using the +# out-of-bag data by setting ``feature_importances="permutation_oob"``. This +# shows that the low cardinality categorical feature, ``sex`` is the most +# important. It gives both random features low importance, confirming that it +# avoids the limitations of MDI feature importances. +# +# Out-of-bag permutation importances are better than +# ``inspection.permutation_importance`` when data is limited, as it doesn't +# require a test set. It also has lower variance, since ``n_estimators`` is +# typically much larger than the ``n_repeats`` parameter in +# ``inspection.permutation_importance``. However, out-of-bag estimates are +# only computed after preprocessing. ``inspection.permutation_importance`` +# can be used to inspect at different stages. 
+rf = Pipeline(steps=[ + ("preprocess", preprocessing), + ("classifier", RandomForestClassifier( + random_state=42, feature_importances="permutation_oob")) +]) +rf.fit(X_train, y_train) + +ohe = (rf.named_steps['preprocess'] + .named_transformers_['cat']) +feature_names = ohe.get_feature_names(input_features=categorical_columns) +feature_names = np.r_[feature_names, numerical_columns] + +tree_feature_importances = ( + rf.named_steps['classifier'].feature_importances_) +sorted_idx = tree_feature_importances.argsort() + +y_ticks = np.arange(0, len(feature_names)) +fig, ax = plt.subplots() +ax.barh(y_ticks, tree_feature_importances[sorted_idx]) +ax.set_yticklabels(feature_names[sorted_idx]) +ax.set_yticks(y_ticks) +ax.set_title("Random Forest Feature Importances (OOB Permutation)") +fig.tight_layout() +plt.show() + # %% # As an alternative, the permutation importances of ``rf`` are computed on a -# held out test set. This shows that the low cardinality categorical feature, +# held out test set. This confirms that the low cardinality categorical feature # ``sex`` is the most important feature. # # Also note that both random features have very low importances (close to 0) as @@ -172,33 +211,3 @@ ax.set_title("Permutation Importances (train set)") fig.tight_layout() plt.show() - -# %% -# Finally, the permutation importance can also be calculated using the -# out-of-bag data by setting ``feature_importances="permutation_oob"`` and -# re-running the pipeline. This confirms that ``sex``` is most important -# and that the random features have low importances. 
-rf = Pipeline(steps=[ - ("preprocess", preprocessing), - ("classifier", RandomForestClassifier( - random_state=42, feature_importances="permutation_oob")) -]) -rf.fit(X_train, y_train) - -ohe = (rf.named_steps['preprocess'] - .named_transformers_['cat']) -feature_names = ohe.get_feature_names(input_features=categorical_columns) -feature_names = np.r_[feature_names, numerical_columns] - -tree_feature_importances = ( - rf.named_steps['classifier'].feature_importances_) -sorted_idx = tree_feature_importances.argsort() - -y_ticks = np.arange(0, len(feature_names)) -fig, ax = plt.subplots() -ax.barh(y_ticks, tree_feature_importances[sorted_idx]) -ax.set_yticklabels(feature_names[sorted_idx]) -ax.set_yticks(y_ticks) -ax.set_title("Random Forest Feature Importances (OOB Permutation)") -fig.tight_layout() -plt.show() diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index b018a3dee66fd..a09f0870ef0dd 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -196,6 +196,10 @@ def _permutation_importances_oob( n_samples, n_samples_bootstrap, ) + + if sample_weight is None: + sample_weight = np.ones(n_samples) + return permutation_importance( estimator, X[unsampled_indices, :], @@ -204,7 +208,7 @@ def _permutation_importances_oob( n_repeats=1, n_jobs=1, random_state=random_state, - sample_weight=sample_weight, + sample_weight=sample_weight[unsampled_indices], ).importances[:, 0] diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 42774ba0fd102..f978500d787f7 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -378,25 +378,26 @@ def test_unfitted_feature_importances(name): getattr(FOREST_ESTIMATORS[name](), 'feature_importances_') -@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) -def test_oob_importances(name): +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +def test_classifier_oob_importances(name): # Check that oob 
permutation importances correctly identify that # there are 3 important features - ForestEstimator = FOREST_ESTIMATORS[name] + X, y = datasets.make_classification( + n_samples=500, n_features=10, n_informative=3, n_redundant=0, + n_repeated=0, shuffle=False, random_state=0) - clf = ForestEstimator( + ForestClassifier = FOREST_CLASSIFIERS[name] + + clf = ForestClassifier( n_estimators=10, random_state=0, feature_importances="permutation_oob", bootstrap=True, ) - clf.fit(X_large, y_large) + clf.fit(X, y) importances = clf.feature_importances_ - if name in FOREST_CLASSIFIERS: - imp_level = 0.025 - else: - imp_level = 0.1 + imp_level = 0.025 n_important = np.sum(importances > imp_level) assert importances.shape[0] == 10 @@ -404,25 +405,27 @@ def test_oob_importances(name): assert np.all(importances[:3] > imp_level) -@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) -def test_oob_importances_multi_class(name): +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +def test_regressor_oob_importances(name): # Check that oob permutation importances correctly identify that - # there are 3 important features in a multi-class setting + # there are 3 important features - ForestEstimator = FOREST_ESTIMATORS[name] + X, y = datasets.make_regression( + n_samples=500, n_features=10, shuffle=False, + n_informative=3, random_state=100 + ) - clf = ForestEstimator( + ForestRegressor = FOREST_REGRESSORS[name] + + clf = ForestRegressor( n_estimators=10, random_state=0, feature_importances="permutation_oob", bootstrap=True, ) - clf.fit(X_large_multiclass, y_large_multiclass) + clf.fit(X, y) importances = clf.feature_importances_ - if name in FOREST_CLASSIFIERS: - imp_level = 0.025 - else: - imp_level = 0.1 + imp_level = 0.01 n_important = np.sum(importances > imp_level) assert importances.shape[0] == 10 @@ -436,13 +439,16 @@ def test_oob_importance_ignores_random(name): # important using impurity-based feature importance but not out-of-bag # permutation importance - X = 
X_large.copy() - y = y_large.copy() + X, y = datasets.make_classification( + n_samples=500, n_features=10, n_informative=3, n_redundant=0, + n_repeated=0, shuffle=False, random_state=0) + # Dichotomize all except for the last feature so that one non-informative + # feature has high cardinality while all other features are binary X[:, :-1] = (X[:, :-1] > 0).astype(int) - ForestEstimator = FOREST_ESTIMATORS[name] # Get oob importances + ForestEstimator = FOREST_ESTIMATORS[name] clf_oob = ForestEstimator( n_estimators=10, random_state=0, @@ -479,6 +485,41 @@ def test_oob_importance_ignores_random(name): assert impurity_importances[-1] > imp_level +@pytest.mark.parametrize( + "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() +) +def test_default_sample_weights_oob(ForestEstimator): + # Check that setting sample_weight to np.ones(...) is same as default + n_samples = 500 + X, y = datasets.make_classification( + n_samples=n_samples, n_features=10, n_informative=3, n_redundant=0, + n_repeated=0, shuffle=False, random_state=0) + + # Using default sample_weight + clf_oob_default = ForestEstimator( + n_estimators=10, + random_state=0, + oob_score=True, + feature_importances="permutation_oob", + bootstrap=True, + ) + clf_oob_default.fit(X, y, sample_weight=None) + + # Using np.ones(...) 
+ clf_oob_numpy = ForestEstimator( + n_estimators=10, + random_state=0, + oob_score=True, + feature_importances="permutation_oob", + bootstrap=True, + ) + clf_oob_numpy.fit(X, y, sample_weight=np.ones(n_samples)) + + assert clf_oob_default.oob_score_ == clf_oob_numpy.oob_score_ + assert np.all(clf_oob_default.feature_importances_ == + clf_oob_numpy.feature_importances_) + + @pytest.mark.parametrize( "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() ) From d5849d4a2155e24fa9d15e53a2fde2b0be52c910 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 17:28:13 +0100 Subject: [PATCH 23/53] DOC rework the example --- doc/whats_new/v1.0.rst | 9 +- .../inspection/plot_permutation_importance.py | 203 +++++++++++------- 2 files changed, 134 insertions(+), 78 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index ac569be5b0cb9..774e37967e502 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -69,9 +69,12 @@ Changelog target. Additional private refactoring was performed. :pr:`19162` by :user:`Guillaume Lemaitre `. -- |Feature| Implement out-of-bag permutation importances in all classifiers - and regressors that descend from :class:`ensemble.BaseForest`. Accessible - by setting `feature_importances="permutation"`. +- |Feature| Implement out-of-bag feature permutation importances by setting + the parameter `feature_importances="permutation_oob"` in + :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestRegressor`, + :class:`ensemble.ExtraTreesClassifier`, and + :class:`ensemble.ExtraTreesRegressor`. :pr:`18603` by :user:`Robert Robison `. 
:mod:`sklearn.feature_extraction` diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 78f710b95718a..539e4b030bccd 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -1,11 +1,13 @@ """ -================================================================ -Permutation Importance vs Random Forest Feature Importance (MDI) -================================================================ +====================================================== +Permutation Importance vs Mean Decrease Impurity (MDI) +====================================================== -In this example, we will compare the impurity-based feature importance of -:class:`~sklearn.ensemble.RandomForestClassifier` with the -permutation importance on the titanic dataset using +In this example, we will compare the impurity-based feature importance, +available by default in :class:`~sklearn.ensemble.RandomForestClassifier`, +with the permutation importance on the titanic dataset. This latter strategy +can be computed from two different manner: (i) using the out-of-bag samples +from the random-forest or (ii) by using a held-out dataset and the function :func:`~sklearn.inspection.permutation_importance`. We will show that the impurity-based feature importance can inflate the importance of numerical features. 
@@ -25,23 +27,14 @@ """ print(__doc__) import matplotlib.pyplot as plt -import numpy as np - -from sklearn.datasets import fetch_openml -from sklearn.ensemble import RandomForestClassifier -from sklearn.impute import SimpleImputer -from sklearn.inspection import permutation_importance -from sklearn.compose import ColumnTransformer -from sklearn.model_selection import train_test_split -from sklearn.pipeline import Pipeline -from sklearn.preprocessing import OneHotEncoder - +import sklearn +sklearn.set_config(display="diagram") # %% # Data Loading and Feature Engineering # ------------------------------------ -# Let's use pandas to load a copy of the titanic dataset. The following shows -# how to apply separate preprocessing on numerical and categorical features. +# We will use :func:`~sklearn.datasets.fetch_openml` to fetch the titanic +# dataset from OpenML and load it into a pandas dataframe. # # We further include two random variables that are not correlated in any way # with the target variable (``survived``): @@ -50,11 +43,16 @@ # values as records). # - ``random_cat`` is a low cardinality categorical variable (3 possible # values). +from sklearn.datasets import fetch_openml +import numpy as np + X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) rng = np.random.RandomState(seed=42) X['random_cat'] = rng.randint(3, size=X.shape[0]) X['random_num'] = rng.randn(X.shape[0]) +# %% +from sklearn.model_selection import train_test_split categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] @@ -63,6 +61,15 @@ X_train, X_test, y_train, y_test = train_test_split( X, y, stratify=y, random_state=42) +# %% +# The following shows how to apply separate preprocessing on numerical and +# categorical features. 
+from sklearn.compose import ColumnTransformer +from sklearn.ensemble import RandomForestClassifier +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + categorical_encoder = OneHotEncoder(handle_unknown='ignore') numerical_pipe = Pipeline([ ('imputer', SimpleImputer(strategy='mean')) @@ -118,83 +125,101 @@ # therefore do not reflect the ability of feature to be useful to make # predictions that generalize to the test set (when the model has enough # capacity). +import pandas as pd + ohe = (rf.named_steps['preprocess'] .named_transformers_['cat']) feature_names = ohe.get_feature_names(input_features=categorical_columns) feature_names = np.r_[feature_names, numerical_columns] -tree_feature_importances = ( - rf.named_steps['classifier'].feature_importances_) -sorted_idx = tree_feature_importances.argsort() +tree_feature_importances = pd.Series( + rf.named_steps['classifier'].feature_importances_, + index=feature_names) +# sort the Series for the plotting +tree_feature_importances = tree_feature_importances.sort_values() -y_ticks = np.arange(0, len(feature_names)) -fig, ax = plt.subplots() -ax.barh(y_ticks, tree_feature_importances[sorted_idx]) -ax.set_yticklabels(feature_names[sorted_idx]) -ax.set_yticks(y_ticks) +ax = tree_feature_importances.plot.barh() ax.set_title("Random Forest Feature Importances (MDI)") -fig.tight_layout() +_ = ax.set_xlabel("Mean impurity decrease") plt.show() # %% -# Alternatively, the permutation importances can be calculated using the -# out-of-bag data by setting ``feature_importances="permutation_oob"``. This -# shows that the low cardinality categorical feature, ``sex`` is the most -# important. It gives both random features low importance, confirming that it -# avoids the limitations of MDI feature importances. 
+# Alternative to MDI using Feature Permutation Importance +# ------------------------------------------------------- +# The limitations of MDI pointed out in the previous section can be bypassed +# using an alternative strategy to estimate the feature importances. This +# strategy relies on monitoring the decrease (or not) of a given performance +# metric by randomly permuting the value of a given feature. In short, a +# predictive feature will negatively impact the score when it is randomly +# permuted while a non-predictive feature will not change the score. +# +# This feature permutation importance estimate can be computed in two different +# ways: (i) by using the out-of-bag (OOB) samples in the ensemble to perform the +# permutation and the scoring or (ii) by manually splitting and handling a +# train and test set where the latter will be used with permutations. # -# Out-of-bag permutation importances are better than -# ``inspection.permutation_importance`` when data is limited, as it doesn't -# require a test set. It also has lower variance, since ``n_estimators`` is -# typically much larger than the ``n_repeats`` parameter in -# ``inspection.permutation_importance``. However, out-of-bag estimates are -# only computed after preprocessing. ``inspection.permutation_importance`` -# can be used to inspect at different stages. +# Feature Permutation Importance on Out-Of-Bag (OOB) samples +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Random-forest exposes a parameter `feature_importances` that allows switching +# from the MDI to the permutation importance on the OOB samples. The parameter +# needs to be set to `"permutation_oob"`.
rf = Pipeline(steps=[ ("preprocess", preprocessing), ("classifier", RandomForestClassifier( - random_state=42, feature_importances="permutation_oob")) -]) -rf.fit(X_train, y_train) + feature_importances="permutation_oob", random_state=42)) +]).fit(X_train, y_train) -ohe = (rf.named_steps['preprocess'] - .named_transformers_['cat']) -feature_names = ohe.get_feature_names(input_features=categorical_columns) -feature_names = np.r_[feature_names, numerical_columns] - -tree_feature_importances = ( - rf.named_steps['classifier'].feature_importances_) -sorted_idx = tree_feature_importances.argsort() +# %% +# Once the forest has been trained, the permutation importances have been +# estimated internally on the OOB samples. Thus, the fitted attribute +# `feature_importances_` is now displaying the mean score decrease among all +# trees of the forest for each feature. Thus, we can plot these feature +# importances and compare them with the MDI estimates. +tree_feature_importances = pd.Series( + rf.named_steps['classifier'].feature_importances_, + index=feature_names) +# sort the Series for the plotting +tree_feature_importances = tree_feature_importances.sort_values() -y_ticks = np.arange(0, len(feature_names)) -fig, ax = plt.subplots() -ax.barh(y_ticks, tree_feature_importances[sorted_idx]) -ax.set_yticklabels(feature_names[sorted_idx]) -ax.set_yticks(y_ticks) +ax = tree_feature_importances.plot.barh() ax.set_title("Random Forest Feature Importances (OOB Permutation)") -fig.tight_layout() -plt.show() - +_ = ax.set_xlabel("Mean accuracy decrease") # %% -# As an alternative, the permutation importances of ``rf`` are computed on a -# held out test set. This confirms that the low cardinality categorical feature -# ``sex`` is the most important feature. +# With this strategy, the low cardinality categorical feature, ``sex`` is the +# most important. It gives both random features low importance, confirming that +# it avoids the limitations of MDI feature importances.
+# +# Feature Permutation Importance on train-test sets +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# In the previous section, we show how one can leverage the OOB samples to +# compute the permutation importance. However, this is also possible to use +# the same strategy but manipulating a train and a test sets. # -# Also note that both random features have very low importances (close to 0) as -# expected. +# We illustrate such strategy by using the function +# :func:`~sklearn.inspection.permutation_importance`. Note that this way of +# computing the feature importance is model agnostic while the previous methods +# rely on the forest models. +from sklearn.inspection import permutation_importance + result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2) -sorted_idx = result.importances_mean.argsort() +tree_feature_importances = pd.DataFrame( + result.importances.T, columns=X_test.columns) +# sort (reorder columns) the DataFrame for the plotting +tree_feature_importances = tree_feature_importances.reindex( + tree_feature_importances.mean().sort_values().index, + axis="columns") -fig, ax = plt.subplots() -ax.boxplot(result.importances[sorted_idx].T, - vert=False, labels=X_test.columns[sorted_idx]) +ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Permutation Importances (test set)") -fig.tight_layout() -plt.show() +_ = ax.set_xlabel("Accuracy decrease") # %% +# As with the permutation importance using the OOB samples, the low cardinality +# categorical feature ``sex`` is the most important feature. Also note that +# both random features have very low importances (close to 0) as expected. +# # It is also possible to compute the permutation importances on the training # set. This reveals that ``random_num`` gets a significantly higher importance # ranking than when computed on the test set. 
The difference between those two @@ -203,11 +228,39 @@ # re-running this example with constrained RF with min_samples_leaf=10. result = permutation_importance(rf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2) -sorted_idx = result.importances_mean.argsort() +tree_feature_importances = pd.DataFrame( + result.importances.T, columns=X_test.columns) +# sort (reorder columns) the DataFrame for the plotting +tree_feature_importances = tree_feature_importances.reindex( + tree_feature_importances.mean().sort_values().index, + axis="columns") -fig, ax = plt.subplots() -ax.boxplot(result.importances[sorted_idx].T, - vert=False, labels=X_train.columns[sorted_idx]) +ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Permutation Importances (train set)") -fig.tight_layout() +ax.set_xlabel("Accuracy decrease") plt.show() + +# %% +# Final words +# ----------- +# As presented, the feature permutation importances can be computed either +# on the OOB samples or on separated datasets. +# +# While they are similar, it should be noted that the variations of the +# importances is estimated differently: the variance of the decrease of the +# score is estimated across the number of trees (i.e.`n_estimators` +# parameter) in the forest while it is estimated via the number of repeated +# permutation (i.e. `n_repeats`) in the other strategy. +# +# Therefore, using the permutation on the OOB samples could be interesting +# when a limited amount of data is at hand. Also, it might provide a faster way +# to evaluate the importances when setting the equivalence +# `n_repeats=n_estimators`. +# +# However, as shown in the previous plots, the permutation importances on the +# OOB will give a score on the random forest input features only. It means that +# this strategy does not allow to get information from original features, +# upstream from the random-forest. 
Computing the permutation importances on +# held-out train-test sets allows to apply or not a sequence of pre-processing +# and thus to know estimate the feature importances from any step of a +# machine-learning pipeline. From 52529004adf57b5bd90b152806cab5a529332d7c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 18:33:03 +0100 Subject: [PATCH 24/53] glitch --- examples/inspection/plot_permutation_importance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 539e4b030bccd..e3fb68827f9b6 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -248,7 +248,7 @@ # # While they are similar, it should be noted that the variations of the # importances is estimated differently: the variance of the decrease of the -# score is estimated across the number of trees (i.e.`n_estimators` +# score is estimated across the number of trees (i.e.``n_estimators`` # parameter) in the forest while it is estimated via the number of repeated # permutation (i.e. `n_repeats`) in the other strategy. # From 33953095d92510b904351d5b9b66eb6eb1a6724c Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 19:36:28 +0100 Subject: [PATCH 25/53] DOC update user guide --- doc/modules/ensemble.rst | 67 +++++++++++++++---- .../inspection/plot_permutation_importance.py | 2 +- 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 0e0aaaafaffba..8934067715e83 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -262,6 +262,15 @@ amount of time (e.g., on large datasets). 
Feature importance evaluation ----------------------------- +Both random-forest and extremely randomized trees estimators provides a fitted +attribute `feature_importances_` giving an estimate of the relative feature +importance. Two strategies are available to estimate the feature importances. +It can be set with the parameter `feature_importances`. The following sections +give information regarding the strategies to estimate the feature importance. + +Mean decrease in impurity (MDI) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + The relative rank (i.e. depth) of a feature used as a decision node in a tree can be used to assess the relative importance of that feature with respect to the predictability of the target variable. Features used at @@ -279,18 +288,50 @@ for feature selection. This is known as the mean decrease in impurity, or MDI. Refer to [L2014]_ for more information on MDI and feature importance evaluation with Random Forests. +This strategy corresponds to setting `feature_importances="impurity"` which is +the default values. + .. warning:: The impurity-based feature importances computed on tree-based models suffer - from two flaws that can lead to misleading conclusions. First they are - computed on statistics derived from the training dataset and therefore **do - not necessarily inform us on which features are most important to make good - predictions on held-out dataset**. Secondly, **they favor high cardinality - features**, that is features with many unique values. - :ref:`permutation_importance` is an alternative to impurity-based feature - importance that does not suffer from these flaws. These two methods of - obtaining feature importance are explored in: - :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py`. 
+ from two flaws that can lead to misleading conclusions: + + **First**, they are computed on statistics derived from the training dataset + and therefore **do not necessarily inform us on which features are most + important to make good predictions on held-out dataset. + + **Secondly**, they favor high cardinality features**, that is features with + many unique values. + + Features importances estimated through feature permutation is an alternative + that does not suffer from these flaws. We give more details regarding this + alternative in the next section. + +Feature permutation importances on out-of-bag (OOB) samples +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +An alternative to MDI is the feature importances that uses feature permutation. +Each tree in the ensemble can be evaluated using the out-of-bag samples +[B2001]_. To know the importance of a feature, one can compute the difference +between the tree score with the original OOB sample and an OOB sample for which +the feature of interest will be permuted. When a feature is predictive, one +expects the score to decrease. If instead the score remains unchanged, the +feature is not important at predicting the target. Thus, the feature +importances correspond to the average of the decrease of the tree score. + +This strategy can be selected by setting +`feature_importances="permutation_oob"`. + +.. note:: + + :ref:`permutation_importance` can also be evaluated on a held-out set by + manually splitting the dataset into a train and a test sets. In this case, + the permutation procedure is applied on the test set rather than on the OOB + samples. The :func:`~sklearn.inspection.permutation_importance` should be + used in this case.
+ +Illustration of using feature importances +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The following example shows a color-coded representation of the relative importances of each individual pixel for a face recognition task using @@ -301,16 +342,14 @@ a :class:`ExtraTreesClassifier` model. :align: center :scale: 75 -In practice those estimates are stored as an attribute named -``feature_importances_`` on the fitted model. This is an array with shape -``(n_features,)`` whose values are positive and sum to 1.0. The higher -the value, the more important is the contribution of the matching feature -to the prediction function. +MDI and the permutation feature importances are explored in: + :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py`. .. topic:: Examples: * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances_faces.py` * :ref:`sphx_glr_auto_examples_ensemble_plot_forest_importances.py` + * :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py` .. topic:: References diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index e3fb68827f9b6..56ffa0c65c87b 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -225,7 +225,7 @@ # ranking than when computed on the test set. The difference between those two # plots is a confirmation that the RF model has enough capacity to use that # random numerical feature to overfit. You can further confirm this by -# re-running this example with constrained RF with min_samples_leaf=10. +# re-running this example with constrained RF with `min_samples_leaf=10`. 
result = permutation_importance(rf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2) tree_feature_importances = pd.DataFrame( From 81beb84d6cbf433e672d96d54298d95f8fe86137 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 21:16:45 +0100 Subject: [PATCH 26/53] DOC solve title marker --- doc/modules/ensemble.rst | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 8934067715e83..fd48bdee2eb23 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -269,7 +269,7 @@ It can be set with the parameter `feature_importances`. The following sections give information regarding the strategies to estimate the feature importance. Mean decrease in impurity (MDI) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The relative rank (i.e. depth) of a feature used as a decision node in a tree can be used to assess the relative importance of that feature with @@ -296,19 +296,18 @@ the default values. The impurity-based feature importances computed on tree-based models suffer from two flaws that can lead to misleading conclusions: - **First**, they are computed on statistics derived from the training dataset - and therefore **do not necessarily inform us on which features are most - important to make good predictions on held-out dataset. - - **Secondly**, they favor high cardinality features**, that is features with - many unique values. + - Firstly, they are computed on statistics derived from the training + dataset and therefore **do not necessarily inform us on which features are + most important to make good predictions on held-out dataset**. + - Secondly, they favor **high cardinality features**, that is features with + many unique values. Features importances estimated through feature permutation is an alternative that does not suffer from these flaws. We give more details regarding this alternative in the next section. 
Feature permutation importances on out-of-bag (OOB) samples -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ An alternative to MDI is the feature importances that uses feature permutation. Each tree in the ensemble can be evaluated using the out-of-bag samples @@ -331,7 +330,7 @@ This strategy can be selected by setting used in this case. Illustration of using feature importances -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The following example shows a color-coded representation of the relative importances of each individual pixel for a face recognition task using @@ -660,7 +659,7 @@ chapter on gradient boosting in [F2001]_ and is related to the parameter ``interaction.depth`` in R's gbm package where ``max_leaf_nodes == interaction.depth + 1`` . Mathematical formulation -------------------------- +------------------------ We first present GBRT for regression, and then detail the classification case. From 8129850bc3153defdbf368b6e43b82516867c10e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 21:53:40 +0100 Subject: [PATCH 27/53] DOC improve example regarding feature importance --- examples/ensemble/plot_forest_importances.py | 145 +++++++++++++------ 1 file changed, 101 insertions(+), 44 deletions(-) diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index b95329c76f036..df782124d1331 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -1,60 +1,117 @@ """ ========================================= -Feature importances with forests of trees +Feature importances with forest of trees ========================================= -This examples shows the use of forests of trees to evaluate the importance of -features on an artificial classification task. 
The red bars are -the impurity-based feature importances of the forest, -along with their inter-trees variability. +This example shows the use of forest of trees to evaluate the importance of +features on an artificial classification task. -As expected, the plot suggests that 3 features are informative, while the -remaining are not. +We show two strategies to estimate the feature importances: (i) the +impurity-based feature importances and (ii) the permutation feature +importances on out-of-bag (OOB) samples. .. warning:: Impurity-based feature importances can be misleading for high cardinality - features (many unique values). See - :func:`sklearn.inspection.permutation_importance` as an alternative. + features (many unique values). Check the documentation of the + `feature_importances` parameter to have more details regarding the + alternative as the permutation feature importances. """ print(__doc__) - -import numpy as np +# %% import matplotlib.pyplot as plt +# %% +# We generate a synthetic dataset with only 3 informative features. We will +# explicitely not shuffle the dataset to ensure that the informative features +# correspond to the three first columns of `X`. from sklearn.datasets import make_classification -from sklearn.ensemble import ExtraTreesClassifier - -# Build a classification task using 3 informative features -X, y = make_classification(n_samples=1000, - n_features=10, - n_informative=3, - n_redundant=0, - n_repeated=0, - n_classes=2, - random_state=0, - shuffle=False) - -# Build a forest and compute the impurity-based feature importances -forest = ExtraTreesClassifier(n_estimators=250, - random_state=0) +X, y = make_classification( + n_samples=1000, n_features=10, n_informative=3, + n_redundant=0, n_repeated=0, n_classes=2, + random_state=0, shuffle=False) +feature_names = [f"#{i + 1}" for i in range(X.shape[1])] + +# %% +# Impurity-based feature importances +# ---------------------------------- +# We start by fitting a random-forest. 
We explicitely request to compute the +# impurity-based feature importance. Note that this is the default value. +import pandas as pd +import time +from sklearn.ensemble import RandomForestClassifier + +forest = RandomForestClassifier( + n_estimators=250, feature_importances="impurity", random_state=0) + +start_time = time.time() forest.fit(X, y) -importances = forest.feature_importances_ -std = np.std([tree.feature_importances_ for tree in forest.estimators_], - axis=0) -indices = np.argsort(importances)[::-1] - -# Print the feature ranking -print("Feature ranking:") - -for f in range(X.shape[1]): - print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]])) - -# Plot the impurity-based feature importances of the forest -plt.figure() -plt.title("Feature importances") -plt.bar(range(X.shape[1]), importances[indices], - color="r", yerr=std[indices], align="center") -plt.xticks(range(X.shape[1]), indices) -plt.xlim([-1, X.shape[1]]) +forest_importances = pd.Series( + forest.feature_importances_, index=feature_names) +elapsed_time = time.time() - start_time +print(f"Elapsed time to fit and compute the importances: " + f"{elapsed_time:.3f} seconds") + +# %% +# Impurity-based feature importances is relatively fast to compute. It only +# requires to fit the forest and store information regarding the different +# splits of trees. When the feature importances is requested, the splits of +# all trees are introspected to compute the mean decrease in impurity (MDI). +# +# Let's plot the feature importances ranking. +import numpy as np + +yerr = np.std([ + tree.feature_importances_ for tree in forest.estimators_], + axis=0) +ax = forest_importances.plot.bar(yerr=yerr) +ax.set_title("Feature importances using MDI") +_ = ax.set_ylabel("Mean impurity decrease") + +# %% +# We observe that the three important features are reported correctly. We also +# observe that non-informative features do not have a null importance. 
Indeed, +# theses features have been used by some of the trees that tend to overfit on +# some noisy samples. +# +# Feature permutation importances on OOB samples +# ---------------------------------------------- +# We will an alternative to the impurity-based feature importances based on +# feature permutation using the OOB samples. We fit a new random-forest where +# we explicitely specify to compute the permutation feature importances on OOB. +forest = RandomForestClassifier( + n_estimators=250, feature_importances="permutation_oob", random_state=0) + +start_time = time.time() +forest.fit(X, y) +forest_importances = pd.Series( + forest.feature_importances_, index=feature_names) +elapsed_time = time.time() - start_time +print(f"Elapsed time to fit and compute the importances: " + f"{elapsed_time:.3f} seconds") + +# %% +# The permutation importances is more computationally costly. Indeed, it +# requires to fit the tree and to make additional processing: each tree will +# be evaluated on its OOB sample as well as an OOB sample where features will +# be permuted. This step is costly and explains the time fitting difference +# between of the two forests. +# +# We now plot the feature importance ranking. +yerr = np.std([ + tree.feature_importances_ for tree in forest.estimators_], + axis=0) +ax = forest_importances.plot.bar(yerr=yerr) +ax.set_title("Feature importances using permutation on OOB") +ax.set_ylabel("Mean accuracy decrease") plt.show() + +# %% +# As in the impurity-based, the three most important features are detected. +# We see that non-important features have a mean decrease accuracy of zeros. +# Indeed, permuted these features did not have an impact on the score. +# Another difference between the two feature importances is the scale of the +# reported values. The permutation importances corresponds to a difference of +# scores and it is not further normalized. 
With impurity-based feature +# importances, the reported values are normalized: the sum of the importances across +# features is equal to 1. From 5b41963aa28d346078f39a720c0d5693a9dd3ad6 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 22:23:15 +0100 Subject: [PATCH 28/53] DOC add new attributes importances_ --- examples/ensemble/plot_forest_importances.py | 13 +-- sklearn/ensemble/_forest.py | 108 +++++++++++++------ 2 files changed, 78 insertions(+), 43 deletions(-) diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index df782124d1331..7b4c292605666 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -17,7 +17,6 @@ alternative as the permutation feature importances. """ print(__doc__) -# %% import matplotlib.pyplot as plt # %% @@ -59,12 +58,7 @@ # all trees are introspected to compute the mean decrease in impurity (MDI). # # Let's plot the feature importances ranking. -import numpy as np - -yerr = np.std([ - tree.feature_importances_ for tree in forest.estimators_], - axis=0) -ax = forest_importances.plot.bar(yerr=yerr) +ax = forest_importances.plot.bar(yerr=forest.importances_.importances_std) ax.set_title("Feature importances using MDI") _ = ax.set_ylabel("Mean impurity decrease") @@ -98,10 +92,7 @@ # between of the two forests. # # We now plot the feature importance ranking.
-yerr = np.std([ - tree.feature_importances_ for tree in forest.estimators_], - axis=0) -ax = forest_importances.plot.bar(yerr=yerr) +ax = forest_importances.plot.bar(yerr=forest.importances_.importances_std) ax.set_title("Feature importances using permutation on OOB") ax.set_ylabel("Mean accuracy decrease") plt.show() diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index a09f0870ef0dd..4d0b0b581f177 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -467,8 +467,17 @@ def fit(self, X, y, sample_weight=None): ) if self.oob_score: self._set_oob_score_and_attributes(X, y, sample_weight) - if self.feature_importances == "permutation_oob": - self._set_oob_importances(X, y, sample_weight) + + if self.feature_importances == "impurity": + importances = self._compute_impurity_importances() + else: + importances = self._compute_oob_importances(X, y, sample_weight) + + self.importances_ = Bunch( + importances_mean=importances.mean(axis=1), + importances_std=importances.std(axis=1), + importances=importances, + ) # Decapsulate classes_ attributes if hasattr(self, "classes_") and self.n_outputs_ == 1: @@ -477,6 +486,21 @@ def fit(self, X, y, sample_weight=None): return self + def _compute_impurity_importances(self): + parallel_args = { + **_joblib_parallel_args(prefer="threads"), + "n_jobs": self.n_jobs + } + all_importances = Parallel(**parallel_args)( + delayed(getattr)(tree, 'feature_importances_') + for tree in self.estimators_ if tree.tree_.node_count > 1 + ) + if not all_importances: + return np.zeros( + shape=(self.n_features_, 1), dtype=np.float64 + ) + return np.transpose(all_importances) + @abstractmethod def _set_oob_score_and_attributes(self, X, y, sample_weight): """Compute and set the OOB score and attributes. @@ -493,8 +517,8 @@ def _set_oob_score_and_attributes(self, X, y, sample_weight): Sample weights. 
""" - def _set_oob_importances(self, X, y, sample_weight): - """Compute and set importances by permuting features using OOB samples. + def _compute_oob_importances(self, X, y, sample_weight): + """Compute importances by permuting features using OOB samples. Parameters ---------- @@ -532,11 +556,7 @@ def _set_oob_importances(self, X, y, sample_weight): for estimator in self.estimators_ )) - self._oob_permutation_importance = Bunch( - importances_mean=oob_importances.mean(axis=1), - importances_std=oob_importances.std(axis=1), - importances=oob_importances, - ) + return oob_importances def _compute_oob_predictions(self, X): """Compute and accumulate predictions of OOB samples. @@ -631,30 +651,14 @@ def feature_importances_(self): array of zeros. """ check_is_fitted(self) - if self.feature_importances == "permutation_oob": - feature_importances_ = \ - self._oob_permutation_importance.importances_mean - else: # impurity-based feature importance - parallel_args = { - **_joblib_parallel_args(prefer="threads"), - "n_jobs": self.n_jobs - } - all_importances = Parallel(**parallel_args)( - delayed(getattr)(tree, 'feature_importances_') - for tree in self.estimators_ if tree.tree_.node_count > 1 - ) - - if not all_importances: - feature_importances_ = np.zeros( - self.n_features_, dtype=np.float64 - ) - else: - feature_importances_ = np.mean( - all_importances, axis=0, dtype=np.float64 - ) - feature_importances_ /= np.sum(feature_importances_) - return feature_importances_ + return self.importances_.importances_mean + # impurity-based feature importances + importances = self.importances_.importances_mean + if np.allclose(importances, 0.0): + # avoid division by zero + return importances + return importances / importances.sum() def _accumulate_prediction(predict, X, out, lock): @@ -1336,6 +1340,16 @@ class labels (multi-output problem). importances computed on the out-of-bag samples or use the function :func:`sklearn.inspection.permutation_importance` as an alternative. 
+ importances_ : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + importances_mean : ndarray, shape (n_features,) + Mean of feature importance over `n_estimators`. + importances_std : ndarray, shape (n_features,) + Standard deviation over `n_estimators`. + importances : ndarray, shape (n_features, n_estimators) + Raw permutation importance scores. + oob_score_ : float Score of the training dataset obtained using an out-of-bag estimate. This attribute exists only when ``oob_score`` is True. @@ -1646,6 +1660,16 @@ class RandomForestRegressor(ForestRegressor): importances computed on the out-of-bag samples or use the function :func:`sklearn.inspection.permutation_importance` as an alternative. + importances_ : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + importances_mean : ndarray, shape (n_features,) + Mean of feature importance over `n_estimators`. + importances_std : ndarray, shape (n_features,) + Standard deviation over `n_estimators`. + importances : ndarray, shape (n_features, n_estimators) + Raw permutation importance scores. + n_features_ : int The number of features when ``fit`` is performed. @@ -1993,6 +2017,16 @@ class labels (multi-output problem). importances computed on the out-of-bag samples or use the function :func:`sklearn.inspection.permutation_importance` as an alternative. + importances_ : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + importances_mean : ndarray, shape (n_features,) + Mean of feature importance over `n_estimators`. + importances_std : ndarray, shape (n_features,) + Standard deviation over `n_estimators`. + importances : ndarray, shape (n_features, n_estimators) + Raw permutation importance scores. + n_features_ : int The number of features when ``fit`` is performed. 
@@ -2305,6 +2339,16 @@ class ExtraTreesRegressor(ForestRegressor): importances computed on the out-of-bag samples or use the function :func:`sklearn.inspection.permutation_importance` as an alternative. + importances_ : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + importances_mean : ndarray, shape (n_features,) + Mean of feature importance over `n_estimators`. + importances_std : ndarray, shape (n_features,) + Standard deviation over `n_estimators`. + importances : ndarray, shape (n_features, n_estimators) + Raw permutation importance scores. + n_features_ : int The number of features. From c885a408ff60cf4adc185bd0114ba63990138986 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 22:28:56 +0100 Subject: [PATCH 29/53] DOC update whats new --- doc/whats_new/v1.0.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 774e37967e502..0412b499ffa6d 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -77,6 +77,12 @@ Changelog :class:`ensemble.ExtraTreesRegressor`. :pr:`18603` by :user:`Robert Robison `. +- |Feature| A new fitted attribute `importances_` has been introduced reporting + the impurity-based or permutation feature importances. This attribute is a + :class:`~sklearn.utils.Bunch` storing the raw, averaged, and variations of + the importances across all trees of the forest. + :pr:`18603` by :user:`Robert Robison `. + :mod:`sklearn.feature_extraction` ................................. 
From 4056f8d2537df3b0f5ec2b3f94f17e9b604f02b2 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 22:37:16 +0100 Subject: [PATCH 30/53] update docstring feature_importances_ --- sklearn/ensemble/_forest.py | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 4d0b0b581f177..0ea44ea179fbc 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -632,23 +632,32 @@ def _validate_X_predict(self, X): @property def feature_importances_(self): """ - The impurity-based feature importances. + The feature importances. - The higher, the more important the feature. - The importance of a feature is computed as the (normalized) - total reduction of the criterion brought by that feature. It is also - known as the Gini importance. + The higher, the more important the feature. There are two possible + strategies: - Warning: impurity-based feature importances can be misleading for - high cardinality features (many unique values). See - :func:`sklearn.inspection.permutation_importance` as an alternative. + - if `feature_importances="impurity"`, the impurity-based feature + importances are reported. The importance of a feature is computed as + the (normalized) total reduction of the criterion brought by that + feature. It is also known as the Gini importance; + - if `feature_importances="permutation_oob"`, the permutation feature + importances on out-of-bag samples are reported. Returns ------- feature_importances_ : ndarray of shape (n_features,) - The values of this array sum to 1, unless all trees are single node - trees consisting of only the root node, in which case it will be an - array of zeros. + The values of the feature importances corresponding to either the + impurity-based feature importances or the permutation feature + importances.
+ + If the impurity-based feature importances is reported, the values + of this array sum to 1, unless all trees are single node trees + consisting of only the root node, in which case it will be an array + of zeros. + + If the permutation feature importances is reported, the values + corresponds to the raw decrease of the score. """ check_is_fitted(self) if self.feature_importances == "permutation_oob": From dbba6cc643672094bb5f09291d5b9b3ab239755d Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 22:54:16 +0100 Subject: [PATCH 31/53] TST add test for importances_ attribute --- sklearn/ensemble/tests/test_forest.py | 35 +++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index f978500d787f7..bae7a615eac49 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -26,6 +26,7 @@ import joblib +from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal @@ -484,6 +485,40 @@ def test_oob_importance_ignores_random(name): assert np.all(impurity_importances[:3] > imp_level) assert impurity_importances[-1] > imp_level +@pytest.mark.parametrize( + "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() +) +@pytest.mark.parametrize( + "params", + [ + {"feature_importances": "permutation_oob", "bootstrap": True}, + {"feature_importances": "impurity"} + ] +) +def test_forest_importances_attribute(ForestEstimator, params): + # check the fitted attribute `importances_` + n_samples, n_features, n_estimators = 500, 5, 10 + X, y = make_classification( + n_samples=n_samples, n_features=n_features, random_state=42, + ) + + forest = ForestEstimator(n_estimators=n_estimators, **params).fit(X, y) + assert forest.importances_.importances_mean.shape == (n_features,) + 
assert forest.importances_.importances_std.shape == (n_features,) + assert forest.importances_.importances.shape == (n_features, n_estimators) + + if params["feature_importances"] == "impurity": + # impurity-based feature importances are normalized + assert_allclose( + forest.importances_.importances_mean, + (forest.feature_importances_ / + forest.importances_.importances_mean.sum()) + ) + else: + assert_allclose( + forest.importances_.importances_mean, forest.feature_importances_ + ) + @pytest.mark.parametrize( "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() From a53ba264e574f7eb27e5544ffa40de637413c741 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 22:54:34 +0100 Subject: [PATCH 32/53] PEP8 --- sklearn/ensemble/tests/test_forest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index bae7a615eac49..0558c605164b4 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -485,6 +485,7 @@ def test_oob_importance_ignores_random(name): assert np.all(impurity_importances[:3] > imp_level) assert impurity_importances[-1] > imp_level + @pytest.mark.parametrize( "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() ) From 005f7886993d903aa0d7e9f628ab6f51f2ddec97 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 23:25:51 +0100 Subject: [PATCH 33/53] TST improve couple of assert --- sklearn/ensemble/tests/test_forest.py | 157 +++++++++++++------------- 1 file changed, 78 insertions(+), 79 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 0558c605164b4..5885f86059ac2 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -30,6 +30,7 @@ from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import 
assert_array_equal +from sklearn.utils._testing import assert_array_less from sklearn.utils._testing import assert_raises from sklearn.utils._testing import assert_warns from sklearn.utils._testing import assert_warns_message @@ -41,6 +42,7 @@ from sklearn.exceptions import NotFittedError from sklearn import datasets +from sklearn.base import clone, is_classifier from sklearn.decomposition import TruncatedSVD from sklearn.datasets import make_classification from sklearn.ensemble import ExtraTreesClassifier @@ -379,110 +381,103 @@ def test_unfitted_feature_importances(name): getattr(FOREST_ESTIMATORS[name](), 'feature_importances_') -@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) -def test_classifier_oob_importances(name): +@pytest.mark.parametrize( + "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() +) +def test_forest_estimator_oob_importances(ForestEstimator): # Check that oob permutation importances correctly identify that # there are 3 important features - - X, y = datasets.make_classification( - n_samples=500, n_features=10, n_informative=3, n_redundant=0, - n_repeated=0, shuffle=False, random_state=0) - - ForestClassifier = FOREST_CLASSIFIERS[name] - - clf = ForestClassifier( + n_samples, n_features, n_informative, n_redundant, n_repeated = ( + 500, 10, 3, 0, 0 + ) + estimator = ForestEstimator( n_estimators=10, - random_state=0, feature_importances="permutation_oob", bootstrap=True, - ) - clf.fit(X, y) - importances = clf.feature_importances_ - imp_level = 0.025 - - n_important = np.sum(importances > imp_level) - assert importances.shape[0] == 10 - assert n_important == 3 - assert np.all(importances[:3] > imp_level) - - -@pytest.mark.parametrize("name", FOREST_REGRESSORS) -def test_regressor_oob_importances(name): - # Check that oob permutation importances correctly identify that - # there are 3 important features - - X, y = datasets.make_regression( - n_samples=500, n_features=10, shuffle=False, - n_informative=3, random_state=100 + 
random_state=0, ) - ForestRegressor = FOREST_REGRESSORS[name] + if is_classifier(estimator): + X, y = datasets.make_classification( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + n_redundant=n_redundant, + n_repeated=n_repeated, + shuffle=False, + random_state=0, + ) + imp_level = 0.025 + else: + X, y = datasets.make_regression( + n_samples=n_samples, + n_features=n_features, + n_informative=n_informative, + shuffle=False, + random_state=100 + ) + imp_level = 0.1 - clf = ForestRegressor( - n_estimators=10, - random_state=0, - feature_importances="permutation_oob", - bootstrap=True, - ) - clf.fit(X, y) - importances = clf.feature_importances_ - imp_level = 0.01 + estimator.fit(X, y) + importances = estimator.feature_importances_ n_important = np.sum(importances > imp_level) - assert importances.shape[0] == 10 - assert n_important == 3 - assert np.all(importances[:3] > imp_level) + assert importances.shape == (n_features,) + assert n_important == n_informative + assert_array_less(importances[n_informative:], imp_level) -@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) -def test_oob_importance_ignores_random(name): +@pytest.mark.parametrize( + "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() +) +def test_forest_estimator_oob_importance_ignores_random(ForestEstimator): # Testing that a random feature with high cardinality registers as # important using impurity-based feature importance but not out-of-bag # permutation importance - + n_informative = 3 X, y = datasets.make_classification( - n_samples=500, n_features=10, n_informative=3, n_redundant=0, - n_repeated=0, shuffle=False, random_state=0) + n_samples=500, + n_features=10, + n_informative=n_informative, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0 + ) # Dichotomize all except for the last feature so that one non-informative # feature has high cardinality while all other features are binary X[:, :-1] = (X[:, :-1] > 0).astype(int) # Get 
oob importances - ForestEstimator = FOREST_ESTIMATORS[name] - clf_oob = ForestEstimator( + estimator_oob = ForestEstimator( n_estimators=10, random_state=0, feature_importances="permutation_oob", bootstrap=True, - ) - clf_oob.fit(X, y) - oob_importances = clf_oob.feature_importances_ + ).fit(X, y) + oob_importances = estimator_oob.feature_importances_ # Get impurity-based importances - clf_impurity = ForestEstimator( + estimator_impurity = ForestEstimator( n_estimators=10, random_state=0, feature_importances="impurity", bootstrap=True, - ) - clf_impurity.fit(X, y) - impurity_importances = clf_impurity.feature_importances_ + ).fit(X, y) + impurity_importances = estimator_impurity.feature_importances_ # Test importance levels imp_level = 0.1 - if name in FOREST_CLASSIFIERS: - oob_imp_level = 0.025 - else: - oob_imp_level = 0.1 + oob_imp_level = 0.025 if is_classifier(estimator_oob) else 0.1 + oob_important = np.sum(oob_importances > oob_imp_level) impurity_important = np.sum(impurity_importances > imp_level) - assert oob_important == 3 - assert np.all(oob_importances[:3] > oob_imp_level) - assert oob_importances[-1] < oob_imp_level - assert impurity_important == 4 - assert np.all(impurity_importances[:3] > imp_level) + assert oob_important == n_informative + assert_array_less(oob_importances[n_informative:], oob_imp_level) + assert impurity_important == (n_informative + 1) + assert_array_less(impurity_importances[n_informative:-1], imp_level) assert impurity_importances[-1] > imp_level @@ -528,32 +523,36 @@ def test_default_sample_weights_oob(ForestEstimator): # Check that setting sample_weight to np.ones(...) 
is same as default n_samples = 500 X, y = datasets.make_classification( - n_samples=n_samples, n_features=10, n_informative=3, n_redundant=0, - n_repeated=0, shuffle=False, random_state=0) + n_samples=n_samples, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, + ) # Using default sample_weight clf_oob_default = ForestEstimator( n_estimators=10, - random_state=0, oob_score=True, feature_importances="permutation_oob", bootstrap=True, + random_state=0, ) clf_oob_default.fit(X, y, sample_weight=None) # Using np.ones(...) - clf_oob_numpy = ForestEstimator( - n_estimators=10, - random_state=0, - oob_score=True, - feature_importances="permutation_oob", - bootstrap=True, - ) + clf_oob_numpy = clone(clf_oob_default) clf_oob_numpy.fit(X, y, sample_weight=np.ones(n_samples)) - assert clf_oob_default.oob_score_ == clf_oob_numpy.oob_score_ - assert np.all(clf_oob_default.feature_importances_ == - clf_oob_numpy.feature_importances_) + assert ( + clf_oob_default.oob_score_ == pytest.approx(clf_oob_numpy.oob_score_) + ) + assert_allclose( + clf_oob_default.feature_importances_, + clf_oob_numpy.feature_importances_, + ) @pytest.mark.parametrize( From cd42500760925a8055c4248f8d05f083e1364c91 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 23:32:12 +0100 Subject: [PATCH 34/53] DOC add missing documentation --- sklearn/ensemble/_forest.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index 0ea44ea179fbc..fa679ebc20637 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -487,6 +487,13 @@ def fit(self, X, y, sample_weight=None): return self def _compute_impurity_importances(self): + """Compute the impurity-based feature importances. + + Returns + ------- + importances : ndarray of shape (n_features,) + The impurity-based feature importances. 
+ """ parallel_args = { **_joblib_parallel_args(prefer="threads"), "n_jobs": self.n_jobs @@ -530,6 +537,11 @@ def _compute_oob_importances(self, X, y, sample_weight): sample_weight : ndarray of shape (n_samples,) Sample weights. + + Returns + ------- + oob_importances : ndarray of shape (n_features, n_estimators) + The permutation feature importances computed on the OOB samples. """ X = check_array(X, dtype=DTYPE, accept_sparse="csr") random_state = check_random_state(self.random_state) @@ -2586,6 +2598,16 @@ class RandomTreesEmbedding(BaseForest): feature_importances_ : ndarray of shape (n_features,) The feature importances (the higher, the more important the feature). + importances_ : :class:`~sklearn.utils.Bunch` + Dictionary-like object, with the following attributes. + + importances_mean : ndarray, shape (n_features,) + Mean of feature importance over `n_estimators`. + importances_std : ndarray, shape (n_features,) + Standard deviation over `n_estimators`. + importances : ndarray, shape (n_features, n_estimators) + Raw permutation importance scores. + n_features_ : int The number of features when ``fit`` is performed. 
From 5f78f3a3ebb9563385d7fde36f7e5df946f7ab12 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 23:34:21 +0100 Subject: [PATCH 35/53] clean-up --- sklearn/ensemble/tests/test_forest.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 5885f86059ac2..98c4010f2cea2 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -69,11 +69,6 @@ n_samples=500, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) -# Larger multiclass classification sample used for testing feature importances -X_large_multiclass, y_large_multiclass = datasets.make_classification( - n_samples=500, n_features=10, n_informative=3, n_redundant=0, - n_repeated=0, n_classes=3, shuffle=False, random_state=42) - # also load the iris dataset # and randomly permute it iris = datasets.load_iris() From d741ea8002c037186da0d784c0f6c03e5c5cb3e7 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 23:35:38 +0100 Subject: [PATCH 36/53] small fix --- examples/ensemble/plot_forest_importances.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index 7b4c292605666..7a791575d1c02 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -1,7 +1,7 @@ """ -========================================= +======================================== Feature importances with forest of trees -========================================= +======================================== This example shows the use of forest of trees to evaluate the importance of features on an artificial classification task. 
From 1b4a779d7e19a94afacf885607252588e02adf4e Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Fri, 29 Jan 2021 23:54:59 +0100 Subject: [PATCH 37/53] DOC use boxplot for all plot in example --- .../inspection/plot_permutation_importance.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 56ffa0c65c87b..8729400ea8c78 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -132,16 +132,17 @@ feature_names = ohe.get_feature_names(input_features=categorical_columns) feature_names = np.r_[feature_names, numerical_columns] -tree_feature_importances = pd.Series( - rf.named_steps['classifier'].feature_importances_, - index=feature_names) -# sort the Series for the plotting -tree_feature_importances = tree_feature_importances.sort_values() +tree_feature_importances = pd.DataFrame( + rf.named_steps['classifier'].importances_.importances.T, + columns=feature_names) +# sort (reorder columns) the DataFrame for the plotting +tree_feature_importances = tree_feature_importances.reindex( + tree_feature_importances.mean().sort_values().index, + axis="columns") -ax = tree_feature_importances.plot.barh() +ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Random Forest Feature Importances (MDI)") -_ = ax.set_xlabel("Mean impurity decrease") -plt.show() +_ = ax.set_xlabel("Impurity decrease") # %% # Alternative to MDI using Feature Permutation Importance @@ -172,18 +173,20 @@ # %% # Once the forest has been train, the permutation importances have been # estimated internally on the OOB samples. Thus, the fitted attribute -# `feature_importances_` is now displaying the mean score decrease among all +# `importances_` is now displaying the score decrease among all # trees of the forest for each feature. 
Thus, we can plot this feature # importances and compared it with the MDI estimates. -tree_feature_importances = pd.Series( - rf.named_steps['classifier'].feature_importances_, - index=feature_names) -# sort the Series for the plotting -tree_feature_importances = tree_feature_importances.sort_values() +tree_feature_importances = pd.DataFrame( + rf.named_steps['classifier'].importances_.importances.T, + columns=feature_names) +# sort (reorder columns) the DataFrame for the plotting +tree_feature_importances = tree_feature_importances.reindex( + tree_feature_importances.mean().sort_values().index, + axis="columns") -ax = tree_feature_importances.plot.barh() +ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Random Forest Feature Importances (OOB Permutation)") -_ = ax.set_xlabel("Mean accuracy decrease") +_ = ax.set_xlabel("Accuracy decrease") # %% # With this strategy, the low cardinality categorical feature, ``sex`` is the From d1b0208bf11e552be16b3b05f872035d475cc268 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Jan 2021 00:10:21 +0100 Subject: [PATCH 38/53] EXA solve issue cutted ylabel --- examples/inspection/plot_permutation_importance.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 8729400ea8c78..5605456ddae7d 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -142,7 +142,9 @@ ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Random Forest Feature Importances (MDI)") -_ = ax.set_xlabel("Impurity decrease") +ax.set_xlabel("Impurity decrease") +plt.tight_layout() +plt.show() # %% # Alternative to MDI using Feature Permutation Importance @@ -186,7 +188,9 @@ ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Random Forest Feature Importances (OOB Permutation)") -_ = 
ax.set_xlabel("Accuracy decrease") +ax.set_xlabel("Accuracy decrease") +plt.tight_layout() +plt.show() # %% # With this strategy, the low cardinality categorical feature, ``sex`` is the @@ -216,7 +220,9 @@ ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Permutation Importances (test set)") -_ = ax.set_xlabel("Accuracy decrease") +ax.set_xlabel("Accuracy decrease") +plt.tight_layout() +plt.show() # %% # As with the permutation importance using the OOB samples, the low cardinality @@ -241,6 +247,7 @@ ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Permutation Importances (train set)") ax.set_xlabel("Accuracy decrease") +plt.tight_layout() plt.show() # %% From 20ecb398d0e4c0b8825090b4f83761ada693f98f Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Jan 2021 14:43:43 +0100 Subject: [PATCH 39/53] DOC add support for sample_weight in OOB score --- doc/whats_new/v1.0.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 0412b499ffa6d..7ba2207fcf6cf 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -83,6 +83,14 @@ Changelog the importances across all trees of the forest. :pr:`18603` by :user:`Robert Robison `. +- |Fix| OOB score reported in + :class:`ensemble.RandomForestClassifier`, + :class:`ensemble.RandomForestRegressor`, + :class:`ensemble.ExtraTreesClassifier`, and + :class:`ensemble.ExtraTreesRegressor` is taking into account `sample_weight` + while ignoring it previously. + :pr:`18603` by :user:`Robert Robison `. + :mod:`sklearn.feature_extraction` ................................. 
From 9102d5f3a66a04a9c6a8d542ac46a29816907042 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Jan 2021 14:45:20 +0100 Subject: [PATCH 40/53] style code --- examples/inspection/plot_permutation_importance.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 5605456ddae7d..0fbbdce89fc49 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -258,9 +258,9 @@ # # While they are similar, it should be noted that the variations of the # importances is estimated differently: the variance of the decrease of the -# score is estimated across the number of trees (i.e.``n_estimators`` -# parameter) in the forest while it is estimated via the number of repeated -# permutation (i.e. `n_repeats`) in the other strategy. +# score is estimated across the number of trees (i.e. `n_estimators` parameter) +# in the forest while it is estimated via the number of repeated permutation +# (i.e. `n_repeats`) in the other strategy. # # Therefore, using the permutation on the OOB samples could be interesting # when a limited amount of data is at hand. 
Also, it might provide a faster way From 620e64353ae05563345c2f66578b1723189313b3 Mon Sep 17 00:00:00 2001 From: Guillaume Lemaitre Date: Sat, 30 Jan 2021 15:01:36 +0100 Subject: [PATCH 41/53] DOC add a note regarding correlated feature --- .../inspection/plot_permutation_importance.py | 51 +++++++++++++------ 1 file changed, 36 insertions(+), 15 deletions(-) diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 0fbbdce89fc49..6173f74067e4d 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -113,18 +113,11 @@ # %% # Tree's Feature Importance from Mean Decrease in Impurity (MDI) # -------------------------------------------------------------- -# The impurity-based feature importance ranks the numerical features to be the -# most important features. As a result, the non-predictive ``random_num`` -# variable is ranked the most important! -# -# This problem stems from two limitations of impurity-based feature -# importances: -# -# - impurity-based importances are biased towards high cardinality features; -# - impurity-based importances are computed on training set statistics and -# therefore do not reflect the ability of feature to be useful to make -# predictions that generalize to the test set (when the model has enough -# capacity). +# We plot the feature importances computed across all trees of the forest. +# We use a box plot representation to show the information. The mean is in the +# box plot would corresponds to the value reported by the fitted attribute +# `feature_importances_`. The variance is computed by taking the standard +# deviation of the feature importances across trees. import pandas as pd ohe = (rf.named_steps['preprocess'] @@ -147,6 +140,30 @@ plt.show() # %% +# The impurity-based feature importance ranks the numerical features to be the +# most important features. 
As a result, the non-predictive ``random_num`` +# variable is ranked the most important! +# +# This problem stems from two limitations of impurity-based feature +# importances: +# +# - impurity-based importances are biased towards high cardinality features; +# - impurity-based importances are computed on training set statistics and +# therefore do not reflect the ability of feature to be useful to make +# predictions that generalize to the test set (when the model has enough +# capacity). +# +# Another issue that we can observe is linked to correlated features. For +# instance, `sex` feature has been one-hot encoded. Therefore the categories +# `sex_female` and `sex_male` are anti-correlated. Looking at the feature +# importances of these two features, we see that the reported feature +# importances have a large standard deviation. Indeed, a tree could use either +# variable to make a split because both variables carry the exact same +# information. Thus, one tree could use the feature `sex_male` and ignore +# `sex_female` to create a split while another tree could make the opposite +# choice. We will see that the permutation feature importance does not solve +# this issue. +# # Alternative to MDI using Feature Permutation Importance # ------------------------------------------------------- # The limitations of MDI pointed out in the previous section can be bypassed @@ -193,9 +210,13 @@ plt.show() # %% -# With this strategy, the low cardinality categorical feature, ``sex`` is the -# most important. It gives both random features low importance, confirming that -# it avoids the limitations of MDI feature importances. +# With this strategy, the low cardinality categorical feature, `sex_male` or +# `sex_female` are the most important. It gives both random features low +# importance, confirming that it avoids the limitations of MDI feature +# importances. 
+# +# However, we still observe that these two anti-correlated features are +# suffering from a high standard deviation. # # Feature Permutation Importance on train-test sets # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 39b40bb5b5169920efe4274cfb2b4b3401aee0b6 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 2 Mar 2021 21:34:24 -0500 Subject: [PATCH 42/53] Attempting to resolve merge in examples --- examples/ensemble/plot_forest_importances.py | 95 +++++++++++--------- 1 file changed, 54 insertions(+), 41 deletions(-) diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index 0751229205034..e7b10f6a05723 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -1,8 +1,9 @@ """ -======================================== -Feature importances with forest of trees -======================================== -This example shows the use of forest of trees to evaluate the importance of +========================================== +Feature importances with a forest of trees +========================================== + +This example shows the use of a forest of trees to evaluate the importance of features on an artificial classification task. We show two strategies to estimate the feature importances: (i) the impurity-based feature importances and (ii) the permutation feature @@ -21,66 +22,76 @@ # explicitely not shuffle the dataset to ensure that the informative features # correspond to the three first columns of `X`. 
from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split X, y = make_classification( - n_samples=1000, n_features=10, n_informative=3, - n_redundant=0, n_repeated=0, n_classes=2, - random_state=0, shuffle=False) -feature_names = [f"#{i + 1}" for i in range(X.shape[1])] + n_samples=1000, n_features=10, n_informative=3, n_redundant=0, + n_repeated=0, n_classes=2, random_state=0, shuffle=False) +X_train, X_test, y_train, y_test = train_test_split( + X, y, stratify=y, random_state=42) # %% -# Impurity-based feature importances -# ---------------------------------- -# We start by fitting a random-forest. We explicitely request to compute the -# impurity-based feature importance. Note that this is the default value. -import pandas as pd -import time +# A random forest classifier will be fitted to compute the feature importances. from sklearn.ensemble import RandomForestClassifier -forest = RandomForestClassifier( - n_estimators=250, feature_importances="impurity", random_state=0) +feature_names = [f'feature {i}' for i in range(X.shape[1])] +forest = RandomForestClassifier(random_state=0) +forest.fit(X_train, y_train) + +# %% +# Feature importance based on mean decrease in impurity +# ----------------------------------------------------- +# Feature importances are provided by the fitted attribute +# `feature_importances_` and they are computed as the mean and standard +# deviation of accumulation of the impurity decrease within each tree. +# +# .. warning:: +# Impurity-based feature importances can be misleading for high cardinality +# features (many unique values). See :ref:`permutation_importance` as +# an alternative below. 
+import time +import numpy as np start_time = time.time() -forest.fit(X, y) -forest_importances = pd.Series( - forest.feature_importances_, index=feature_names) +importances = forest.feature_importances_ +std = np.std([ + tree.feature_importances_ for tree in forest.estimators_], axis=0) elapsed_time = time.time() - start_time -print(f"Elapsed time to fit and compute the importances: " + +print(f"Elapsed time to compute the importances: " f"{elapsed_time:.3f} seconds") # %% -# Impurity-based feature importances is relatively fast to compute. It only -# requires to fit the forest and store information regarding the different -# splits of trees. When the feature importances is requested, the splits of -# all trees are introspected to compute the mean decrease in impurity (MDI). -# -# Let's plot the feature importances ranking. -ax = forest_importances.plot.bar(yerr=forest.importances_.importances_std) +# Let's plot the impurity-based importance. +import pandas as pd +forest_importances = pd.Series(importances, index=feature_names) + +fig, ax = plt.subplots() +forest_importances.plot.bar(yerr=std, ax=ax) ax.set_title("Feature importances using MDI") -_ = ax.set_ylabel("Mean impurity decrease") +ax.set_ylabel("Mean decrease in impurity") +fig.tight_layout() # %% -# We observe that the three important features are reported correctly. We also -# observe that non-informative features do not have a null importance. Indeed, -# theses features have been used by some of the trees that tend to overfit on -# some noisy samples. +# We observe that, as expected, the three first features are found important. # # Feature permutation importances on OOB samples # ---------------------------------------------- # We will an alternative to the impurity-based feature importances based on # feature permutation using the OOB samples. We fit a new random-forest where # we explicitely specify to compute the permutation feature importances on OOB. 
-forest = RandomForestClassifier( - n_estimators=250, feature_importances="permutation_oob", random_state=0) - +feature_names = [f'feature {i}' for i in range(X.shape[1])] +forest = RandomForestClassifier(feature_importances="permutation_oob", + random_state=0) start_time = time.time() -forest.fit(X, y) -forest_importances = pd.Series( - forest.feature_importances_, index=feature_names) +forest.fit(X_train, y_train) elapsed_time = time.time() - start_time -print(f"Elapsed time to fit and compute the importances: " + +print(f"Elapsed time to compute the importances: " f"{elapsed_time:.3f} seconds") +forest_importances = pd.Series(forest.feature_importances_, index=feature_names) + # %% # The permutation importances is more computationally costly. Indeed, it # requires to fit the tree and to make additional processing: each tree will @@ -89,9 +100,11 @@ # between of the two forests. # # We now plot the feature importance ranking. -ax = forest_importances.plot.bar(yerr=forest.importances_.importances_std) -ax.set_title("Feature importances using permutation on OOB") +fig, ax = plt.subplots() +forest_importances.plot.bar(yerr=forest.importances_.importances_std, ax=ax) +ax.set_title("Feature importances using permutation on full model") ax.set_ylabel("Mean accuracy decrease") +fig.tight_layout() plt.show() # %% From 83e61dc911fef335905fba1d9fc399c19b979605 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 2 Mar 2021 21:42:52 -0500 Subject: [PATCH 43/53] formatting --- examples/ensemble/plot_forest_importances.py | 3 ++- sklearn/ensemble/tests/test_forest.py | 3 --- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index e7b10f6a05723..526c62b62bbce 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -90,7 +90,8 @@ print(f"Elapsed time to compute the importances: " f"{elapsed_time:.3f} seconds") 
-forest_importances = pd.Series(forest.feature_importances_, index=feature_names) +forest_importances = pd.Series(forest.feature_importances_, + index=feature_names) # %% # The permutation importances is more computationally costly. Indeed, it diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 753478964df64..0a41ce5f54043 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -31,9 +31,6 @@ from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_array_less -from sklearn.utils._testing import assert_raises -from sklearn.utils._testing import assert_warns -from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import _convert_container from sklearn.utils._testing import ignore_warnings from sklearn.utils._testing import skip_if_no_parallel From 0271b51e60bb08392f01cd5a80ff1fcd0e39662a Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 2 Mar 2021 21:45:58 -0500 Subject: [PATCH 44/53] formatting --- examples/ensemble/plot_forest_importances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index 526c62b62bbce..3ecdc6d6dadcc 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -90,7 +90,7 @@ print(f"Elapsed time to compute the importances: " f"{elapsed_time:.3f} seconds") -forest_importances = pd.Series(forest.feature_importances_, +forest_importances = pd.Series(forest.feature_importances_, index=feature_names) # %% From 3289a241281b660653b47ef8efccb7ac3bb1533a Mon Sep 17 00:00:00 2001 From: Robert Robison <69172120+robert-robison@users.noreply.github.com> Date: Fri, 18 Jun 2021 23:18:05 -0400 Subject: [PATCH 45/53] Apply suggestions from code review Co-authored-by: Julien 
Jerphanion --- doc/modules/ensemble.rst | 15 +++++--- doc/whats_new/v1.0.rst | 4 +-- examples/ensemble/plot_forest_importances.py | 15 ++++---- .../inspection/plot_permutation_importance.py | 36 ++++++++++--------- 4 files changed, 41 insertions(+), 29 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 5c621aa2b8be7..77390f1a1dc13 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -298,15 +298,15 @@ the default values. - Firstly, they are computed on statistics derived from the training dataset and therefore **do not necessarily inform us on which features are - most important to make good predictions on held-out dataset**. + most important to make good predictions on held-out dataset**. [Strobl07]_ - Secondly, they favor **high cardinality features**, that is features with - many unique values. + many unique values. [White94]_ Features importances estimated through feature permutation is an alternative that does not suffer from these flaws. We give more details regarding this alternative in the next section. -Feature permutation importances on out-of-bag (OOB) samples +Permutation feature importances on out-of-bag (OOB) samples ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ An alternative to MDI is the feature importances that uses feature permutation. @@ -351,7 +351,14 @@ MDI and the permutation feature importances are explored in: * :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py` .. topic:: References - + .. [Strobl07] `Strobl, C., Boulesteix, AL., Zeileis, A. et al. + Bias in random forest variable importance measures: Illustrations, + sources and a solution. BMC Bioinformatics 8, 25 (2007). + `_ + .. [White94] `White, A.P., Liu, W.Z. Technical Note: + Bias in Information-Based Measures in Decision Tree Induction. + Machine Learning 15, 321–329 (1994). + `_ .. [L2014] G. Louppe, "Understanding Random Forests: From Theory to Practice", PhD Thesis, U. of Liege, 2014. 
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index a592b79056d5c..d1de0509283d6 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -130,7 +130,7 @@ Changelog target. Additional private refactoring was performed. :pr:`19162` by :user:`Guillaume Lemaitre `. -- |Feature| Implement out-of-bag feature permutation importances by setting +- |Feature| Implement out-of-bag permutation feature importances by setting the parameter `feature_importances="permutation_oob"` in :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, @@ -144,7 +144,7 @@ Changelog the importances across all trees of the forest. :pr:`18603` by :user:`Robert Robison `. -- |Fix| OOB score reported in +- |Enhancement| OOB score reported in :class:`ensemble.RandomForestClassifier`, :class:`ensemble.RandomForestRegressor`, :class:`ensemble.ExtraTreesClassifier`, and diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index 3ecdc6d6dadcc..1b1c68fe053f3 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -5,6 +5,7 @@ This example shows the use of a forest of trees to evaluate the importance of features on an artificial classification task. + We show two strategies to estimate the feature importances: (i) the impurity-based feature importances and (ii) the permutation feature importances on out-of-bag (OOB) samples. @@ -109,11 +110,13 @@ plt.show() # %% -# As in the impurity-based, the three most important features are detected. +# As in the impurity-based case, the three most important features are detected. # We see that non-important features have a mean decrease accuracy of zeros. -# Indeed, permuted these features did not have an impact on the score. +# Hence, permuting these features did not have an impact on the score. +# # Another difference between the two feature importances is the scale of the -# reported values. 
The permutation importances corresponds to a difference of -# scores and it is not further normalized. With impurity-based feature -# importances reported are normalized: they sum of the importances across -# features will sum to 1. +# reported values: +# - the permutation feature importances are not normalized and simply +# correspond to a difference of scores; +# - the impurity-based feature importances reported are normalized so that +# they sum to 1. diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 6173f74067e4d..fa9c8fbc877f4 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -114,10 +114,9 @@ # Tree's Feature Importance from Mean Decrease in Impurity (MDI) # -------------------------------------------------------------- # We plot the feature importances computed across all trees of the forest. -# We use a box plot representation to show the information. The mean is in the -# box plot would corresponds to the value reported by the fitted attribute -# `feature_importances_`. The variance is computed by taking the standard -# deviation of the feature importances across trees. +# We use a box plot. The mean corresponds to the value reported by the fitted +# attribute `feature_importances_`. The variance is computed on the feature +# importances across trees. import pandas as pd ohe = (rf.named_steps['preprocess'] @@ -166,12 +165,13 @@ # # Alternative to MDI using Feature Permutation Importance # ------------------------------------------------------- -# The limitations of MDI pointed out in the previous section can be bypassed -# using an alternative strategy to estimate the feature importances. This -# strategy relies on monitoring the decrease (or not) of a given performance -# metric by randomly permutting the value of a given feature.
In short, a +# The limitations of MDI pointed out in the previous section can be mitigated +# using an alternative strategy: Feature Permutation Importance. +# +# This strategy relies on monitoring the decrease of a given performance +# metric by randomly permuting the value of a given feature. In short, a # predictive feature will negatively impact the score when it is randomly -# permuted while a non-predictive feature will not change the score. +# permuted whilst a non-predictive feature will not have its score changed. # # This feature permutation importance estimate can be computed in two different # way: (i) by using the out-of-bag (OOB) samples in the ensemble to perform the @@ -190,11 +190,13 @@ ]).fit(X_train, y_train) # %% -# Once the forest has been train, the permutation importances have been -# estimated internally on the OOB samples. Thus, the fitted attribute -# `importances_` is now displaying the score decrease among all -# trees of the forest for each feature. Thus, we can plot this feature -# importances and compared it with the MDI estimates. +# Once the forest has been trained, the permutation importances will be +# estimated internally on the OOB samples. In this case, the fitted +# attribute `importances_` is now gathering the score decreases +# among all trees of the forest for each feature. +# +# Thus, we can plot those feature importances and compared them with +# the MDI estimates. tree_feature_importances = pd.DataFrame( rf.named_steps['classifier'].importances_.importances.T, columns=feature_names) @@ -216,13 +218,13 @@ # importances. # # However, we still observe that these two anti-correlated features are -# suffering from a high standard deviation. +# prone to a high standard deviation. # # Feature Permutation Importance on train-test sets # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # In the previous section, we show how one can leverage the OOB samples to # compute the permutation importance. 
However, this is also possible to use -# the same strategy but manipulating a train and a test sets. +# the same strategy but using a train and a test sets. # # We illustrate such strategy by using the function # :func:`~sklearn.inspection.permutation_importance`. Note that this way of @@ -274,7 +276,7 @@ # %% # Final words # ----------- -# As presented, the feature permutation importances can be computed either +# As presented, the permutation feature importances can be computed either # on the OOB samples or on separated datasets. # # While they are similar, it should be noted that the variations of the From 48c7c7c905b0d03d7aedc3a55f42fa97dce50fc9 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Sat, 19 Jun 2021 21:49:18 -0400 Subject: [PATCH 46/53] Apply additional suggestions from code review --- doc/modules/ensemble.rst | 26 +++++++++++-------- .../inspection/plot_permutation_importance.py | 24 +++++++++-------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 77390f1a1dc13..1f68870b212b4 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -309,14 +309,18 @@ the default values. Permutation feature importances on out-of-bag (OOB) samples ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -An alternative to MDI is the feature importances that uses feature permutation. +An alternative to MDI is the feature importances that uses feature permutation, +referred to as **permutation feature importances**. + Each tree in the ensemble can be evaluated using the out-of-bag samples [B2001]_. To know the importance of a feature, one can compute the difference between the tree score with the original OOB sample and an OOB sample for which -the feature of interest will be permuted. When a feature is predictive, one -expects the score to decrease. If instead the score remains unchanged, the -feature is not important at predictive the target. 
Thus, the feature -importances corresponds to the average of the decrease of the tree score. +the feature of interest will be permuted. Thus, the permutation feature importance +corresponds to the average of the decrease of the tree score. + +When a feature has significant predictive power, one expects the score to +decrease. If instead the score remains unchanged, the feature is not important +for predicting the target. This strategy can be selected by setting `feature_importances="permutation_oob"`. @@ -351,13 +355,13 @@ MDI and the permutation feature importances are explored in: * :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py` .. topic:: References - .. [Strobl07] `Strobl, C., Boulesteix, AL., Zeileis, A. et al. - Bias in random forest variable importance measures: Illustrations, - sources and a solution. BMC Bioinformatics 8, 25 (2007). + .. [Strobl07] `Strobl, C., Boulesteix, AL., Zeileis, A. et al. + Bias in random forest variable importance measures: Illustrations, + sources and a solution. BMC Bioinformatics 8, 25 (2007). `_ - .. [White94] `White, A.P., Liu, W.Z. Technical Note: - Bias in Information-Based Measures in Decision Tree Induction. - Machine Learning 15, 321–329 (1994). + .. [White94] `White, A.P., Liu, W.Z. Technical Note: + Bias in Information-Based Measures in Decision Tree Induction. + Machine Learning 15, 321–329 (1994). `_ .. [L2014] G. Louppe, "Understanding Random Forests: From Theory to Practice", diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index fa9c8fbc877f4..a2e4af3b0d727 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -160,25 +160,27 @@ # variables to make a split because both variable carry the exact same # information. 
Thus, one tree could use the feature `sex_male` and ignore # `sex_female` to create split while another tree could could make the opposite -# choice. We will see that the permutation feature importance does not solve -# this issue. +# choice. # -# Alternative to MDI using Feature Permutation Importance +# We will see that the permutation feature importance does not alleviate this +# issue. +# +# Alternative to MDI using Permutation Feature Importance # ------------------------------------------------------- # The limitations of MDI pointed out in the previous section can be mitigated -# using an alternative strategy: Feature Permutation Importance. -# +# using an alternative strategy: Feature Permutation Importance. +# # This strategy relies on monitoring the decrease of a given performance # metric by randomly permuting the value of a given feature. In short, a # predictive feature will negatively impact the score when it is randomly # permuted whilst a non-predictive feature will not have its score changed. # -# This feature permutation importance estimate can be computed in two different +# This permutation feature importance estimate can be computed in two different # way: (i) by using the out-of-bag (OOB) samples in the ensemble to perform the # permutation and the scoring or (ii) by manually splitting and handling a # train and test set where the latter will be used with permutations. # -# Feature Permutation Importance on Out-Of-Bag (OOB) samples +# Permutation Feature Importance on Out-Of-Bag (OOB) samples # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # Random-forest exposes a parameter `feature_importances` that allows to switch # from the MDI to the permutation importance on the OOB samples. The parameter @@ -194,8 +196,8 @@ # estimated internally on the OOB samples. In this case, the fitted # attribute `importances_` is now gathering the score decreases # among all trees of the forest for each feature. 
-# -# Thus, we can plot those feature importances and compared them with +# +# Thus, we can plot those feature importances and compared them with # the MDI estimates. tree_feature_importances = pd.DataFrame( rf.named_steps['classifier'].importances_.importances.T, @@ -279,8 +281,8 @@ # As presented, the permutation feature importances can be computed either # on the OOB samples or on separated datasets. # -# While they are similar, it should be noted that the variations of the -# importances is estimated differently: the variance of the decrease of the +# While those two cases are similar, it should be noted that the variations of the +# importances are estimated differently: the variance of the decrease of the # score is estimated across the number of trees (i.e. `n_estimators` parameter) # in the forest while it is estimated via the number of repeated permutation # (i.e. `n_repeats`) in the other strategy. From 0225547109d86ea5bb46bc3640d0595eeefca89e Mon Sep 17 00:00:00 2001 From: robert-robison Date: Mon, 21 Jun 2021 21:11:38 -0400 Subject: [PATCH 47/53] Apply suggestions from code review --- examples/ensemble/plot_forest_importances.py | 23 +++++++++++-------- .../inspection/plot_permutation_importance.py | 7 +++--- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index 1b1c68fe053f3..d852fcceecb04 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -36,11 +36,13 @@ from sklearn.ensemble import RandomForestClassifier feature_names = [f'feature {i}' for i in range(X.shape[1])] -forest = RandomForestClassifier(random_state=0) -forest.fit(X_train, y_train) + +# MDI-based feature importance ("impurity") is the default +forest = RandomForestClassifier(feature_importances="impurity", + random_state=0) # %% -# Feature importance based on mean decrease in impurity +# Feature importance based on Mean 
Decrease in Impurity (MDI) # ----------------------------------------------------- # Feature importances are provided by the fitted attribute # `feature_importances_` and they are computed as the mean and standard @@ -54,12 +56,13 @@ import numpy as np start_time = time.time() +forest.fit(X_train, y_train) importances = forest.feature_importances_ std = np.std([ tree.feature_importances_ for tree in forest.estimators_], axis=0) elapsed_time = time.time() - start_time -print(f"Elapsed time to compute the importances: " +print(f"Elapsed time to train and compute the importances: " f"{elapsed_time:.3f} seconds") # %% @@ -76,7 +79,7 @@ # %% # We observe that, as expected, the three first features are found important. # -# Feature permutation importances on OOB samples +# Permutation Feature Importances on OOB samples # ---------------------------------------------- # We will an alternative to the impurity-based feature importances based on # feature permutation using the OOB samples. We fit a new random-forest where @@ -88,14 +91,14 @@ forest.fit(X_train, y_train) elapsed_time = time.time() - start_time -print(f"Elapsed time to compute the importances: " +print(f"Elapsed time to train and compute the importances: " f"{elapsed_time:.3f} seconds") forest_importances = pd.Series(forest.feature_importances_, index=feature_names) # %% -# The permutation importances is more computationally costly. Indeed, it +# The permutation importance is more computationally costly. Indeed, it # requires to fit the tree and to make additional processing: each tree will # be evaluated on its OOB sample as well as an OOB sample where features will # be permuted. 
This step is costly and explains the time fitting difference @@ -116,7 +119,7 @@ # # Another difference between the two feature importances is the scale of the # reported values: -# - the permutation feature importances are not normalized and simply -# correspond to a difference of scores; +# - the permutation feature importances are not normalized and simply +# correspond to a difference of scores; # - the impurity-based feature importances reported are normalized so that -# they sum to 1. +# they sum to 1. diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index a2e4af3b0d727..6f26bbdf582f5 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -81,7 +81,8 @@ rf = Pipeline([ ('preprocess', preprocessing), - ('classifier', RandomForestClassifier(random_state=42)) + ('classifier', RandomForestClassifier(feature_importances="impurity", + random_state=42)) ]) rf.fit(X_train, y_train) @@ -168,7 +169,7 @@ # Alternative to MDI using Permutation Feature Importance # ------------------------------------------------------- # The limitations of MDI pointed out in the previous section can be mitigated -# using an alternative strategy: Feature Permutation Importance. +# using an alternative strategy: Permutation Feature Importance. # # This strategy relies on monitoring the decrease of a given performance # metric by randomly permuting the value of a given feature. In short, a @@ -222,7 +223,7 @@ # However, we still observe that these two anti-correlated features are # prone to a high standard deviation. # -# Feature Permutation Importance on train-test sets +# Permutation Feature Importance on train-test sets # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # In the previous section, we show how one can leverage the OOB samples to # compute the permutation importance. 
However, this is also possible to use From a47784f3a2baf2161fefb2546127ad3369c87db5 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 17 Jun 2021 15:48:26 -0400 Subject: [PATCH 48/53] MAINT Adds target_version to black config (#20293) --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 613d53e25d295..b312612236080 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ requires = [ [tool.black] line-length = 88 +target_version = ['py37', 'py38', 'py39'] exclude = ''' /( \.eggs # exclude a few common directories in the From 3695ef2f61b2527c057ce2fe603b2c81178ec1fe Mon Sep 17 00:00:00 2001 From: robert-robison Date: Mon, 21 Jun 2021 21:26:41 -0400 Subject: [PATCH 49/53] black formatted changes --- examples/ensemble/plot_forest_importances.py | 41 +- .../inspection/plot_permutation_importance.py | 99 +- sklearn/ensemble/_forest.py | 701 +++++++++------ sklearn/ensemble/tests/test_forest.py | 845 ++++++++++-------- 4 files changed, 975 insertions(+), 711 deletions(-) diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index d852fcceecb04..cd09008bfd0af 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -26,20 +26,25 @@ from sklearn.model_selection import train_test_split X, y = make_classification( - n_samples=1000, n_features=10, n_informative=3, n_redundant=0, - n_repeated=0, n_classes=2, random_state=0, shuffle=False) -X_train, X_test, y_train, y_test = train_test_split( - X, y, stratify=y, random_state=42) + n_samples=1000, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + n_classes=2, + random_state=0, + shuffle=False, +) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) # %% # A random forest classifier will be fitted to compute the feature importances. 
from sklearn.ensemble import RandomForestClassifier -feature_names = [f'feature {i}' for i in range(X.shape[1])] +feature_names = [f"feature {i}" for i in range(X.shape[1])] # MDI-based feature importance ("impurity") is the default -forest = RandomForestClassifier(feature_importances="impurity", - random_state=0) +forest = RandomForestClassifier(feature_importances="impurity", random_state=0) # %% # Feature importance based on Mean Decrease in Impurity (MDI) @@ -58,16 +63,17 @@ start_time = time.time() forest.fit(X_train, y_train) importances = forest.feature_importances_ -std = np.std([ - tree.feature_importances_ for tree in forest.estimators_], axis=0) +std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0) elapsed_time = time.time() - start_time -print(f"Elapsed time to train and compute the importances: " - f"{elapsed_time:.3f} seconds") +print( + f"Elapsed time to train and compute the importances: " f"{elapsed_time:.3f} seconds" +) # %% # Let's plot the impurity-based importance. import pandas as pd + forest_importances = pd.Series(importances, index=feature_names) fig, ax = plt.subplots() @@ -84,18 +90,17 @@ # We will an alternative to the impurity-based feature importances based on # feature permutation using the OOB samples. We fit a new random-forest where # we explicitely specify to compute the permutation feature importances on OOB. 
-feature_names = [f'feature {i}' for i in range(X.shape[1])] -forest = RandomForestClassifier(feature_importances="permutation_oob", - random_state=0) +feature_names = [f"feature {i}" for i in range(X.shape[1])] +forest = RandomForestClassifier(feature_importances="permutation_oob", random_state=0) start_time = time.time() forest.fit(X_train, y_train) elapsed_time = time.time() - start_time -print(f"Elapsed time to train and compute the importances: " - f"{elapsed_time:.3f} seconds") +print( + f"Elapsed time to train and compute the importances: " f"{elapsed_time:.3f} seconds" +) -forest_importances = pd.Series(forest.feature_importances_, - index=feature_names) +forest_importances = pd.Series(forest.feature_importances_, index=feature_names) # %% # The permutation importance is more computationally costly. Indeed, it diff --git a/examples/inspection/plot_permutation_importance.py b/examples/inspection/plot_permutation_importance.py index 6f26bbdf582f5..74e6e44aa2369 100644 --- a/examples/inspection/plot_permutation_importance.py +++ b/examples/inspection/plot_permutation_importance.py @@ -28,6 +28,7 @@ print(__doc__) import matplotlib.pyplot as plt import sklearn + sklearn.set_config(display="diagram") # %% @@ -48,18 +49,18 @@ X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) rng = np.random.RandomState(seed=42) -X['random_cat'] = rng.randint(3, size=X.shape[0]) -X['random_num'] = rng.randn(X.shape[0]) +X["random_cat"] = rng.randint(3, size=X.shape[0]) +X["random_num"] = rng.randn(X.shape[0]) # %% from sklearn.model_selection import train_test_split -categorical_columns = ['pclass', 'sex', 'embarked', 'random_cat'] -numerical_columns = ['age', 'sibsp', 'parch', 'fare', 'random_num'] + +categorical_columns = ["pclass", "sex", "embarked", "random_cat"] +numerical_columns = ["age", "sibsp", "parch", "fare", "random_num"] X = X[categorical_columns + numerical_columns] -X_train, X_test, y_train, y_test = train_test_split( - X, y, stratify=y, 
random_state=42) +X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42) # %% # The following shows how to apply separate preprocessing on numerical and @@ -70,20 +71,25 @@ from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder -categorical_encoder = OneHotEncoder(handle_unknown='ignore') -numerical_pipe = Pipeline([ - ('imputer', SimpleImputer(strategy='mean')) -]) +categorical_encoder = OneHotEncoder(handle_unknown="ignore") +numerical_pipe = Pipeline([("imputer", SimpleImputer(strategy="mean"))]) preprocessing = ColumnTransformer( - [('cat', categorical_encoder, categorical_columns), - ('num', numerical_pipe, numerical_columns)]) + [ + ("cat", categorical_encoder, categorical_columns), + ("num", numerical_pipe, numerical_columns), + ] +) -rf = Pipeline([ - ('preprocess', preprocessing), - ('classifier', RandomForestClassifier(feature_importances="impurity", - random_state=42)) -]) +rf = Pipeline( + [ + ("preprocess", preprocessing), + ( + "classifier", + RandomForestClassifier(feature_importances="impurity", random_state=42), + ), + ] +) rf.fit(X_train, y_train) # %% @@ -120,18 +126,17 @@ # importances across trees. 
import pandas as pd -ohe = (rf.named_steps['preprocess'] - .named_transformers_['cat']) +ohe = rf.named_steps["preprocess"].named_transformers_["cat"] feature_names = ohe.get_feature_names(input_features=categorical_columns) feature_names = np.r_[feature_names, numerical_columns] tree_feature_importances = pd.DataFrame( - rf.named_steps['classifier'].importances_.importances.T, - columns=feature_names) + rf.named_steps["classifier"].importances_.importances.T, columns=feature_names +) # sort (reorder columns) the DataFrame for the plotting tree_feature_importances = tree_feature_importances.reindex( - tree_feature_importances.mean().sort_values().index, - axis="columns") + tree_feature_importances.mean().sort_values().index, axis="columns" +) ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Random Forest Feature Importances (MDI)") @@ -186,11 +191,17 @@ # Random-forest exposes a parameter `feature_importances` that allows to switch # from the MDI to the permutation importance on the OOB samples. The parameter # need to be set to `"permutation_oob"`. -rf = Pipeline(steps=[ - ("preprocess", preprocessing), - ("classifier", RandomForestClassifier( - feature_importances="permutation_oob", random_state=42)) -]).fit(X_train, y_train) +rf = Pipeline( + steps=[ + ("preprocess", preprocessing), + ( + "classifier", + RandomForestClassifier( + feature_importances="permutation_oob", random_state=42 + ), + ), + ] +).fit(X_train, y_train) # %% # Once the forest has been trained, the permutation importances will be @@ -201,12 +212,12 @@ # Thus, we can plot those feature importances and compared them with # the MDI estimates. 
tree_feature_importances = pd.DataFrame( - rf.named_steps['classifier'].importances_.importances.T, - columns=feature_names) + rf.named_steps["classifier"].importances_.importances.T, columns=feature_names +) # sort (reorder columns) the DataFrame for the plotting tree_feature_importances = tree_feature_importances.reindex( - tree_feature_importances.mean().sort_values().index, - axis="columns") + tree_feature_importances.mean().sort_values().index, axis="columns" +) ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Random Forest Feature Importances (OOB Permutation)") @@ -235,14 +246,14 @@ # rely on the forest models. from sklearn.inspection import permutation_importance -result = permutation_importance(rf, X_test, y_test, n_repeats=10, - random_state=42, n_jobs=2) -tree_feature_importances = pd.DataFrame( - result.importances.T, columns=X_test.columns) +result = permutation_importance( + rf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=2 +) +tree_feature_importances = pd.DataFrame(result.importances.T, columns=X_test.columns) # sort (reorder columns) the DataFrame for the plotting tree_feature_importances = tree_feature_importances.reindex( - tree_feature_importances.mean().sort_values().index, - axis="columns") + tree_feature_importances.mean().sort_values().index, axis="columns" +) ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Permutation Importances (test set)") @@ -261,14 +272,14 @@ # plots is a confirmation that the RF model has enough capacity to use that # random numerical feature to overfit. You can further confirm this by # re-running this example with constrained RF with `min_samples_leaf=10`. 
-result = permutation_importance(rf, X_train, y_train, n_repeats=10, - random_state=42, n_jobs=2) -tree_feature_importances = pd.DataFrame( - result.importances.T, columns=X_test.columns) +result = permutation_importance( + rf, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2 +) +tree_feature_importances = pd.DataFrame(result.importances.T, columns=X_test.columns) # sort (reorder columns) the DataFrame for the plotting tree_feature_importances = tree_feature_importances.reindex( - tree_feature_importances.mean().sort_values().index, - axis="columns") + tree_feature_importances.mean().sort_values().index, axis="columns" +) ax = tree_feature_importances.plot.box(vert=False) ax.set_title("Permutation Importances (train set)") diff --git a/sklearn/ensemble/_forest.py b/sklearn/ensemble/_forest.py index f5fc1a9b6cbed..81a095f421534 100644 --- a/sklearn/ensemble/_forest.py +++ b/sklearn/ensemble/_forest.py @@ -55,8 +55,12 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..base import ClassifierMixin, RegressorMixin, MultiOutputMixin from ..metrics import accuracy_score, r2_score from ..preprocessing import OneHotEncoder -from ..tree import (DecisionTreeClassifier, DecisionTreeRegressor, - ExtraTreeClassifier, ExtraTreeRegressor) +from ..tree import ( + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) from ..tree._tree import DTYPE, DOUBLE from ..utils import check_random_state, check_array, compute_sample_weight from ..utils import Bunch, deprecated @@ -68,11 +72,13 @@ class calls the ``fit`` method of each sub-estimator on random samples from ..utils.validation import check_is_fitted, _check_sample_weight -__all__ = ["RandomForestClassifier", - "RandomForestRegressor", - "ExtraTreesClassifier", - "ExtraTreesRegressor", - "RandomTreesEmbedding"] +__all__ = [ + "RandomForestClassifier", + "RandomForestRegressor", + "ExtraTreesClassifier", + "ExtraTreesRegressor", + 
"RandomTreesEmbedding", +] MAX_INT = np.iinfo(np.int32).max @@ -132,8 +138,9 @@ def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap): bootstrap. This operation can be neglected in terms of computation time compared to other processes when it is used (e.g. scoring). """ - sample_indices = _generate_sample_indices(random_state, n_samples, - n_samples_bootstrap) + sample_indices = _generate_sample_indices( + random_state, n_samples, n_samples_bootstrap + ) sample_counts = np.bincount(sample_indices, minlength=n_samples) unsampled_mask = sample_counts == 0 indices_range = np.arange(n_samples) @@ -142,9 +149,18 @@ def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap): return unsampled_indices -def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, - verbose=0, class_weight=None, - n_samples_bootstrap=None): +def _parallel_build_trees( + tree, + forest, + X, + y, + sample_weight, + tree_idx, + n_trees, + verbose=0, + class_weight=None, + n_samples_bootstrap=None, +): """Private function used to fit a single tree in parallel.""" if verbose > 1: print("building tree %d of %d" % (tree_idx + 1, n_trees)) @@ -156,19 +172,18 @@ def _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, else: curr_sample_weight = sample_weight.copy() - indices = _generate_sample_indices(tree.random_state, n_samples, - n_samples_bootstrap) + indices = _generate_sample_indices( + tree.random_state, n_samples, n_samples_bootstrap + ) sample_counts = np.bincount(indices, minlength=n_samples) curr_sample_weight *= sample_counts - if class_weight == 'subsample': + if class_weight == "subsample": with catch_warnings(): - simplefilter('ignore', DeprecationWarning) - curr_sample_weight *= compute_sample_weight('auto', y, - indices=indices) - elif class_weight == 'balanced_subsample': - curr_sample_weight *= compute_sample_weight('balanced', y, - indices=indices) + simplefilter("ignore", DeprecationWarning) + 
curr_sample_weight *= compute_sample_weight("auto", y, indices=indices) + elif class_weight == "balanced_subsample": + curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices) tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False) else: @@ -220,24 +235,27 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, - base_estimator, - n_estimators=100, - *, - estimator_params=tuple(), - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - max_samples=None, - feature_importances="impurity"): + def __init__( + self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, - estimator_params=estimator_params) + estimator_params=estimator_params, + ) self.bootstrap = bootstrap self.oob_score = oob_score @@ -267,10 +285,11 @@ def apply(self, X): return the index of the leaf x ends up in. 
""" X = self._validate_X_predict(X) - results = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, - **_joblib_parallel_args(prefer="threads"))( - delayed(tree.apply)(X, check_input=False) - for tree in self.estimators_) + results = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + **_joblib_parallel_args(prefer="threads"), + )(delayed(tree.apply)(X, check_input=False) for tree in self.estimators_) return np.array(results).T @@ -300,10 +319,14 @@ def decision_path(self, X): """ X = self._validate_X_predict(X) - indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, - **_joblib_parallel_args(prefer='threads'))( + indicators = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + **_joblib_parallel_args(prefer="threads"), + )( delayed(tree.decision_path)(X, check_input=False) - for tree in self.estimators_) + for tree in self.estimators_ + ) n_nodes = [0] n_nodes.extend([i.shape[1] for i in indicators]) @@ -339,11 +362,10 @@ def fit(self, X, y, sample_weight=None): """ # Validate or convert input data if issparse(y): - raise ValueError( - "sparse multilabel-indicator for y is not supported." - ) - X, y = self._validate_data(X, y, multi_output=True, - accept_sparse="csc", dtype=DTYPE) + raise ValueError("sparse multilabel-indicator for y is not supported.") + X, y = self._validate_data( + X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE + ) if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X) @@ -354,10 +376,13 @@ def fit(self, X, y, sample_weight=None): y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: - warn("A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples,), for example using ravel().", - DataConversionWarning, stacklevel=2) + warn( + "A column-vector y was passed when a 1d array was" + " expected. 
Please change the shape of y to " + "(n_samples,), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs @@ -366,11 +391,15 @@ def fit(self, X, y, sample_weight=None): if self.criterion == "poisson": if np.any(y < 0): - raise ValueError("Some value(s) of y are negative which is " - "not allowed for Poisson regression.") + raise ValueError( + "Some value(s) of y are negative which is " + "not allowed for Poisson regression." + ) if np.sum(y) <= 0: - raise ValueError("Sum of y is not strictly positive which " - "is necessary for Poisson regression.") + raise ValueError( + "Sum of y is not strictly positive which " + "is necessary for Poisson regression." + ) self.n_outputs_ = y.shape[1] @@ -387,8 +416,7 @@ def fit(self, X, y, sample_weight=None): # Get bootstrap sample size n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples=X.shape[0], - max_samples=self.max_samples + n_samples=X.shape[0], max_samples=self.max_samples ) # Check parameters @@ -400,14 +428,14 @@ def fit(self, X, y, sample_weight=None): "Criterion 'mse' was deprecated in v1.0 and will be " "removed in version 1.2. Use `criterion='squared_error'` " "which is equivalent.", - FutureWarning + FutureWarning, ) elif self.criterion == "mae": warn( "Criterion 'mae' was deprecated in v1.0 and will be " "removed in version 1.2. 
Use `criterion='absolute_error'` " "which is equivalent.", - FutureWarning + FutureWarning, ) if self.feature_importances not in ("impurity", "permutation_oob"): @@ -417,13 +445,8 @@ def fit(self, X, y, sample_weight=None): ) if not self.bootstrap and self.oob_score: - raise ValueError( - "Out of bag estimation only available if bootstrap=True" - ) - if ( - not self.bootstrap - and self.feature_importances == "permutation_oob" - ): + raise ValueError("Out of bag estimation only available if bootstrap=True") + if not self.bootstrap and self.feature_importances == "permutation_oob": raise ValueError( "Estimating feature importance on out of bag samples only " "available if bootstrap=True" @@ -438,22 +461,27 @@ def fit(self, X, y, sample_weight=None): n_more_estimators = self.n_estimators - len(self.estimators_) if n_more_estimators < 0: - raise ValueError('n_estimators=%d must be larger or equal to ' - 'len(estimators_)=%d when warm_start==True' - % (self.n_estimators, len(self.estimators_))) + raise ValueError( + "n_estimators=%d must be larger or equal to " + "len(estimators_)=%d when warm_start==True" + % (self.n_estimators, len(self.estimators_)) + ) elif n_more_estimators == 0: - warn("Warm-start fitting without increasing n_estimators does not " - "fit new trees.") + warn( + "Warm-start fitting without increasing n_estimators does not " + "fit new trees." + ) else: if self.warm_start and len(self.estimators_) > 0: # We draw from the random state to get the random state we # would have got if we hadn't used a warm_start. 
random_state.randint(MAX_INT, size=len(self.estimators_)) - trees = [self._make_estimator(append=False, - random_state=random_state) - for i in range(n_more_estimators)] + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(n_more_estimators) + ] # Parallel loop: we prefer the threading backend as the Cython code # for fitting the trees is internally releasing the Python GIL @@ -461,13 +489,25 @@ def fit(self, X, y, sample_weight=None): # that case. However, for joblib 0.12+ we respect any # parallel_backend contexts set at a higher level, # since correctness does not rely on using threads. - trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, - **_joblib_parallel_args(prefer='threads'))( + trees = Parallel( + n_jobs=self.n_jobs, + verbose=self.verbose, + **_joblib_parallel_args(prefer="threads"), + )( delayed(_parallel_build_trees)( - t, self, X, y, sample_weight, i, len(trees), - verbose=self.verbose, class_weight=self.class_weight, - n_samples_bootstrap=n_samples_bootstrap) - for i, t in enumerate(trees)) + t, + self, + X, + y, + sample_weight, + i, + len(trees), + verbose=self.verbose, + class_weight=self.class_weight, + n_samples_bootstrap=n_samples_bootstrap, + ) + for i, t in enumerate(trees) + ) # Collect newly grown trees self.estimators_.extend(trees) @@ -516,16 +556,15 @@ def _compute_impurity_importances(self): """ parallel_args = { **_joblib_parallel_args(prefer="threads"), - "n_jobs": self.n_jobs + "n_jobs": self.n_jobs, } all_importances = Parallel(**parallel_args)( - delayed(getattr)(tree, 'feature_importances_') - for tree in self.estimators_ if tree.tree_.node_count > 1 + delayed(getattr)(tree, "feature_importances_") + for tree in self.estimators_ + if tree.tree_.node_count > 1 ) if not all_importances: - return np.zeros( - shape=(self.n_features_in_, 1), dtype=np.float64 - ) + return np.zeros(shape=(self.n_features_in_, 1), dtype=np.float64) return np.transpose(all_importances) @abstractmethod @@ 
-575,18 +614,20 @@ def _compute_oob_importances(self, X, y, sample_weight): with config_context(assume_finite=True): # avoid redundant checking performed on X in the permutation # importance function. - oob_importances = np.transpose(Parallel(n_jobs=self.n_jobs)( - delayed(_permutation_importances_oob)( - estimator, - X, - y, - sample_weight, - n_samples, - n_samples_bootstrap, - random_state, + oob_importances = np.transpose( + Parallel(n_jobs=self.n_jobs)( + delayed(_permutation_importances_oob)( + estimator, + X, + y, + sample_weight, + n_samples, + n_samples_bootstrap, + random_state, + ) + for estimator in self.estimators_ ) - for estimator in self.estimators_ - )) + ) return oob_importances @@ -607,8 +648,7 @@ def _compute_oob_predictions(self, X): (n_samples, 1, n_outputs) The OOB predictions. """ - X = self._validate_data(X, dtype=DTYPE, accept_sparse='csr', - reset=False) + X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) n_samples = X.shape[0] n_outputs = self.n_outputs_ @@ -627,11 +667,14 @@ def _compute_oob_predictions(self, X): n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64) n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples, self.max_samples, + n_samples, + self.max_samples, ) for idx, estimator in enumerate(self.estimators_): unsampled_indices = _generate_unsampled_indices( - estimator.random_state, n_samples, n_samples_bootstrap, + estimator.random_state, + n_samples, + n_samples_bootstrap, ) X_oob = X[unsampled_indices, :] @@ -644,7 +687,8 @@ def _compute_oob_predictions(self, X): warn( "Some inputs do not have OOB scores. 
This probably means " "too few trees were used to compute any reliable OOB " - "estimates.", UserWarning + "estimates.", + UserWarning, ) n_oob_pred[n_oob_pred == 0] = 1 oob_pred[..., k] /= n_oob_pred[..., [k]] @@ -738,20 +782,22 @@ class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, - base_estimator, - n_estimators=100, - *, - estimator_params=tuple(), - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - max_samples=None, - feature_importances="impurity"): + def __init__( + self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator, n_estimators=n_estimators, @@ -816,8 +862,9 @@ def _set_oob_score_and_attributes(self, X, y, sample_weight): self.oob_decision_function_ = oob_predictions self.oob_score_ = accuracy_score( - y, np.argmax(self.oob_decision_function_, axis=1), - sample_weight=sample_weight + y, + np.argmax(self.oob_decision_function_, axis=1), + sample_weight=sample_weight, ) def _validate_y_class_weight(self, y): @@ -834,40 +881,42 @@ def _validate_y_class_weight(self, y): y_store_unique_indices = np.zeros(y.shape, dtype=int) for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = \ - np.unique(y[:, k], return_inverse=True) + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_store_unique_indices if self.class_weight is not None: - valid_presets = ('balanced', 'balanced_subsample') + valid_presets = ("balanced", "balanced_subsample") if isinstance(self.class_weight, str): if self.class_weight not in valid_presets: - raise ValueError('Valid presets 
for class_weight include ' - '"balanced" and "balanced_subsample".' - 'Given "%s".' - % self.class_weight) + raise ValueError( + "Valid presets for class_weight include " + '"balanced" and "balanced_subsample".' + 'Given "%s".' % self.class_weight + ) if self.warm_start: - warn('class_weight presets "balanced" or ' - '"balanced_subsample" are ' - 'not recommended for warm_start if the fitted data ' - 'differs from the full dataset. In order to use ' - '"balanced" weights, use compute_class_weight ' - '("balanced", classes, y). In place of y you can use ' - 'a large enough sample of the full training set ' - 'target to properly estimate the class frequency ' - 'distributions. Pass the resulting weights as the ' - 'class_weight parameter.') - - if (self.class_weight != 'balanced_subsample' or - not self.bootstrap): + warn( + 'class_weight presets "balanced" or ' + '"balanced_subsample" are ' + "not recommended for warm_start if the fitted data " + "differs from the full dataset. In order to use " + '"balanced" weights, use compute_class_weight ' + '("balanced", classes, y). In place of y you can use ' + "a large enough sample of the full training set " + "target to properly estimate the class frequency " + "distributions. Pass the resulting weights as the " + "class_weight parameter." 
+ ) + + if self.class_weight != "balanced_subsample" or not self.bootstrap: if self.class_weight == "balanced_subsample": class_weight = "balanced" else: class_weight = self.class_weight - expanded_class_weight = compute_sample_weight(class_weight, - y_original) + expanded_class_weight = compute_sample_weight(class_weight, y_original) return y, expanded_class_weight @@ -901,13 +950,12 @@ def predict(self, X): n_samples = proba[0].shape[0] # all dtypes should be the same, so just take the first class_type = self.classes_[0].dtype - predictions = np.empty((n_samples, self.n_outputs_), - dtype=class_type) + predictions = np.empty((n_samples, self.n_outputs_), dtype=class_type) for k in range(self.n_outputs_): - predictions[:, k] = self.classes_[k].take(np.argmax(proba[k], - axis=1), - axis=0) + predictions[:, k] = self.classes_[k].take( + np.argmax(proba[k], axis=1), axis=0 + ) return predictions @@ -941,14 +989,19 @@ def predict_proba(self, X): n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) # avoid storing the output of every estimator by summing them here - all_proba = [np.zeros((X.shape[0], j), dtype=np.float64) - for j in np.atleast_1d(self.n_classes_)] + all_proba = [ + np.zeros((X.shape[0], j), dtype=np.float64) + for j in np.atleast_1d(self.n_classes_) + ] lock = threading.Lock() - Parallel(n_jobs=n_jobs, verbose=self.verbose, - **_joblib_parallel_args(require="sharedmem"))( - delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, - lock) - for e in self.estimators_) + Parallel( + n_jobs=n_jobs, + verbose=self.verbose, + **_joblib_parallel_args(require="sharedmem"), + )( + delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, lock) + for e in self.estimators_ + ) for proba in all_proba: proba /= len(self.estimators_) @@ -1000,19 +1053,21 @@ class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, - base_estimator, - n_estimators=100, - *, - estimator_params=tuple(), - 
bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - max_samples=None, - feature_importances="impurity"): + def __init__( + self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator, n_estimators=n_estimators, @@ -1061,10 +1116,14 @@ def predict(self, X): # Parallel loop lock = threading.Lock() - Parallel(n_jobs=n_jobs, verbose=self.verbose, - **_joblib_parallel_args(require="sharedmem"))( + Parallel( + n_jobs=n_jobs, + verbose=self.verbose, + **_joblib_parallel_args(require="sharedmem"), + )( delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock) - for e in self.estimators_) + for e in self.estimators_ + ) y_hat /= len(self.estimators_) @@ -1112,9 +1171,7 @@ def _set_oob_score_and_attributes(self, X, y, sample_weight): # drop the n_outputs axis if there is a single output oob_predictions = oob_predictions.squeeze(axis=-1) self.oob_prediction_ = oob_predictions - self.oob_score_ = r2_score( - y, self.oob_prediction_, sample_weight=sample_weight - ) + self.oob_score_ = r2_score(y, self.oob_prediction_, sample_weight=sample_weight) def _compute_partial_dependence_recursion(self, grid, target_features): """Fast partial dependence computation. @@ -1133,15 +1190,17 @@ def _compute_partial_dependence_recursion(self, grid, target_features): averaged_predictions : ndarray of shape (n_samples,) The value of the partial dependence function on each grid point. 
""" - grid = np.asarray(grid, dtype=DTYPE, order='C') - averaged_predictions = np.zeros(shape=grid.shape[0], - dtype=np.float64, order='C') + grid = np.asarray(grid, dtype=DTYPE, order="C") + averaged_predictions = np.zeros( + shape=grid.shape[0], dtype=np.float64, order="C" + ) for tree in self.estimators_: # Note: we don't sum in parallel because the GIL isn't released in # the fast method. tree.tree_.compute_partial_dependence( - grid, target_features, averaged_predictions) + grid, target_features, averaged_predictions + ) # Average over the forest averaged_predictions /= len(self.estimators_) @@ -1449,35 +1508,45 @@ class labels (multi-output problem). >>> print(clf.predict([[0, 0, 0, 0]])) [1] """ - def __init__(self, - n_estimators=100, - *, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0.0, - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - feature_importances="impurity"): + + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", - "min_impurity_decrease", "random_state", - "ccp_alpha"), + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + 
"min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + ), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -1789,34 +1858,43 @@ class RandomForestRegressor(ForestRegressor): [-8.32987858] """ - def __init__(self, - n_estimators=100, - *, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0.0, - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=True, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - feature_importances="impurity"): + def __init__( + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=True, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=DecisionTreeRegressor(), n_estimators=n_estimators, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", - "min_impurity_decrease", "random_state", - "ccp_alpha"), + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + ), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -2135,35 +2213,44 @@ class labels (multi-output problem). 
array([1]) """ - def __init__(self, - n_estimators=100, - *, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0.0, - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - class_weight=None, - ccp_alpha=0.0, - max_samples=None, - feature_importances="impurity"): + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + class_weight=None, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", - "min_impurity_decrease", "random_state", - "ccp_alpha"), + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + ), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -2460,34 +2547,43 @@ class ExtraTreesRegressor(ForestRegressor): 0.2708... 
""" - def __init__(self, - n_estimators=100, - *, - criterion="squared_error", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0.0, - max_features="auto", - max_leaf_nodes=None, - min_impurity_decrease=0., - bootstrap=False, - oob_score=False, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False, - ccp_alpha=0.0, - max_samples=None, - feature_importances="impurity"): + def __init__( + self, + n_estimators=100, + *, + criterion="squared_error", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + max_leaf_nodes=None, + min_impurity_decrease=0.0, + bootstrap=False, + oob_score=False, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ccp_alpha=0.0, + max_samples=None, + feature_importances="impurity", + ): super().__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", - "min_impurity_decrease", "random_state", - "ccp_alpha"), + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + "ccp_alpha", + ), bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, @@ -2683,33 +2779,44 @@ class RandomTreesEmbedding(BaseForest): criterion = "squared_error" max_features = 1 - def __init__(self, - n_estimators=100, *, - max_depth=5, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_leaf_nodes=None, - min_impurity_decrease=0., - sparse_output=True, - n_jobs=None, - random_state=None, - verbose=0, - warm_start=False): + def __init__( + self, + n_estimators=100, + *, + max_depth=5, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_leaf_nodes=None, + 
min_impurity_decrease=0.0, + sparse_output=True, + n_jobs=None, + random_state=None, + verbose=0, + warm_start=False, + ): super().__init__( base_estimator=ExtraTreeRegressor(), n_estimators=n_estimators, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "max_leaf_nodes", - "min_impurity_decrease", "random_state"), + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "max_leaf_nodes", + "min_impurity_decrease", + "random_state", + ), bootstrap=False, oob_score=False, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, - max_samples=None) + max_samples=None, + ) self.max_depth = max_depth self.min_samples_split = min_samples_split @@ -2776,7 +2883,7 @@ def fit_transform(self, X, y=None, sample_weight=None): X_transformed : sparse matrix of shape (n_samples, n_out) Transformed dataset. """ - X = self._validate_data(X, accept_sparse=['csc']) + X = self._validate_data(X, accept_sparse=["csc"]) if issparse(X): # Pre-sort indices to avoid that each individual tree of the # ensemble sorts the indices. 
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 155ec6f80a8f4..5f8d197c90c5f 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -68,8 +68,14 @@ # Larger classification sample used for testing feature importances X_large, y_large = datasets.make_classification( - n_samples=500, n_features=10, n_informative=3, n_redundant=0, - n_repeated=0, shuffle=False, random_state=0) + n_samples=500, + n_features=10, + n_informative=3, + n_redundant=0, + n_repeated=0, + shuffle=False, + random_state=0, +) # also load the iris dataset # and randomly permute it @@ -80,8 +86,7 @@ iris.target = iris.target[perm] # Make regression dataset -X_reg, y_reg = datasets.make_regression(n_samples=500, n_features=10, - random_state=1) +X_reg, y_reg = datasets.make_regression(n_samples=500, n_features=10, random_state=1) # also make a hastie_10_2 dataset hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1) @@ -133,7 +138,7 @@ def check_classification_toy(name): assert leaf_indices.shape == (len(X), clf.n_estimators) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_classification_toy(name): check_classification_toy(name) @@ -142,23 +147,21 @@ def check_iris_criterion(name, criterion): # Check consistency on dataset iris. 
ForestClassifier = FOREST_CLASSIFIERS[name] - clf = ForestClassifier(n_estimators=10, criterion=criterion, - random_state=1) + clf = ForestClassifier(n_estimators=10, criterion=criterion, random_state=1) clf.fit(iris.data, iris.target) score = clf.score(iris.data, iris.target) - assert score > 0.9, ("Failed with criterion %s and score = %f" - % (criterion, score)) + assert score > 0.9, "Failed with criterion %s and score = %f" % (criterion, score) - clf = ForestClassifier(n_estimators=10, criterion=criterion, - max_features=2, random_state=1) + clf = ForestClassifier( + n_estimators=10, criterion=criterion, max_features=2, random_state=1 + ) clf.fit(iris.data, iris.target) score = clf.score(iris.data, iris.target) - assert score > 0.5, ("Failed with criterion %s and score = %f" - % (criterion, score)) + assert score > 0.5, "Failed with criterion %s and score = %f" % (criterion, score) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) -@pytest.mark.parametrize('criterion', ("gini", "entropy")) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) +@pytest.mark.parametrize("criterion", ("gini", "entropy")) def test_iris(name, criterion): check_iris_criterion(name, criterion) @@ -167,25 +170,30 @@ def check_regression_criterion(name, criterion): # Check consistency on regression dataset. 
ForestRegressor = FOREST_REGRESSORS[name] - reg = ForestRegressor(n_estimators=5, criterion=criterion, - random_state=1) + reg = ForestRegressor(n_estimators=5, criterion=criterion, random_state=1) reg.fit(X_reg, y_reg) score = reg.score(X_reg, y_reg) - assert score > 0.93, ("Failed with max_features=None, criterion %s " - "and score = %f" % (criterion, score)) + assert ( + score > 0.93 + ), "Failed with max_features=None, criterion %s " "and score = %f" % ( + criterion, + score, + ) - reg = ForestRegressor(n_estimators=5, criterion=criterion, - max_features=6, random_state=1) + reg = ForestRegressor( + n_estimators=5, criterion=criterion, max_features=6, random_state=1 + ) reg.fit(X_reg, y_reg) score = reg.score(X_reg, y_reg) - assert score > 0.92, ("Failed with max_features=6, criterion %s " - "and score = %f" % (criterion, score)) + assert ( + score > 0.92 + ), "Failed with max_features=6, criterion %s " "and score = %f" % (criterion, score) -@pytest.mark.parametrize('name', FOREST_REGRESSORS) -@pytest.mark.parametrize('criterion', ( - "squared_error", "absolute_error", "friedman_mse" -)) +@pytest.mark.parametrize("name", FOREST_REGRESSORS) +@pytest.mark.parametrize( + "criterion", ("squared_error", "absolute_error", "friedman_mse") +) def test_regression(name, criterion): check_regression_criterion(name, criterion) @@ -195,26 +203,27 @@ def test_poisson_vs_mse(): mse for a poisson target.""" rng = np.random.RandomState(42) n_train, n_test, n_features = 500, 500, 10 - X = datasets.make_low_rank_matrix(n_samples=n_train + n_test, - n_features=n_features, random_state=rng) + X = datasets.make_low_rank_matrix( + n_samples=n_train + n_test, n_features=n_features, random_state=rng + ) X = np.abs(X) X /= np.max(np.abs(X), axis=0) # We create a log-linear Poisson model coef = rng.uniform(low=-4, high=1, size=n_features) y = rng.poisson(lam=np.exp(X @ coef)) - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=n_test, - random_state=rng) + X_train, 
X_test, y_train, y_test = train_test_split( + X, y, test_size=n_test, random_state=rng + ) forest_poi = RandomForestRegressor( - criterion="poisson", - min_samples_leaf=10, - max_features="sqrt", - random_state=rng) + criterion="poisson", min_samples_leaf=10, max_features="sqrt", random_state=rng + ) forest_mse = RandomForestRegressor( criterion="squared_error", min_samples_leaf=10, max_features="sqrt", - random_state=rng) + random_state=rng, + ) forest_poi.fit(X_train, y_train) forest_mse.fit(X_train, y_train) @@ -229,8 +238,8 @@ def test_poisson_vs_mse(): # not clip to a tiny value like 1e-15, but to 0.1. This acts like a # mild penalty to the non-positive predictions. metric_mse = mean_poisson_deviance( - y, - np.clip(forest_mse.predict(X), 1e-6, None)) + y, np.clip(forest_mse.predict(X), 1e-6, None) + ) metric_dummy = mean_poisson_deviance(y, dummy.predict(X)) # As squared_error might correctly predict 0 in train set, its train # score can be better than Poisson. This is no longer the case for the @@ -240,21 +249,21 @@ def test_poisson_vs_mse(): assert metric_poi < metric_dummy -@pytest.mark.parametrize('criterion', ('poisson', 'squared_error')) +@pytest.mark.parametrize("criterion", ("poisson", "squared_error")) def test_balance_property_random_forest(criterion): - """"Test that sum(y_pred)==sum(y_true) on the training set.""" + """ "Test that sum(y_pred)==sum(y_true) on the training set.""" rng = np.random.RandomState(42) n_train, n_test, n_features = 500, 500, 10 - X = datasets.make_low_rank_matrix(n_samples=n_train + n_test, - n_features=n_features, random_state=rng) + X = datasets.make_low_rank_matrix( + n_samples=n_train + n_test, n_features=n_features, random_state=rng + ) coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0) y = rng.poisson(lam=np.exp(X @ coef)) - reg = RandomForestRegressor(criterion=criterion, - n_estimators=10, - bootstrap=False, - random_state=rng) + reg = RandomForestRegressor( + criterion=criterion, 
n_estimators=10, bootstrap=False, random_state=rng + ) reg.fit(X, y) assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y)) @@ -271,7 +280,7 @@ def check_regressor_attributes(name): assert not hasattr(r, "n_classes_") -@pytest.mark.parametrize('name', FOREST_REGRESSORS) +@pytest.mark.parametrize("name", FOREST_REGRESSORS) def test_regressor_attributes(name): check_regressor_attributes(name) @@ -280,16 +289,19 @@ def check_probability(name): # Predict probabilities. ForestClassifier = FOREST_CLASSIFIERS[name] with np.errstate(divide="ignore"): - clf = ForestClassifier(n_estimators=10, random_state=1, max_features=1, - max_depth=1) + clf = ForestClassifier( + n_estimators=10, random_state=1, max_features=1, max_depth=1 + ) clf.fit(iris.data, iris.target) - assert_array_almost_equal(np.sum(clf.predict_proba(iris.data), axis=1), - np.ones(iris.data.shape[0])) - assert_array_almost_equal(clf.predict_proba(iris.data), - np.exp(clf.predict_log_proba(iris.data))) + assert_array_almost_equal( + np.sum(clf.predict_proba(iris.data), axis=1), np.ones(iris.data.shape[0]) + ) + assert_array_almost_equal( + clf.predict_proba(iris.data), np.exp(clf.predict_log_proba(iris.data)) + ) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_probability(name): check_probability(name) @@ -301,8 +313,7 @@ def check_importances(name, criterion, dtype, tolerance): ForestEstimator = FOREST_ESTIMATORS[name] - est = ForestEstimator(n_estimators=10, criterion=criterion, - random_state=0) + est = ForestEstimator(n_estimators=10, criterion=criterion, random_state=0) est.fit(X, y) importances = est.feature_importances_ @@ -327,24 +338,20 @@ def check_importances(name, criterion, dtype, tolerance): assert np.all(importances >= 0.0) for scale in [0.5, 100]: - est = ForestEstimator(n_estimators=10, random_state=0, - criterion=criterion) + est = ForestEstimator(n_estimators=10, random_state=0, criterion=criterion) est.fit(X, y, 
sample_weight=scale * sample_weight) importances_bis = est.feature_importances_ assert np.abs(importances - importances_bis).mean() < tolerance -@pytest.mark.parametrize('dtype', (np.float64, np.float32)) +@pytest.mark.parametrize("dtype", (np.float64, np.float32)) @pytest.mark.parametrize( - 'name, criterion', - itertools.chain(product(FOREST_CLASSIFIERS, - ["gini", "entropy"]), - product(FOREST_REGRESSORS, - [ - "squared_error", - "friedman_mse", - "absolute_error" - ]))) + "name, criterion", + itertools.chain( + product(FOREST_CLASSIFIERS, ["gini", "entropy"]), + product(FOREST_REGRESSORS, ["squared_error", "friedman_mse", "absolute_error"]), + ), +) def test_importances(dtype, name, criterion): tolerance = 0.01 if name in FOREST_REGRESSORS and criterion == "absolute_error": @@ -362,10 +369,10 @@ def binomial(k, n): def entropy(samples): n_samples = len(samples) - entropy = 0. + entropy = 0.0 for count in np.bincount(samples): - p = 1. * count / n_samples + p = 1.0 * count / n_samples if p > 0: entropy -= p * np.log2(p) @@ -378,11 +385,11 @@ def mdi_importance(X_m, X, y): features.pop(X_m) values = [np.unique(X[:, i]) for i in range(n_features)] - imp = 0. + imp = 0.0 for k in range(n_features): # Weight of each B of size k - coef = 1. / (binomial(k, n_features) * (n_features - k)) + coef = 1.0 / (binomial(k, n_features) * (n_features - k)) # For all B of size k for B in combinations(features, k): @@ -403,24 +410,36 @@ def mdi_importance(X_m, X, y): mask_xi = X_[:, X_m] == xi children.append(y_[mask_xi]) - imp += (coef - * (1. 
* n_samples_b / n_samples) # P(B=b) - * (entropy(y_) - - sum([entropy(c) * len(c) / n_samples_b - for c in children]))) + imp += ( + coef + * (1.0 * n_samples_b / n_samples) # P(B=b) + * ( + entropy(y_) + - sum( + [ + entropy(c) * len(c) / n_samples_b + for c in children + ] + ) + ) + ) return imp - data = np.array([[0, 0, 1, 0, 0, 1, 0, 1], - [1, 0, 1, 1, 1, 0, 1, 2], - [1, 0, 1, 1, 0, 1, 1, 3], - [0, 1, 1, 1, 0, 1, 0, 4], - [1, 1, 0, 1, 0, 1, 1, 5], - [1, 1, 0, 1, 1, 1, 1, 6], - [1, 0, 1, 0, 0, 1, 0, 7], - [1, 1, 1, 1, 1, 1, 1, 8], - [1, 1, 1, 1, 0, 1, 1, 9], - [1, 1, 1, 0, 1, 1, 1, 0]]) + data = np.array( + [ + [0, 0, 1, 0, 0, 1, 0, 1], + [1, 0, 1, 1, 1, 0, 1, 2], + [1, 0, 1, 1, 0, 1, 1, 3], + [0, 1, 1, 1, 0, 1, 0, 4], + [1, 1, 0, 1, 0, 1, 1, 5], + [1, 1, 0, 1, 1, 1, 1, 6], + [1, 0, 1, 0, 0, 1, 0, 7], + [1, 1, 1, 1, 1, 1, 1, 8], + [1, 1, 1, 1, 0, 1, 1, 9], + [1, 1, 1, 0, 1, 1, 1, 0], + ] + ) X, y = np.array(data[:, :7], dtype=bool), data[:, 7] n_features = X.shape[1] @@ -432,37 +451,38 @@ def mdi_importance(X_m, X, y): true_importances[i] = mdi_importance(i, X, y) # Estimate importances with totally randomized trees - clf = ExtraTreesClassifier(n_estimators=500, - max_features=1, - criterion="entropy", - random_state=0).fit(X, y) + clf = ExtraTreesClassifier( + n_estimators=500, max_features=1, criterion="entropy", random_state=0 + ).fit(X, y) - importances = sum(tree.tree_.compute_feature_importances(normalize=False) - for tree in clf.estimators_) / clf.n_estimators + importances = ( + sum( + tree.tree_.compute_feature_importances(normalize=False) + for tree in clf.estimators_ + ) + / clf.n_estimators + ) # Check correctness assert_almost_equal(entropy(y), sum(importances)) assert np.abs(true_importances - importances).mean() < 0.01 -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_unfitted_feature_importances(name): - err_msg = ("This {} instance is not fitted yet. 
Call 'fit' with " - "appropriate arguments before using this estimator." - .format(name)) + err_msg = ( + "This {} instance is not fitted yet. Call 'fit' with " + "appropriate arguments before using this estimator.".format(name) + ) with pytest.raises(NotFittedError, match=err_msg): - getattr(FOREST_ESTIMATORS[name](), 'feature_importances_') + getattr(FOREST_ESTIMATORS[name](), "feature_importances_") -@pytest.mark.parametrize( - "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() -) +@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values()) def test_forest_estimator_oob_importances(ForestEstimator): # Check that oob permutation importances correctly identify that # there are 3 important features - n_samples, n_features, n_informative, n_redundant, n_repeated = ( - 500, 10, 3, 0, 0 - ) + n_samples, n_features, n_informative, n_redundant, n_repeated = (500, 10, 3, 0, 0) estimator = ForestEstimator( n_estimators=10, feature_importances="permutation_oob", @@ -487,7 +507,7 @@ def test_forest_estimator_oob_importances(ForestEstimator): n_features=n_features, n_informative=n_informative, shuffle=False, - random_state=100 + random_state=100, ) imp_level = 0.1 @@ -500,9 +520,7 @@ def test_forest_estimator_oob_importances(ForestEstimator): assert_array_less(importances[n_informative:], imp_level) -@pytest.mark.parametrize( - "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() -) +@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values()) def test_forest_estimator_oob_importance_ignores_random(ForestEstimator): # Testing that a random feature with high cardinality registers as # important using impurity-based feature importance but not out-of-bag @@ -515,7 +533,7 @@ def test_forest_estimator_oob_importance_ignores_random(ForestEstimator): n_redundant=0, n_repeated=0, shuffle=False, - random_state=0 + random_state=0, ) # Dichotomize all except for the last feature so that one non-informative @@ -554,21 +572,21 @@ 
def test_forest_estimator_oob_importance_ignores_random(ForestEstimator): assert impurity_importances[-1] > imp_level -@pytest.mark.parametrize( - "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() -) +@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values()) @pytest.mark.parametrize( "params", [ {"feature_importances": "permutation_oob", "bootstrap": True}, - {"feature_importances": "impurity"} - ] + {"feature_importances": "impurity"}, + ], ) def test_forest_importances_attribute(ForestEstimator, params): # check the fitted attribute `importances_` n_samples, n_features, n_estimators = 500, 5, 10 X, y = make_classification( - n_samples=n_samples, n_features=n_features, random_state=42, + n_samples=n_samples, + n_features=n_features, + random_state=42, ) forest = ForestEstimator(n_estimators=n_estimators, **params).fit(X, y) @@ -580,8 +598,7 @@ def test_forest_importances_attribute(ForestEstimator, params): # impurity-based feature importances are normalized assert_allclose( forest.importances_.importances_mean, - (forest.feature_importances_ / - forest.importances_.importances_mean.sum()) + (forest.feature_importances_ / forest.importances_.importances_mean.sum()), ) else: assert_allclose( @@ -589,9 +606,7 @@ def test_forest_importances_attribute(ForestEstimator, params): ) -@pytest.mark.parametrize( - "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() -) +@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values()) def test_default_sample_weights_oob(ForestEstimator): # Check that setting sample_weight to np.ones(...) 
is same as default n_samples = 500 @@ -619,26 +634,26 @@ def test_default_sample_weights_oob(ForestEstimator): clf_oob_numpy = clone(clf_oob_default) clf_oob_numpy.fit(X, y, sample_weight=np.ones(n_samples)) - assert ( - clf_oob_default.oob_score_ == pytest.approx(clf_oob_numpy.oob_score_) - ) + assert clf_oob_default.oob_score_ == pytest.approx(clf_oob_numpy.oob_score_) assert_allclose( clf_oob_default.feature_importances_, clf_oob_numpy.feature_importances_, ) -@pytest.mark.parametrize( - "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() -) +@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values()) @pytest.mark.parametrize( "params, err_msg", [ - ({"feature_importances": "xxx"}, - "feature_importances should be 'impurity' or 'permutation_oob'"), - ({"feature_importances": "permutation_oob", "bootstrap": False}, - "Estimating feature importance on out of bag samples only") - ] + ( + {"feature_importances": "xxx"}, + "feature_importances should be 'impurity' or 'permutation_oob'", + ), + ( + {"feature_importances": "permutation_oob", "bootstrap": False}, + "Estimating feature importance on out of bag samples only", + ), + ], ) def test_forest_oob_importances_error(ForestEstimator, params, err_msg): # check that proper error messages are raised for feature_importances @@ -663,9 +678,7 @@ def test_forest_transformer_no_oob_importance(ForestTransformer): "X, y, lower_bound_accuracy", [ ( - *datasets.make_classification( - n_samples=300, n_classes=2, random_state=0 - ), + *datasets.make_classification(n_samples=300, n_classes=2, random_state=0), 0.9, ), ( @@ -675,26 +688,30 @@ def test_forest_transformer_no_oob_importance(ForestTransformer): 0.65, ), ( - iris.data, iris.target * 2 + 1, 0.65, + iris.data, + iris.target * 2 + 1, + 0.65, ), ( - *datasets.make_multilabel_classification( - n_samples=300, random_state=0 - ), + *datasets.make_multilabel_classification(n_samples=300, random_state=0), 0.18, ), ], ) -def 
test_forest_classifier_oob( - ForestClassifier, X, y, X_type, lower_bound_accuracy -): +def test_forest_classifier_oob(ForestClassifier, X, y, X_type, lower_bound_accuracy): """Check that OOB score is close to score on a test set.""" X = _convert_container(X, constructor_name=X_type) X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.5, random_state=0, + X, + y, + test_size=0.5, + random_state=0, ) classifier = ForestClassifier( - n_estimators=40, bootstrap=True, oob_score=True, random_state=0, + n_estimators=40, + bootstrap=True, + oob_score=True, + random_state=0, ) assert not hasattr(classifier, "oob_score_") @@ -736,17 +753,21 @@ def test_forest_classifier_oob( ), ], ) -def test_forest_regressor_oob( - ForestRegressor, X, y, X_type, lower_bound_r2 -): +def test_forest_regressor_oob(ForestRegressor, X, y, X_type, lower_bound_r2): """Check that forest-based regressor provide an OOB score close to the score on a test set.""" X = _convert_container(X, constructor_name=X_type) X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.5, random_state=0, + X, + y, + test_size=0.5, + random_state=0, ) regressor = ForestRegressor( - n_estimators=50, bootstrap=True, oob_score=True, random_state=0, + n_estimators=50, + bootstrap=True, + oob_score=True, + random_state=0, ) assert not hasattr(regressor, "oob_score_") @@ -769,31 +790,37 @@ def test_forest_regressor_oob( assert regressor.oob_prediction_.shape == expected_shape -@pytest.mark.parametrize( - "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() -) +@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values()) def test_forest_oob_warning(ForestEstimator): """Check that a warning is raised when not enough estimator and the OOB estimates will be inacurrate.""" estimator = ForestEstimator( - n_estimators=1, oob_score=True, bootstrap=True, random_state=0, + n_estimators=1, + oob_score=True, + bootstrap=True, + random_state=0, ) with 
pytest.warns(UserWarning, match="Some inputs do not have OOB scores"): estimator.fit(iris.data, iris.target) -@pytest.mark.parametrize( - "ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values() -) +@pytest.mark.parametrize("ForestEstimator", FOREST_CLASSIFIERS_REGRESSORS.values()) @pytest.mark.parametrize( "X, y, params, err_msg", [ - (iris.data, iris.target, {"oob_score": True, "bootstrap": False}, - "Out of bag estimation only available if bootstrap=True"), - (iris.data, rng.randint(low=0, high=5, size=(iris.data.shape[0], 2)), - {"oob_score": True, "bootstrap": True}, - "The type of target cannot be used to compute OOB estimates") - ] + ( + iris.data, + iris.target, + {"oob_score": True, "bootstrap": False}, + "Out of bag estimation only available if bootstrap=True", + ), + ( + iris.data, + rng.randint(low=0, high=5, size=(iris.data.shape[0], 2)), + {"oob_score": True, "bootstrap": True}, + "The type of target cannot be used to compute OOB estimates", + ), + ], ) def test_forest_oob_error(ForestEstimator, X, y, params, err_msg): estimator = ForestEstimator(**params) @@ -811,11 +838,11 @@ def test_random_trees_embedding_raise_error_oob(oob_score): def check_gridsearch(name): forest = FOREST_CLASSIFIERS[name]() - clf = GridSearchCV(forest, {'n_estimators': (1, 2), 'max_depth': (1, 2)}) + clf = GridSearchCV(forest, {"n_estimators": (1, 2), "max_depth": (1, 2)}) clf.fit(iris.data, iris.target) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_gridsearch(name): # Check that base trees can be grid-searched. 
check_gridsearch(name) @@ -836,7 +863,7 @@ def check_parallel(name, X, y): assert_array_almost_equal(y1, y2, 3) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) def test_parallel(name): if name in FOREST_CLASSIFIERS: X = iris.data @@ -863,7 +890,7 @@ def check_pickle(name, X, y): assert score == score2 -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) def test_pickle(name): if name in FOREST_CLASSIFIERS: X = iris.data @@ -878,10 +905,34 @@ def test_pickle(name): def check_multioutput(name): # Check estimators on multi-output problems. - X_train = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-2, 1], - [-1, 1], [-1, 2], [2, -1], [1, -1], [1, -2]] - y_train = [[-1, 0], [-1, 0], [-1, 0], [1, 1], [1, 1], [1, 1], [-1, 2], - [-1, 2], [-1, 2], [1, 3], [1, 3], [1, 3]] + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + [-1, 0], + [-1, 0], + [-1, 0], + [1, 1], + [1, 1], + [1, 1], + [-1, 2], + [-1, 2], + [-1, 2], + [1, 3], + [1, 3], + [1, 3], + ] X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] @@ -902,24 +953,50 @@ def check_multioutput(name): assert log_proba[1].shape == (4, 4) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) def test_multioutput(name): check_multioutput(name) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_multioutput_string(name): # Check estimators on multi-output problems with string outputs. 
- X_train = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-2, 1], - [-1, 1], [-1, 2], [2, -1], [1, -1], [1, -2]] - y_train = [["red", "blue"], ["red", "blue"], ["red", "blue"], - ["green", "green"], ["green", "green"], ["green", "green"], - ["red", "purple"], ["red", "purple"], ["red", "purple"], - ["green", "yellow"], ["green", "yellow"], ["green", "yellow"]] + X_train = [ + [-2, -1], + [-1, -1], + [-1, -2], + [1, 1], + [1, 2], + [2, 1], + [-2, 1], + [-1, 1], + [-1, 2], + [2, -1], + [1, -1], + [1, -2], + ] + y_train = [ + ["red", "blue"], + ["red", "blue"], + ["red", "blue"], + ["green", "green"], + ["green", "green"], + ["green", "green"], + ["red", "purple"], + ["red", "purple"], + ["red", "purple"], + ["green", "yellow"], + ["green", "yellow"], + ["green", "yellow"], + ] X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] - y_test = [["red", "blue"], ["green", "green"], - ["red", "purple"], ["green", "yellow"]] + y_test = [ + ["red", "blue"], + ["green", "green"], + ["red", "purple"], + ["green", "yellow"], + ] est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) y_pred = est.fit(X_train, y_train).predict(X_test) @@ -955,7 +1032,7 @@ def check_classes_shape(name): assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]]) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_classes_shape(name): check_classes_shape(name) @@ -978,10 +1055,12 @@ def test_random_trees_dense_equal(): # works by returning the same array for both argument values. 
# Create the RTEs - hasher_dense = RandomTreesEmbedding(n_estimators=10, sparse_output=False, - random_state=0) - hasher_sparse = RandomTreesEmbedding(n_estimators=10, sparse_output=True, - random_state=0) + hasher_dense = RandomTreesEmbedding( + n_estimators=10, sparse_output=False, random_state=0 + ) + hasher_sparse = RandomTreesEmbedding( + n_estimators=10, sparse_output=True, random_state=0 + ) X, y = datasets.make_circles(factor=0.5) X_transformed_dense = hasher_dense.fit_transform(X) X_transformed_sparse = hasher_sparse.fit_transform(X) @@ -1003,8 +1082,7 @@ def test_random_hasher(): # test fit and transform: hasher = RandomTreesEmbedding(n_estimators=30, random_state=1) - assert_array_equal(hasher.fit(X).transform(X).toarray(), - X_transformed.toarray()) + assert_array_equal(hasher.fit(X).transform(X).toarray(), X_transformed.toarray()) # one leaf active per data point per forest assert X_transformed.shape[0] == X.shape[0] @@ -1013,7 +1091,7 @@ def test_random_hasher(): X_reduced = svd.fit_transform(X_transformed) linear_clf = LinearSVC() linear_clf.fit(X_reduced, y) - assert linear_clf.score(X_reduced, y) == 1. + assert linear_clf.score(X_reduced, y) == 1.0 def test_random_hasher_sparse_data(): @@ -1031,8 +1109,9 @@ def test_parallel_train(): y_train = rng.randint(0, 2, n_samples) clfs = [ - RandomForestClassifier(n_estimators=20, n_jobs=n_jobs, - random_state=12345).fit(X_train, y_train) + RandomForestClassifier(n_estimators=20, n_jobs=n_jobs, random_state=12345).fit( + X_train, y_train + ) for n_jobs in [1, 2, 3, 8, 16, 32] ] @@ -1054,14 +1133,14 @@ def test_distribution(): uniques = defaultdict(int) for tree in reg.estimators_: - tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") - for f, t in zip(tree.tree_.feature, - tree.tree_.threshold)) + tree = "".join( + ("%d,%d/" % (f, int(t)) if f >= 0 else "-") + for f, t in zip(tree.tree_.feature, tree.tree_.threshold) + ) uniques[tree] += 1 - uniques = sorted([(1. 
* count / n_trees, tree) - for tree, count in uniques.items()]) + uniques = sorted([(1.0 * count / n_trees, tree) for tree, count in uniques.items()]) # On a single variable problem where X_0 has 4 equiprobable values, there # are 5 ways to build a random tree. The more compact (0,1/0,0/--0,2/--) of @@ -1085,9 +1164,10 @@ def test_distribution(): uniques = defaultdict(int) for tree in reg.estimators_: - tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") - for f, t in zip(tree.tree_.feature, - tree.tree_.threshold)) + tree = "".join( + ("%d,%d/" % (f, int(t)) if f >= 0 else "-") + for f, t in zip(tree.tree_.feature, tree.tree_.threshold) + ) uniques[tree] += 1 @@ -1100,16 +1180,16 @@ def check_max_leaf_nodes_max_depth(name): # Test precedence of max_leaf_nodes over max_depth. ForestEstimator = FOREST_ESTIMATORS[name] - est = ForestEstimator(max_depth=1, max_leaf_nodes=4, - n_estimators=1, random_state=0).fit(X, y) + est = ForestEstimator( + max_depth=1, max_leaf_nodes=4, n_estimators=1, random_state=0 + ).fit(X, y) assert est.estimators_[0].get_depth() == 1 - est = ForestEstimator(max_depth=1, n_estimators=1, - random_state=0).fit(X, y) + est = ForestEstimator(max_depth=1, n_estimators=1, random_state=0).fit(X, y) assert est.estimators_[0].get_depth() == 1 -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_max_leaf_nodes_max_depth(name): check_max_leaf_nodes_max_depth(name) @@ -1131,20 +1211,17 @@ def check_min_samples_split(name): node_idx = est.estimators_[0].tree_.children_left != -1 node_samples = est.estimators_[0].tree_.n_node_samples[node_idx] - assert np.min(node_samples) > len(X) * 0.5 - 1, ( - "Failed with {0}".format(name)) + assert np.min(node_samples) > len(X) * 0.5 - 1, "Failed with {0}".format(name) - est = ForestEstimator(min_samples_split=0.5, n_estimators=1, - random_state=0) + est = ForestEstimator(min_samples_split=0.5, n_estimators=1, random_state=0) est.fit(X, y) node_idx = 
est.estimators_[0].tree_.children_left != -1 node_samples = est.estimators_[0].tree_.n_node_samples[node_idx] - assert np.min(node_samples) > len(X) * 0.5 - 1, ( - "Failed with {0}".format(name)) + assert np.min(node_samples) > len(X) * 0.5 - 1, "Failed with {0}".format(name) -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_min_samples_split(name): check_min_samples_split(name) @@ -1169,18 +1246,16 @@ def check_min_samples_leaf(name): leaf_count = node_counts[node_counts != 0] assert np.min(leaf_count) > 4, "Failed with {0}".format(name) - est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, - random_state=0) + est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1, random_state=0) est.fit(X, y) out = est.estimators_[0].tree_.apply(X) node_counts = np.bincount(out) # drop inner nodes leaf_count = node_counts[node_counts != 0] - assert np.min(leaf_count) > len(X) * 0.25 - 1, ( - "Failed with {0}".format(name)) + assert np.min(leaf_count) > len(X) * 0.25 - 1, "Failed with {0}".format(name) -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_min_samples_leaf(name): check_min_samples_leaf(name) @@ -1198,8 +1273,9 @@ def check_min_weight_fraction_leaf(name): # test both DepthFirstTreeBuilder and BestFirstTreeBuilder # by setting max_leaf_nodes for frac in np.linspace(0, 0.5, 6): - est = ForestEstimator(min_weight_fraction_leaf=frac, n_estimators=1, - random_state=0) + est = ForestEstimator( + min_weight_fraction_leaf=frac, n_estimators=1, random_state=0 + ) if "RandomForest" in name: est.bootstrap = False @@ -1209,13 +1285,13 @@ def check_min_weight_fraction_leaf(name): # drop inner nodes leaf_weights = node_weights[node_weights != 0] assert ( - np.min(leaf_weights) >= - total_weight * est.min_weight_fraction_leaf), ( - "Failed with {0} min_weight_fraction_leaf={1}".format( - name, est.min_weight_fraction_leaf)) + 
np.min(leaf_weights) >= total_weight * est.min_weight_fraction_leaf + ), "Failed with {0} min_weight_fraction_leaf={1}".format( + name, est.min_weight_fraction_leaf + ) -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_min_weight_fraction_leaf(name): check_min_weight_fraction_leaf(name) @@ -1230,28 +1306,29 @@ def check_sparse_input(name, X, X_sparse, y): if name in FOREST_CLASSIFIERS or name in FOREST_REGRESSORS: assert_array_almost_equal(sparse.predict(X), dense.predict(X)) - assert_array_almost_equal(sparse.feature_importances_, - dense.feature_importances_) + assert_array_almost_equal( + sparse.feature_importances_, dense.feature_importances_ + ) if name in FOREST_CLASSIFIERS: - assert_array_almost_equal(sparse.predict_proba(X), - dense.predict_proba(X)) - assert_array_almost_equal(sparse.predict_log_proba(X), - dense.predict_log_proba(X)) + assert_array_almost_equal(sparse.predict_proba(X), dense.predict_proba(X)) + assert_array_almost_equal( + sparse.predict_log_proba(X), dense.predict_log_proba(X) + ) if name in FOREST_TRANSFORMERS: - assert_array_almost_equal(sparse.transform(X).toarray(), - dense.transform(X).toarray()) - assert_array_almost_equal(sparse.fit_transform(X).toarray(), - dense.fit_transform(X).toarray()) + assert_array_almost_equal( + sparse.transform(X).toarray(), dense.transform(X).toarray() + ) + assert_array_almost_equal( + sparse.fit_transform(X).toarray(), dense.fit_transform(X).toarray() + ) -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) -@pytest.mark.parametrize('sparse_matrix', - (csr_matrix, csc_matrix, coo_matrix)) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) +@pytest.mark.parametrize("sparse_matrix", (csr_matrix, csc_matrix, coo_matrix)) def test_sparse_input(name, sparse_matrix): - X, y = datasets.make_multilabel_classification(random_state=0, - n_samples=50) + X, y = datasets.make_multilabel_classification(random_state=0, n_samples=50) 
check_sparse_input(name, X, sparse_matrix(X), y) @@ -1303,8 +1380,8 @@ def check_memory_layout(name, dtype): assert_array_almost_equal(est.fit(X, y).predict(X), y) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) -@pytest.mark.parametrize('dtype', (np.float64, np.float32)) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("dtype", (np.float64, np.float32)) def test_memory_layout(name, dtype): check_memory_layout(name, dtype) @@ -1323,7 +1400,7 @@ def check_1d_input(name, X, X_2d, y): est.predict(X) -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_1d_input(name): X = iris.data[:, 0] X_2d = iris.data[:, 0].reshape((-1, 1)) @@ -1340,28 +1417,32 @@ def check_class_weights(name): # Iris is balanced, so no effect expected for using 'balanced' weights clf1 = ForestClassifier(random_state=0) clf1.fit(iris.data, iris.target) - clf2 = ForestClassifier(class_weight='balanced', random_state=0) + clf2 = ForestClassifier(class_weight="balanced", random_state=0) clf2.fit(iris.data, iris.target) assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) # Make a multi-output problem with three copies of Iris iris_multi = np.vstack((iris.target, iris.target, iris.target)).T # Create user-defined weights that should balance over the outputs - clf3 = ForestClassifier(class_weight=[{0: 2., 1: 2., 2: 1.}, - {0: 2., 1: 1., 2: 2.}, - {0: 1., 1: 2., 2: 2.}], - random_state=0) + clf3 = ForestClassifier( + class_weight=[ + {0: 2.0, 1: 2.0, 2: 1.0}, + {0: 2.0, 1: 1.0, 2: 2.0}, + {0: 1.0, 1: 2.0, 2: 2.0}, + ], + random_state=0, + ) clf3.fit(iris.data, iris_multi) assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_) # Check against multi-output "balanced" which should also have no effect - clf4 = ForestClassifier(class_weight='balanced', random_state=0) + clf4 = ForestClassifier(class_weight="balanced", random_state=0) 
clf4.fit(iris.data, iris_multi) assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_) # Inflate importance of class 1, check against user-defined weights sample_weight = np.ones(iris.target.shape) sample_weight[iris.target == 1] *= 100 - class_weight = {0: 1., 1: 100., 2: 1.} + class_weight = {0: 1.0, 1: 100.0, 2: 1.0} clf1 = ForestClassifier(random_state=0) clf1.fit(iris.data, iris.target, sample_weight) clf2 = ForestClassifier(class_weight=class_weight, random_state=0) @@ -1376,7 +1457,7 @@ def check_class_weights(name): assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_class_weights(name): check_class_weights(name) @@ -1385,17 +1466,18 @@ def check_class_weight_balanced_and_bootstrap_multi_output(name): # Test class_weight works for multi-output""" ForestClassifier = FOREST_CLASSIFIERS[name] _y = np.vstack((y, np.array(y) * 2)).T - clf = ForestClassifier(class_weight='balanced', random_state=0) + clf = ForestClassifier(class_weight="balanced", random_state=0) clf.fit(X, _y) - clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}, {-2: 1., 2: 1.}], - random_state=0) + clf = ForestClassifier( + class_weight=[{-1: 0.5, 1: 1.0}, {-2: 1.0, 2: 1.0}], random_state=0 + ) clf.fit(X, _y) # smoke test for balanced subsample - clf = ForestClassifier(class_weight='balanced_subsample', random_state=0) + clf = ForestClassifier(class_weight="balanced_subsample", random_state=0) clf.fit(X, _y) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_class_weight_balanced_and_bootstrap_multi_output(name): check_class_weight_balanced_and_bootstrap_multi_output(name) @@ -1406,20 +1488,18 @@ def check_class_weight_errors(name): _y = np.vstack((y, np.array(y) * 2)).T # Invalid preset string - clf = ForestClassifier(class_weight='the larch', random_state=0) + clf = 
ForestClassifier(class_weight="the larch", random_state=0) with pytest.raises(ValueError): clf.fit(X, y) with pytest.raises(ValueError): clf.fit(X, _y) # Warning warm_start with preset - clf = ForestClassifier(class_weight='balanced', warm_start=True, - random_state=0) + clf = ForestClassifier(class_weight="balanced", warm_start=True, random_state=0) clf.fit(X, y) warn_msg = ( - "Warm-start fitting without increasing n_estimators does not fit new " - "trees." + "Warm-start fitting without increasing n_estimators does not fit new " "trees." ) with pytest.warns(UserWarning, match=warn_msg): clf.fit(X, _y) @@ -1430,12 +1510,12 @@ def check_class_weight_errors(name): clf.fit(X, _y) # Incorrect length list for multi-output - clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.}], random_state=0) + clf = ForestClassifier(class_weight=[{-1: 0.5, 1: 1.0}], random_state=0) with pytest.raises(ValueError): clf.fit(X, _y) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_class_weight_errors(name): check_class_weight_errors(name) @@ -1448,26 +1528,29 @@ def check_warm_start(name, random_state=42): est_ws = None for n_estimators in [5, 10]: if est_ws is None: - est_ws = ForestEstimator(n_estimators=n_estimators, - random_state=random_state, - warm_start=True) + est_ws = ForestEstimator( + n_estimators=n_estimators, random_state=random_state, warm_start=True + ) else: est_ws.set_params(n_estimators=n_estimators) est_ws.fit(X, y) assert len(est_ws) == n_estimators - est_no_ws = ForestEstimator(n_estimators=10, random_state=random_state, - warm_start=False) + est_no_ws = ForestEstimator( + n_estimators=10, random_state=random_state, warm_start=False + ) est_no_ws.fit(X, y) - assert (set([tree.random_state for tree in est_ws]) == - set([tree.random_state for tree in est_no_ws])) + assert set([tree.random_state for tree in est_ws]) == set( + [tree.random_state for tree in est_no_ws] + ) - 
assert_array_equal(est_ws.apply(X), est_no_ws.apply(X), - err_msg="Failed with {0}".format(name)) + assert_array_equal( + est_ws.apply(X), est_no_ws.apply(X), err_msg="Failed with {0}".format(name) + ) -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_warm_start(name): check_warm_start(name) @@ -1476,12 +1559,12 @@ def check_warm_start_clear(name): # Test if fit clears state and grows a new forest when warm_start==False. X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, - random_state=1) + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) est.fit(X, y) - est_2 = ForestEstimator(n_estimators=5, max_depth=1, warm_start=True, - random_state=2) + est_2 = ForestEstimator( + n_estimators=5, max_depth=1, warm_start=True, random_state=2 + ) est_2.fit(X, y) # inits state est_2.set_params(warm_start=False, random_state=1) est_2.fit(X, y) # clears old state and equals est @@ -1489,7 +1572,7 @@ def check_warm_start_clear(name): assert_array_almost_equal(est_2.apply(X), est.apply(X)) -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_warm_start_clear(name): check_warm_start_clear(name) @@ -1505,7 +1588,7 @@ def check_warm_start_smaller_n_estimators(name): est.fit(X, y) -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_warm_start_smaller_n_estimators(name): check_warm_start_smaller_n_estimators(name) @@ -1515,19 +1598,18 @@ def check_warm_start_equal_n_estimators(name): # same forest and raises a warning. 
X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] - est = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, - random_state=1) + est = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, random_state=1) est.fit(X, y) - est_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=True, - random_state=1) + est_2 = ForestEstimator( + n_estimators=5, max_depth=3, warm_start=True, random_state=1 + ) est_2.fit(X, y) # Now est_2 equals est. est_2.set_params(random_state=2) warn_msg = ( - "Warm-start fitting without increasing n_estimators does not fit " - "new trees." + "Warm-start fitting without increasing n_estimators does not fit " "new trees." ) with pytest.warns(UserWarning, match=warn_msg): est_2.fit(X, y) @@ -1536,7 +1618,7 @@ def check_warm_start_equal_n_estimators(name): assert_array_equal(est.apply(X), est_2.apply(X)) -@pytest.mark.parametrize('name', FOREST_ESTIMATORS) +@pytest.mark.parametrize("name", FOREST_ESTIMATORS) def test_warm_start_equal_n_estimators(name): check_warm_start_equal_n_estimators(name) @@ -1546,26 +1628,44 @@ def check_warm_start_oob(name): X, y = hastie_X, hastie_y ForestEstimator = FOREST_ESTIMATORS[name] # Use 15 estimators to avoid 'some inputs do not have OOB scores' warning. 
- est = ForestEstimator(n_estimators=15, max_depth=3, warm_start=False, - random_state=1, bootstrap=True, oob_score=True) + est = ForestEstimator( + n_estimators=15, + max_depth=3, + warm_start=False, + random_state=1, + bootstrap=True, + oob_score=True, + ) est.fit(X, y) - est_2 = ForestEstimator(n_estimators=5, max_depth=3, warm_start=False, - random_state=1, bootstrap=True, oob_score=False) + est_2 = ForestEstimator( + n_estimators=5, + max_depth=3, + warm_start=False, + random_state=1, + bootstrap=True, + oob_score=False, + ) est_2.fit(X, y) est_2.set_params(warm_start=True, oob_score=True, n_estimators=15) est_2.fit(X, y) - assert hasattr(est_2, 'oob_score_') + assert hasattr(est_2, "oob_score_") assert est.oob_score_ == est_2.oob_score_ # Test that oob_score is computed even if we don't need to train # additional trees. - est_3 = ForestEstimator(n_estimators=15, max_depth=3, warm_start=True, - random_state=1, bootstrap=True, oob_score=False) + est_3 = ForestEstimator( + n_estimators=15, + max_depth=3, + warm_start=True, + random_state=1, + bootstrap=True, + oob_score=False, + ) est_3.fit(X, y) - assert not hasattr(est_3, 'oob_score_') + assert not hasattr(est_3, "oob_score_") est_3.set_params(oob_score=True) ignore_warnings(est_3.fit)(X, y) @@ -1573,7 +1673,7 @@ def check_warm_start_oob(name): assert est.oob_score_ == est_3.oob_score_ -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) def test_warm_start_oob(name): check_warm_start_oob(name) @@ -1582,7 +1682,7 @@ def test_dtype_convert(n_classes=15): classifier = RandomForestClassifier(random_state=0, bootstrap=False) X = np.eye(n_classes) - y = [ch for ch in 'ABCDEFGHIJKLMNOPQRSTU'[:n_classes]] + y = [ch for ch in "ABCDEFGHIJKLMNOPQRSTU"[:n_classes]] result = classifier.fit(X, y).predict(X) assert_array_equal(classifier.classes_, y) @@ -1593,33 +1693,39 @@ def check_decision_path(name): X, y = hastie_X, hastie_y n_samples = 
X.shape[0] ForestEstimator = FOREST_ESTIMATORS[name] - est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, - random_state=1) + est = ForestEstimator(n_estimators=5, max_depth=1, warm_start=False, random_state=1) est.fit(X, y) indicator, n_nodes_ptr = est.decision_path(X) assert indicator.shape[1] == n_nodes_ptr[-1] assert indicator.shape[0] == n_samples - assert_array_equal(np.diff(n_nodes_ptr), - [e.tree_.node_count for e in est.estimators_]) + assert_array_equal( + np.diff(n_nodes_ptr), [e.tree_.node_count for e in est.estimators_] + ) # Assert that leaves index are correct leaves = est.apply(X) for est_id in range(leaves.shape[1]): - leave_indicator = [indicator[i, n_nodes_ptr[est_id] + j] - for i, j in enumerate(leaves[:, est_id])] + leave_indicator = [ + indicator[i, n_nodes_ptr[est_id] + j] + for i, j in enumerate(leaves[:, est_id]) + ] assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples)) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) def test_decision_path(name): check_decision_path(name) def test_min_impurity_decrease(): X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1) - all_estimators = [RandomForestClassifier, RandomForestRegressor, - ExtraTreesClassifier, ExtraTreesRegressor] + all_estimators = [ + RandomForestClassifier, + RandomForestRegressor, + ExtraTreesClassifier, + ExtraTreesRegressor, + ] for Estimator in all_estimators: est = Estimator(min_impurity_decrease=0.1) @@ -1635,14 +1741,18 @@ def test_poisson_y_positive_check(): X = np.zeros((3, 3)) y = [-1, 1, 3] - err_msg = (r"Some value\(s\) of y are negative which is " - r"not allowed for Poisson regression.") + err_msg = ( + r"Some value\(s\) of y are negative which is " + r"not allowed for Poisson regression." 
+ ) with pytest.raises(ValueError, match=err_msg): est.fit(X, y) y = [0, 0, 0] - err_msg = (r"Sum of y is not strictly positive which " - r"is necessary for Poisson regression.") + err_msg = ( + r"Sum of y is not strictly positive which " + r"is necessary for Poisson regression." + ) with pytest.raises(ValueError, match=err_msg): est.fit(X, y) @@ -1658,11 +1768,13 @@ def start_call(self): return super().start_call() -joblib.register_parallel_backend('testing', MyBackend) +joblib.register_parallel_backend("testing", MyBackend) -@pytest.mark.skipif(parse_version(joblib.__version__) < parse_version('0.12'), - reason='tests not yet supported in joblib <0.12') +@pytest.mark.skipif( + parse_version(joblib.__version__) < parse_version("0.12"), + reason="tests not yet supported in joblib <0.12", +) @skip_if_no_parallel def test_backend_respected(): clf = RandomForestClassifier(n_estimators=10, n_jobs=2) @@ -1680,10 +1792,12 @@ def test_backend_respected(): def test_forest_feature_importances_sum(): - X, y = make_classification(n_samples=15, n_informative=3, random_state=1, - n_classes=3) - clf = RandomForestClassifier(min_samples_leaf=5, random_state=42, - n_estimators=200).fit(X, y) + X, y = make_classification( + n_samples=15, n_informative=3, random_state=1, n_classes=3 + ) + clf = RandomForestClassifier( + min_samples_leaf=5, random_state=42, n_estimators=200 + ).fit(X, y) assert math.isclose(1, clf.feature_importances_.sum(), abs_tol=1e-7) @@ -1692,29 +1806,50 @@ def test_forest_degenerate_feature_importances(): X = np.zeros((10, 10)) y = np.ones((10,)) gbr = RandomForestRegressor(n_estimators=10).fit(X, y) - assert_array_equal(gbr.feature_importances_, - np.zeros(10, dtype=np.float64)) + assert_array_equal(gbr.feature_importances_, np.zeros(10, dtype=np.float64)) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS_REGRESSORS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS_REGRESSORS) @pytest.mark.parametrize( - 'max_samples, exc_type, exc_msg', - [(int(1e9), 
ValueError, - "`max_samples` must be in range 1 to 6 but got value 1000000000"), - (2.0, ValueError, - r"`max_samples` must be in range \(0.0, 1.0\] but got value 2.0"), - (0.0, ValueError, - r"`max_samples` must be in range \(0.0, 1.0\] but got value 0.0"), - (np.nan, ValueError, - r"`max_samples` must be in range \(0.0, 1.0\] but got value nan"), - (np.inf, ValueError, - r"`max_samples` must be in range \(0.0, 1.0\] but got value inf"), - ('str max_samples?!', TypeError, - r"`max_samples` should be int or float, but got " - r"type '\'"), - (np.ones(2), TypeError, - r"`max_samples` should be int or float, but got type " - r"'\'")] + "max_samples, exc_type, exc_msg", + [ + ( + int(1e9), + ValueError, + "`max_samples` must be in range 1 to 6 but got value 1000000000", + ), + ( + 2.0, + ValueError, + r"`max_samples` must be in range \(0.0, 1.0\] but got value 2.0", + ), + ( + 0.0, + ValueError, + r"`max_samples` must be in range \(0.0, 1.0\] but got value 0.0", + ), + ( + np.nan, + ValueError, + r"`max_samples` must be in range \(0.0, 1.0\] but got value nan", + ), + ( + np.inf, + ValueError, + r"`max_samples` must be in range \(0.0, 1.0\] but got value inf", + ), + ( + "str max_samples?!", + TypeError, + r"`max_samples` should be int or float, but got " r"type '\'", + ), + ( + np.ones(2), + TypeError, + r"`max_samples` should be int or float, but got type " + r"'\'", + ), + ], ) def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg): # Check invalid `max_samples` values @@ -1723,10 +1858,11 @@ def test_max_samples_exceptions(name, max_samples, exc_type, exc_msg): est.fit(X, y) -@pytest.mark.parametrize('name', FOREST_REGRESSORS) +@pytest.mark.parametrize("name", FOREST_REGRESSORS) def test_max_samples_boundary_regressors(name): X_train, X_test, y_train, y_test = train_test_split( - X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0) + X_reg, y_reg, train_size=0.7, test_size=0.3, random_state=0 + ) ms_1_model = 
FOREST_REGRESSORS[name](max_samples=1.0, random_state=0) ms_1_predict = ms_1_model.fit(X_train, y_train).predict(X_test) @@ -1740,10 +1876,11 @@ def test_max_samples_boundary_regressors(name): assert ms_1_ms == pytest.approx(ms_None_ms) -@pytest.mark.parametrize('name', FOREST_CLASSIFIERS) +@pytest.mark.parametrize("name", FOREST_CLASSIFIERS) def test_max_samples_boundary_classifiers(name): X_train, X_test, y_train, _ = train_test_split( - X_large, y_large, random_state=0, stratify=y_large) + X_large, y_large, random_state=0, stratify=y_large + ) ms_1_model = FOREST_CLASSIFIERS[name](max_samples=1.0, random_state=0) ms_1_proba = ms_1_model.fit(X_train, y_train).predict_proba(X_test) @@ -1763,9 +1900,7 @@ def test_forest_y_sparse(): est.fit(X, y) -@pytest.mark.parametrize( - 'ForestClass', [RandomForestClassifier, RandomForestRegressor] -) +@pytest.mark.parametrize("ForestClass", [RandomForestClassifier, RandomForestRegressor]) def test_little_tree_with_small_max_samples(ForestClass): rng = np.random.RandomState(1) @@ -1799,9 +1934,13 @@ def test_little_tree_with_small_max_samples(ForestClass): # FIXME: remove in 1.2 @pytest.mark.parametrize( "Estimator", - [ExtraTreesClassifier, ExtraTreesRegressor, - RandomForestClassifier, RandomForestRegressor, - RandomTreesEmbedding] + [ + ExtraTreesClassifier, + ExtraTreesRegressor, + RandomForestClassifier, + RandomForestRegressor, + RandomTreesEmbedding, + ], ) def test_n_features_deprecation(Estimator): # Check that we raise the proper deprecation warning if accessing @@ -1815,15 +1954,19 @@ def test_n_features_deprecation(Estimator): # TODO: Remove in v1.2 -@pytest.mark.parametrize("old_criterion, new_criterion", [ - ("mse", "squared_error"), - ("mae", "absolute_error"), -]) +@pytest.mark.parametrize( + "old_criterion, new_criterion", + [ + ("mse", "squared_error"), + ("mae", "absolute_error"), + ], +) def test_criterion_deprecated(old_criterion, new_criterion): est1 = RandomForestRegressor(criterion=old_criterion, 
random_state=0) - with pytest.warns(FutureWarning, - match=f"Criterion '{old_criterion}' was deprecated"): + with pytest.warns( + FutureWarning, match=f"Criterion '{old_criterion}' was deprecated" + ): est1.fit(X, y) est2 = RandomForestRegressor(criterion=new_criterion, random_state=0) @@ -1831,7 +1974,7 @@ def test_criterion_deprecated(old_criterion, new_criterion): assert_allclose(est1.predict(X), est2.predict(X)) -@pytest.mark.parametrize('Forest', FOREST_REGRESSORS) +@pytest.mark.parametrize("Forest", FOREST_REGRESSORS) def test_mse_criterion_object_segfault_smoke_test(Forest): # This is a smoke test to ensure that passing a mutable criterion # does not cause a segfault when fitting with concurrent threads. @@ -1842,8 +1985,6 @@ def test_mse_criterion_object_segfault_smoke_test(Forest): y = y_reg.reshape(-1, 1) n_samples, n_outputs = y.shape mse_criterion = MSE(n_outputs, n_samples) - est = FOREST_REGRESSORS[Forest]( - n_estimators=2, n_jobs=2, criterion=mse_criterion - ) + est = FOREST_REGRESSORS[Forest](n_estimators=2, n_jobs=2, criterion=mse_criterion) est.fit(X_reg, y) From 096abe5fc84c7d2d5631ec4a49d6754b15017327 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Mon, 21 Jun 2021 21:42:45 -0400 Subject: [PATCH 50/53] remove old assert_allclose import --- sklearn/ensemble/tests/test_forest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py index 5f8d197c90c5f..dd18c2029f47f 100644 --- a/sklearn/ensemble/tests/test_forest.py +++ b/sklearn/ensemble/tests/test_forest.py @@ -27,7 +27,6 @@ import joblib from numpy.testing import assert_allclose -from sklearn.utils._testing import assert_allclose from sklearn.dummy import DummyRegressor from sklearn.metrics import mean_poisson_deviance from sklearn.utils._testing import assert_almost_equal From be2392ea1bd0b20f11fd539700ebc162c86eb9bb Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 22 Jun 2021 09:01:41 -0400 Subject: [PATCH 
51/53] Apply suggested format changes --- doc/modules/ensemble.rst | 20 +++++++++----------- examples/ensemble/plot_forest_importances.py | 2 +- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index 6713fb9801a68..ed8bf2aebe2e7 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -355,17 +355,15 @@ MDI and the permutation feature importances are explored in: * :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py` .. topic:: References - .. [Strobl07] `Strobl, C., Boulesteix, AL., Zeileis, A. et al. - Bias in random forest variable importance measures: Illustrations, - sources and a solution. BMC Bioinformatics 8, 25 (2007). - `_ - .. [White94] `White, A.P., Liu, W.Z. Technical Note: - Bias in Information-Based Measures in Decision Tree Induction. - Machine Learning 15, 321–329 (1994). - `_ - .. [L2014] G. Louppe, - "Understanding Random Forests: From Theory to Practice", - PhD Thesis, U. of Liege, 2014. + .. [Strobl07] `Strobl, C., Boulesteix, AL., Zeileis, A. et al. + Bias in random forest variable importance measures: Illustrations, + sources and a solution. + BMC Bioinformatics 8, 25 (2007). + `_ + .. [White94] `White, A.P., Liu, W.Z. Technical Note: + Bias in Information-Based Measures in Decision Tree Induction. + Machine Learning 15, 321–329 (1994). + `_ .. 
_random_trees_embedding: diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index cd09008bfd0af..554c662058e56 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -48,7 +48,7 @@ # %% # Feature importance based on Mean Decrease in Impurity (MDI) -# ----------------------------------------------------- +# ----------------------------------------------------------- # Feature importances are provided by the fitted attribute # `feature_importances_` and they are computed as the mean and standard # deviation of accumulation of the impurity decrease within each tree. From d3ddaf8fd76a45f918801a928062d3f123f77409 Mon Sep 17 00:00:00 2001 From: robert-robison Date: Tue, 22 Jun 2021 09:40:26 -0400 Subject: [PATCH 52/53] Reference section edits --- doc/modules/ensemble.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/modules/ensemble.rst b/doc/modules/ensemble.rst index ed8bf2aebe2e7..b8363e8b34d18 100644 --- a/doc/modules/ensemble.rst +++ b/doc/modules/ensemble.rst @@ -355,6 +355,7 @@ MDI and the permutation feature importances are explored in: * :ref:`sphx_glr_auto_examples_inspection_plot_permutation_importance.py` .. topic:: References + .. [Strobl07] `Strobl, C., Boulesteix, AL., Zeileis, A. et al. Bias in random forest variable importance measures: Illustrations, sources and a solution. @@ -364,6 +365,9 @@ MDI and the permutation feature importances are explored in: Bias in Information-Based Measures in Decision Tree Induction. Machine Learning 15, 321–329 (1994). `_ + .. [L2014] G. Louppe, + "Understanding Random Forests: From Theory to Practice", + PhD Thesis, U. of Liege, 2014. .. 
_random_trees_embedding: From 280f9d9b39b348edfce142204e74e1f05c691908 Mon Sep 17 00:00:00 2001 From: Robert Robison <69172120+robert-robison@users.noreply.github.com> Date: Fri, 30 Jul 2021 20:43:53 -0400 Subject: [PATCH 53/53] Update examples/ensemble/plot_forest_importances.py Co-authored-by: Julien Jerphanion --- examples/ensemble/plot_forest_importances.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/ensemble/plot_forest_importances.py b/examples/ensemble/plot_forest_importances.py index 554c662058e56..ff344d8fa4d59 100644 --- a/examples/ensemble/plot_forest_importances.py +++ b/examples/ensemble/plot_forest_importances.py @@ -9,6 +9,7 @@ We show two strategies to estimate the feature importances: (i) the impurity-based feature importances and (ii) the permutation feature importances on out-of-bag (OOB) samples. + .. warning:: Impurity-based feature importances can be misleading for high cardinality features (many unique values). Check the documentation of the