Skip to content

Unbiased MDI-like feature importance measure for random forests #31279

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 30 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
c0e22ea
First working implementation of UFI, does not support multi output, h…
Apr 14, 2025
b1e9df8
Removed the normalization inherited from the old MDI to avoid instabi…
Apr 15, 2025
2a694b6
added multi output support
Apr 15, 2025
fd0abfb
removed redundant cross_impurity computations
Apr 15, 2025
ef9f48d
added mdi_oob
Apr 16, 2025
a225a42
redesigned ufi for better memory management
Apr 17, 2025
83f3880
removed a debug import
Apr 17, 2025
27618db
added mdi_oob, cleaned the code
Apr 18, 2025
5ad9636
better unified the code between ufi and mdi_oob
Apr 18, 2025
21d2e04
fixed a call oversight
Apr 18, 2025
8194d6e
fixed an error in mdi_oob computations
Apr 18, 2025
9e16a09
changed tests on feature_importances_ to use unbiased FI too
Apr 22, 2025
8991d79
add tests to check that the added methods coincide with the papers an…
Apr 23, 2025
a9d2983
added support for regression (only MSE split)
Apr 24, 2025
710d42c
added warning for unbiased feature importance in classification witho…
Apr 24, 2025
ddedf27
merged test_non_OOB_unbiased_feature_importances_class & _reg
Apr 24, 2025
1de98fc
Fixed a few mistakes so that ufi-regression matches feature_importance…
Apr 25, 2025
c7c5d76
Extended the tests on matching the paper values to regression
Apr 25, 2025
a44084d
Re-added tests on oob_score for dense X. They fail
Apr 25, 2025
082206c
revert a small change to a test
Apr 28, 2025
b028cb9
raise an error when calling unbiased feature importance with criterio…
Apr 28, 2025
dcb3106
adapted the tests to the previous commit
Apr 29, 2025
c61c8dc
Added log_loss ufi
Apr 29, 2025
d198f20
fixed the oob_score_ issue, simplified the self.value accesses
Apr 29, 2025
f2acf5f
updated api and tests for ufi with 'log_loss'
Apr 30, 2025
f41cf3f
divide by 2 ufi 'log_loss' and improve tests
Apr 30, 2025
af785d6
fix some linting
Apr 30, 2025
ccd4f18
fixed Cython linting
Apr 30, 2025
ac36aaa
added inline function for clarity and comments on available criteria
Apr 30, 2025
fda8349
Merge branch 'main' into main
ogrisel Apr 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 167 additions & 8 deletions sklearn/ensemble/_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,8 @@ def _compute_oob_predictions(self, X, y):
oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \
(n_samples, 1, n_outputs)
The OOB predictions.

oob_indices_per_tree
"""
# Prediction requires X to be in CSR format
if issparse(X):
Expand Down Expand Up @@ -601,7 +603,6 @@ def _compute_oob_predictions(self, X, y):
n_samples,
n_samples_bootstrap,
)

y_pred = self._get_oob_predictions(estimator, X[unsampled_indices, :])
oob_pred[unsampled_indices, ...] += y_pred
n_oob_pred[unsampled_indices, :] += 1
Expand Down Expand Up @@ -681,6 +682,87 @@ def feature_importances_(self):
all_importances = np.mean(all_importances, axis=0, dtype=np.float64)
return all_importances / np.sum(all_importances)

def _compute_unbiased_feature_importance_and_oob_predictions_per_tree(
self, tree, X, y, method, n_samples
):
n_samples_bootstrap = _get_n_samples_bootstrap(
n_samples,
self.max_samples,
)
oob_indices = _generate_unsampled_indices(
tree.random_state, n_samples, n_samples_bootstrap
)
X_test = X[oob_indices]
y_test = y[oob_indices]

oob_pred = np.zeros(
(n_samples, self.estimators_[0].tree_.max_n_classes, self.n_outputs_),
dtype=np.float64,
)
n_oob_pred = np.zeros((n_samples, self.n_outputs_), dtype=np.intp)

importances, y_pred = (
tree.compute_unbiased_feature_importance_and_oob_predictions(
X_test=X_test,
y_test=y_test,
method=method,
)
)
oob_pred[oob_indices, :, :] += y_pred
n_oob_pred[oob_indices, :] += 1
return (importances, oob_pred, n_oob_pred)

def _compute_unbiased_feature_importance_and_oob_predictions(
self, X, y, method="ufi"
): # "mdi_oob"
check_is_fitted(self)
X = self._validate_X_predict(X)
y = np.asarray(y)
if y.ndim == 1:
y = y.reshape(-1, 1)

n_samples, n_features = X.shape
max_n_classes = self.estimators_[0].tree_.max_n_classes
results = Parallel(
n_jobs=self.n_jobs, prefer="threads", return_as="generator_unordered"
)(
delayed(
self._compute_unbiased_feature_importance_and_oob_predictions_per_tree
)(tree, X, y, method, n_samples)
for tree in self.estimators_
if tree.tree_.node_count > 1
)

importances = np.zeros(n_features, dtype=np.float64)
oob_pred = np.zeros(
(n_samples, max_n_classes, self.n_outputs_), dtype=np.float64
)
n_oob_pred = np.zeros((n_samples, self.n_outputs_), dtype=np.intp)

for importances_i, oob_pred_i, n_oob_pred_i in results:
oob_pred += oob_pred_i
n_oob_pred += n_oob_pred_i
importances += importances_i

importances /= self.n_estimators

for k in range(self.n_outputs_):
if (n_oob_pred == 0).any():
warn(
(
"Some inputs do not have OOB scores. This probably means "
"too few trees were used to compute any reliable OOB "
"estimates."
),
UserWarning,
)
n_oob_pred[n_oob_pred == 0] = 1
oob_pred[..., k] /= n_oob_pred[..., [k]]

if not importances.any():
return np.zeros(self.n_features_in_, dtype=np.float64), oob_pred
return importances / importances.sum(), oob_pred

def _get_estimators_indices(self):
# Get drawn indices along both sample and feature axes
for tree in self.estimators_:
Expand Down Expand Up @@ -814,18 +896,57 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None):
scoring_function : callable, default=None
Scoring function for OOB score. Defaults to `accuracy_score`.
"""
self.oob_decision_function_ = super()._compute_oob_predictions(X, y)
if self.oob_decision_function_.shape[-1] == 1:
# drop the n_outputs axis if there is a single output
self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1)

if scoring_function is None:
scoring_function = accuracy_score

ufi_feature_importances, self.oob_decision_function_ = (
self._compute_unbiased_feature_importance_and_oob_predictions(
X, y, method="ufi"
)
)
mdi_oob_feature_importances, _ = (
self._compute_unbiased_feature_importance_and_oob_predictions(
X, y, method="mdi_oob"
)
)
if self.criterion == "gini":
self._ufi_feature_importances_ = ufi_feature_importances
self._mdi_oob_feature_importances_ = mdi_oob_feature_importances
elif self.criterion in ["log_loss", "entropy"]:
self._ufi_feature_importances_ = ufi_feature_importances
# mdi_oob does not support entropy yet

if self.oob_decision_function_.shape[-1] == 1:
# drop the n_outputs axis if there is a single output
self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1)

self.oob_score_ = scoring_function(
y, np.argmax(self.oob_decision_function_, axis=1)
)

@property
def ufi_feature_importances_(self):
check_is_fitted(self)
if self.criterion in ["gini", "log_loss", "entropy"]:
return self._ufi_feature_importances_
else:
raise AttributeError(
"ufi feature importance only available for"
" classification with split criterion 'gini', 'log_loss' or 'entropy'."
)

@property
def mdi_oob_feature_importances_(self):
check_is_fitted(self)
if self.criterion != "gini":
raise AttributeError(
"mdi_oob feature importance only available for"
" classification with split criterion 'gini'"
)
else:
return self._mdi_oob_feature_importances_

def _validate_y_class_weight(self, y):
check_classification_targets(y)

Expand Down Expand Up @@ -1121,16 +1242,54 @@ def _set_oob_score_and_attributes(self, X, y, scoring_function=None):
scoring_function : callable, default=None
Scoring function for OOB score. Defaults to `r2_score`.
"""
self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1)
if scoring_function is None:
scoring_function = r2_score

ufi_feature_importances, self.oob_prediction_ = (
self._compute_unbiased_feature_importance_and_oob_predictions(
X, y, method="ufi"
)
)
mdi_oob_feature_importances, _ = (
self._compute_unbiased_feature_importance_and_oob_predictions(
X, y, method="mdi_oob"
)
)
if self.criterion == "squared_error":
self._ufi_feature_importances = ufi_feature_importances
self._mdi_oob_feature_importances = mdi_oob_feature_importances

if self.oob_prediction_.shape[-1] == 1:
# drop the n_outputs axis if there is a single output
self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1)

if scoring_function is None:
scoring_function = r2_score
# Drop the n_classes axis of size 1 in regression
self.oob_prediction_ = self.oob_prediction_.squeeze(axis=1)

self.oob_score_ = scoring_function(y, self.oob_prediction_)

@property
def ufi_feature_importances_(self):
check_is_fitted(self)
if self.criterion != "squared_error":
raise AttributeError(
"Unbiased feature importance only available for"
" regression with split criterion MSE"
)
else:
return self._ufi_feature_importances

@property
def mdi_oob_feature_importances_(self):
check_is_fitted(self)
if self.criterion != "squared_error":
raise AttributeError(
"Unbiased feature importance only available for"
" regression with split criterion MSE"
)
else:
return self._mdi_oob_feature_importances

def _compute_partial_dependence_recursion(self, grid, target_features):
"""Fast partial dependence computation.

Expand Down
Loading
Loading