diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 5d8ab250e8e65..b0475b809ecb1 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -131,6 +131,12 @@ sections hereunder. :class:`neighbors.LocalOutlierFactor` and :class:`covariance.EllipticEnvelope`. + * See :ref:`sphx_glr_auto_examples_miscellaneous_plot_outlier_detection_bench.py` + for an example showing how to evaluate outlier detection estimators, + the :class:`neighbors.LocalOutlierFactor` and the + :class:`ensemble.IsolationForest`, using ROC curves from + :class:`metrics.RocCurveDisplay`. + Novelty Detection ================= @@ -310,6 +316,7 @@ allows you to add more trees to an already fitted model:: * Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest." Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on. +.. _local_outlier_factor: Local Outlier Factor -------------------- diff --git a/examples/miscellaneous/plot_outlier_detection_bench.py b/examples/miscellaneous/plot_outlier_detection_bench.py new file mode 100644 index 0000000000000..f68b6f747208f --- /dev/null +++ b/examples/miscellaneous/plot_outlier_detection_bench.py @@ -0,0 +1,193 @@ +""" +========================================== +Evaluation of outlier detection estimators +========================================== + +This example benchmarks outlier detection algorithms, :ref:`local_outlier_factor` +(LOF) and :ref:`isolation_forest` (IForest), using ROC curves on +classical anomaly detection datasets. The algorithm performance +is assessed in an outlier detection context: + +1. The algorithms are trained on the whole dataset which is assumed to +contain outliers. + +2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed +on the same dataset using the knowledge of the labels. + +""" + +# Author: Pharuj Rajborirug +# License: BSD 3 clause + +print(__doc__) + +# %% +# Define a data preprocessing function +# ---------------------------------- +# +# The example uses real-world datasets available in +# :class:`sklearn.datasets` and the sample size of some datasets is reduced +# to speed up computation. After the data preprocessing, the datasets' targets +# will have two classes, 0 representing inliers and 1 representing outliers. +# The `preprocess_dataset` function returns data and target. + +import numpy as np +from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml +from sklearn.preprocessing import LabelBinarizer +import pandas as pd + +rng = np.random.RandomState(42) + + +def preprocess_dataset(dataset_name): + + # loading and vectorization + print(f"Loading {dataset_name} data") + if dataset_name in ["http", "smtp", "SA", "SF"]: + dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng) + X = dataset.data + y = dataset.target + lb = LabelBinarizer() + + if dataset_name == "SF": + idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False) + X = X[idx] # reduce the sample size + y = y[idx] + x1 = lb.fit_transform(X[:, 1].astype(str)) + X = np.c_[X[:, :1], x1, X[:, 2:]] + elif dataset_name == "SA": + idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False) + X = X[idx] # reduce the sample size + y = y[idx] + x1 = lb.fit_transform(X[:, 1].astype(str)) + x2 = lb.fit_transform(X[:, 2].astype(str)) + x3 = lb.fit_transform(X[:, 3].astype(str)) + X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]] + y = (y != b"normal.").astype(int) + if dataset_name == "forestcover": + dataset = fetch_covtype() + X = dataset.data + y = dataset.target + idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False) + X = X[idx] # reduce the sample size + y = y[idx] + + # inliers are those with attribute 2 + # outliers are those with attribute 4 + s = (y == 2) + (y == 4) + X = X[s, :] + y = y[s] + y = (y != 2).astype(int) + if dataset_name in ["glass", "wdbc", "cardiotocography"]: + dataset = fetch_openml(name=dataset_name, version=1, as_frame=False) + X = dataset.data + y = dataset.target + + if dataset_name == "glass": + s = y == "tableware" + y = s.astype(int) + if dataset_name == "wdbc": + s = y == "2" + y = s.astype(int) + X_mal, y_mal = X[s], y[s] + X_ben, y_ben = X[~s], y[~s] + + # downsampled to 39 points (9.8% outliers) + idx = rng.choice(y_mal.shape[0], 39, replace=False) + X_mal2 = X_mal[idx] + y_mal2 = y_mal[idx] + X = np.concatenate((X_ben, X_mal2), axis=0) + y = np.concatenate((y_ben, y_mal2), axis=0) + if dataset_name == "cardiotocography": + s = y == "3" + y = s.astype(int) + # 0 represents inliers, and 1 represents outliers + y = pd.Series(y, dtype="category") + return (X, y) + + +# %% +# Define an outlier prediction function +# ------------------------------------- +# There is no particular reason to choose algorithms +# :class:`~sklearn.neighbors.LocalOutlierFactor` and +# :class:`~sklearn.ensemble.IsolationForest`. The goal is to show that +# different algorithm performs well on different datasets. The following +# `compute_prediction` function returns average outlier score of X. + + +from sklearn.neighbors import LocalOutlierFactor +from sklearn.ensemble import IsolationForest + + +def compute_prediction(X, model_name): + + print(f"Computing {model_name} prediction...") + if model_name == "LOF": + clf = LocalOutlierFactor(n_neighbors=20, contamination="auto") + clf.fit(X) + y_pred = clf.negative_outlier_factor_ + if model_name == "IForest": + clf = IsolationForest(random_state=rng, contamination="auto") + y_pred = clf.fit(X).decision_function(X) + return y_pred + + +# %% +# Plot and interpret results +# -------------------------- +# +# The algorithm performance relates to how good the true positive rate (TPR) +# is at low value of the false positive rate (FPR). The best algorithms +# have the curve on the top-left of the plot and the area under curve (AUC) +# close to 1. The diagonal dashed line represents a random classification +# of outliers and inliers. + + +import math +import matplotlib.pyplot as plt +from sklearn.metrics import RocCurveDisplay + +datasets_name = [ + "http", + "smtp", + "SA", + "SF", + "forestcover", + "glass", + "wdbc", + "cardiotocography", +] + +models_name = [ + "LOF", + "IForest", +] + +# plotting parameters +cols = 2 +linewidth = 1 +pos_label = 0 # mean 0 belongs to positive class +rows = math.ceil(len(datasets_name) / cols) + +fig, axs = plt.subplots(rows, cols, figsize=(10, rows * 3)) + +for i, dataset_name in enumerate(datasets_name): + (X, y) = preprocess_dataset(dataset_name=dataset_name) + + for model_name in models_name: + y_pred = compute_prediction(X, model_name=model_name) + display = RocCurveDisplay.from_predictions( + y, + y_pred, + pos_label=pos_label, + name=model_name, + linewidth=linewidth, + ax=axs[i // cols, i % cols], + ) + axs[i // cols, i % cols].plot([0, 1], [0, 1], linewidth=linewidth, linestyle=":") + axs[i // cols, i % cols].set_title(dataset_name) + axs[i // cols, i % cols].set_xlabel("False Positive Rate") + axs[i // cols, i % cols].set_ylabel("True Positive Rate") +plt.tight_layout(pad=2.0) # spacing between subplots +plt.show()