
[MRG] Combining LOF and Isolation benchmarks #16606


Merged: 57 commits, merged on Apr 7, 2022
Commits (57)
a131b94
add bench files
May 14, 2020
4c47c8a
add more outliers datasets
May 28, 2020
e5f248f
update python file
May 28, 2020
c01b3e1
shorten the .py file
May 28, 2020
9ca1e30
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn
Jun 20, 2020
2273428
remove the datasets
Jun 20, 2020
cbc7cef
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn
Jun 21, 2020
7791124
update the comment
Jul 25, 2020
43d0a83
update doc
Jul 29, 2020
627ee78
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn
Jul 29, 2020
0142951
update doc2
Jul 29, 2020
725e447
update doc2
Jul 29, 2020
abf1772
Merge branch 'master' of https://github.com/scikit-learn/scikit-learn
Aug 9, 2020
4b0f9e0
Merge remote-tracking branch 'upstream/master'
Sep 4, 2020
05eee61
add the examples
Sep 4, 2020
c28ba1a
add examples ver.2
Sep 4, 2020
7690bcc
Merge remote-tracking branch 'upstream/master'
Sep 4, 2020
f2ce9bc
debugged
Sep 4, 2020
f49c026
Merge remote-tracking branch 'upstream/master'
Sep 23, 2020
a04a583
Merge remote-tracking branch 'upstream/master'
Oct 5, 2020
3222b10
change the file name
Oct 5, 2020
ac9c1de
fix error
Oct 5, 2020
05cf7f3
update doc/modules/outlier_detection.rst
Oct 5, 2020
b3aa859
plot rendering
Oct 5, 2020
1824279
Merge remote-tracking branch 'upstream/master'
Oct 5, 2020
681f452
adjust plots
Oct 5, 2020
efda8d5
Merge remote-tracking branch 'upstream/master'
Oct 6, 2020
205edf0
update .py
Oct 6, 2020
ccc706b
Merge remote-tracking branch 'upstream/master'
Oct 7, 2020
62df05d
update plot's axis
Oct 7, 2020
27c571e
Merge remote-tracking branch 'upstream/main'
MaiRajborirug Mar 24, 2021
012d707
open_ml fixed the certificate issue
MaiRajborirug Mar 25, 2021
fbc81d9
Merge remote-tracking branch 'upstream/main'
MaiRajborirug Apr 2, 2021
26cf843
update text
MaiRajborirug Apr 2, 2021
4697185
Merge remote-tracking branch 'upstream/main'
MaiRajborirug Nov 6, 2021
73cd807
Merge remote-tracking branch 'upstream/main'
MaiRajborirug Nov 6, 2021
643ae61
synchronize
MaiRajborirug Nov 6, 2021
47346ad
Merge branch 'main' of https://github.com/scikit-learn/scikit-learn
MaiRajborirug Nov 6, 2021
638c1cc
up to date 2
MaiRajborirug Nov 6, 2021
272e02e
up to date 3
MaiRajborirug Nov 6, 2021
1076f27
up to date 4
MaiRajborirug Nov 6, 2021
c18d015
up to date 5
MaiRajborirug Nov 6, 2021
1b40c7e
Merge branch 'main' into master
MaiRajborirug Mar 30, 2022
7903c94
follow suggestions on notebook style
MaiRajborirug Apr 6, 2022
b4b8ed1
Merge branch 'main' into master
jeremiedbb Apr 6, 2022
2dc12ac
Circleci solved
MaiRajborirug Apr 6, 2022
1d00170
Apply RocCurveDisplay.from_estimator
MaiRajborirug Apr 7, 2022
59a1501
fixed Linting
MaiRajborirug Apr 7, 2022
9888451
fix Linting2
MaiRajborirug Apr 7, 2022
a5c24fd
fix linting3
MaiRajborirug Apr 7, 2022
3b3d902
fix linting4
MaiRajborirug Apr 7, 2022
5342f13
create 2 functions + more explicit
MaiRajborirug Apr 7, 2022
23bead6
Fix docs and shorten code
MaiRajborirug Apr 7, 2022
73c5fac
Update examples/neighbors/plot_lof_outlier_detection.py
jeremiedbb Apr 7, 2022
e4e745b
Update examples/miscellaneous/plot_outlier_detection_bench.py
jeremiedbb Apr 7, 2022
02eec87
Update examples/miscellaneous/plot_outlier_detection_bench.py
jeremiedbb Apr 7, 2022
4924420
revert doc + shorten code
MaiRajborirug Apr 7, 2022
7 changes: 7 additions & 0 deletions doc/modules/outlier_detection.rst
@@ -131,6 +131,12 @@ sections hereunder.
  :class:`neighbors.LocalOutlierFactor` and
  :class:`covariance.EllipticEnvelope`.

* See :ref:`sphx_glr_auto_examples_miscellaneous_plot_outlier_detection_bench.py`
  for an example showing how to evaluate outlier detection estimators,
  the :class:`neighbors.LocalOutlierFactor` and the
  :class:`ensemble.IsolationForest`, using ROC curves from
  :class:`metrics.RocCurveDisplay`.

Novelty Detection
=================

@@ -310,6 +316,7 @@ allows you to add more trees to an already fitted model::
* Liu, Fei Tony, Ting, Kai Ming and Zhou, Zhi-Hua. "Isolation forest."
Data Mining, 2008. ICDM'08. Eighth IEEE International Conference on.

.. _local_outlier_factor:

Local Outlier Factor
--------------------
193 changes: 193 additions & 0 deletions examples/miscellaneous/plot_outlier_detection_bench.py
@@ -0,0 +1,193 @@
"""
==========================================
Evaluation of outlier detection estimators
==========================================

This example benchmarks outlier detection algorithms, :ref:`local_outlier_factor`
(LOF) and :ref:`isolation_forest` (IForest), using ROC curves on
classical anomaly detection datasets. The algorithm performance
is assessed in an outlier detection context:

1. The algorithms are trained on the whole dataset, which is assumed to
   contain outliers.

2. The ROC curve from :class:`~sklearn.metrics.RocCurveDisplay` is computed
   on the same dataset using the knowledge of the labels.

"""

# Author: Pharuj Rajborirug <pharuj.ra@kmitl.ac.th>
# License: BSD 3 clause

print(__doc__)

# %%
# Define a data preprocessing function
# ------------------------------------
#
# The example uses real-world datasets available in
# :mod:`sklearn.datasets`, and the sample size of some datasets is reduced
# to speed up computation. After the data preprocessing, the datasets' targets
# have two classes, 0 representing inliers and 1 representing outliers.
# The `preprocess_dataset` function returns the data matrix `X` and the
# binary target `y`.

import numpy as np
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
from sklearn.preprocessing import LabelBinarizer
import pandas as pd

rng = np.random.RandomState(42)


def preprocess_dataset(dataset_name):
    # loading and vectorization
    print(f"Loading {dataset_name} data")
    if dataset_name in ["http", "smtp", "SA", "SF"]:
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True, random_state=rng)
        X = dataset.data
        y = dataset.target
        lb = LabelBinarizer()

        if dataset_name == "SF":
            idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)
            X = X[idx]  # reduce the sample size
            y = y[idx]
            x1 = lb.fit_transform(X[:, 1].astype(str))
            X = np.c_[X[:, :1], x1, X[:, 2:]]
        elif dataset_name == "SA":
            idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)
            X = X[idx]  # reduce the sample size
            y = y[idx]
            x1 = lb.fit_transform(X[:, 1].astype(str))
            x2 = lb.fit_transform(X[:, 2].astype(str))
            x3 = lb.fit_transform(X[:, 3].astype(str))
            X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != b"normal.").astype(int)
    if dataset_name == "forestcover":
        dataset = fetch_covtype()
        X = dataset.data
        y = dataset.target
        idx = rng.choice(X.shape[0], int(X.shape[0] * 0.1), replace=False)
        X = X[idx]  # reduce the sample size
        y = y[idx]

        # inliers are those with attribute 2
        # outliers are those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)
    if dataset_name in ["glass", "wdbc", "cardiotocography"]:
        dataset = fetch_openml(name=dataset_name, version=1, as_frame=False)
        X = dataset.data
        y = dataset.target

        if dataset_name == "glass":
            s = y == "tableware"
            y = s.astype(int)
        if dataset_name == "wdbc":
            s = y == "2"
            y = s.astype(int)
            X_mal, y_mal = X[s], y[s]
            X_ben, y_ben = X[~s], y[~s]

            # downsampled to 39 points (9.8% outliers)
            idx = rng.choice(y_mal.shape[0], 39, replace=False)
            X_mal2 = X_mal[idx]
            y_mal2 = y_mal[idx]
            X = np.concatenate((X_ben, X_mal2), axis=0)
            y = np.concatenate((y_ben, y_mal2), axis=0)
        if dataset_name == "cardiotocography":
            s = y == "3"
            y = s.astype(int)
    # 0 represents inliers, and 1 represents outliers
    y = pd.Series(y, dtype="category")
    return (X, y)
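

# %%
# As a quick sanity check (a small usage sketch, not part of the benchmark
# itself), `preprocess_dataset` can be called on one of the smaller datasets
# to inspect the resulting shape and outlier fraction. The `X_glass` and
# `y_glass` names below are only illustrative.

X_glass, y_glass = preprocess_dataset(dataset_name="glass")
print(f"glass: X shape {X_glass.shape}, {y_glass.astype(int).mean():.1%} outliers")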


# %%
# Define an outlier prediction function
# -------------------------------------
# There is no particular reason to choose the
# :class:`~sklearn.neighbors.LocalOutlierFactor` and
# :class:`~sklearn.ensemble.IsolationForest` algorithms; the goal is to show
# that different algorithms perform well on different datasets. The following
# `compute_prediction` function returns the anomaly scores of the samples in
# `X`. For both models, a larger score means the sample is more likely an
# inlier.


from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest


def compute_prediction(X, model_name):
    print(f"Computing {model_name} prediction...")
    if model_name == "LOF":
        clf = LocalOutlierFactor(n_neighbors=20, contamination="auto")
        clf.fit(X)
        y_pred = clf.negative_outlier_factor_
    if model_name == "IForest":
        clf = IsolationForest(random_state=rng, contamination="auto")
        y_pred = clf.fit(X).decision_function(X)
    return y_pred
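

# %%
# As a minimal illustration (synthetic data, not one of the benchmark
# datasets), `compute_prediction` can be applied to a toy blob with a few
# uniform outliers appended; it returns one score per sample:

from sklearn.datasets import make_blobs

# one Gaussian cluster plus 10 uniformly scattered outliers (illustrative only)
X_toy, _ = make_blobs(n_samples=200, centers=1, random_state=0)
X_toy = np.vstack([X_toy, rng.uniform(low=-10, high=10, size=(10, 2))])
toy_scores = compute_prediction(X_toy, model_name="IForest")
print(f"toy data: {toy_scores.shape[0]} scores, lowest score {toy_scores.min():.2f}")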


# %%
# Plot and interpret results
# --------------------------
#
# The performance of an algorithm relates to how good the true positive rate
# (TPR) is at a low value of the false positive rate (FPR). The best
# algorithms have a curve in the top-left corner of the plot and an area
# under the curve (AUC) close to 1. The diagonal dashed line represents a
# random classification of outliers and inliers.


import math
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay

datasets_name = [
"http",
"smtp",
"SA",
"SF",
"forestcover",
"glass",
"wdbc",
"cardiotocography",
]

models_name = [
"LOF",
"IForest",
]

# plotting parameters
cols = 2
linewidth = 1
pos_label = 0  # 0 (inliers) is treated as the positive class
rows = math.ceil(len(datasets_name) / cols)

fig, axs = plt.subplots(rows, cols, figsize=(10, rows * 3))

for i, dataset_name in enumerate(datasets_name):
    (X, y) = preprocess_dataset(dataset_name=dataset_name)

    for model_name in models_name:
        y_pred = compute_prediction(X, model_name=model_name)
        display = RocCurveDisplay.from_predictions(
            y,
            y_pred,
            pos_label=pos_label,
            name=model_name,
            linewidth=linewidth,
            ax=axs[i // cols, i % cols],
        )
    axs[i // cols, i % cols].plot([0, 1], [0, 1], linewidth=linewidth, linestyle=":")
    axs[i // cols, i % cols].set_title(dataset_name)
    axs[i // cols, i % cols].set_xlabel("False Positive Rate")
    axs[i // cols, i % cols].set_ylabel("True Positive Rate")
plt.tight_layout(pad=2.0)  # spacing between subplots
plt.show()
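
# %%
# The area under each ROC curve can also be reported as a single number with
# :func:`~sklearn.metrics.roc_auc_score`. This is a complementary summary
# sketch: it recomputes the scores for one dataset rather than reusing the
# curves plotted above.

from sklearn.metrics import roc_auc_score

X_wdbc, y_wdbc = preprocess_dataset(dataset_name="wdbc")
for model_name in models_name:
    y_pred = compute_prediction(X_wdbc, model_name=model_name)
    # inliers (label 0) are the positive class, matching `pos_label` above
    auc = roc_auc_score(y_wdbc.astype(int) == 0, y_pred)
    print(f"wdbc - {model_name}: ROC AUC = {auc:.3f}")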