
[WIP] Performance comparison (ROC) plots for anomaly detection methods #16378


Closed · wants to merge 15 commits
106 changes: 106 additions & 0 deletions benchmarks/bench_lof_if.py
@@ -0,0 +1,106 @@
"""
=====================
LOF and IF benchmarks
=====================
A test of LocalOutlierFactor (LOF) and IsolationForest (IF) on classical
anomaly detection datasets. Note that LocalOutlierFactor is not meant to
predict on a test set and its performance is assessed in an outlier detection
context:
1. The model is trained on the whole dataset which is assumed to contain
outliers.
2. The ROC curve is computed on the same dataset using the knowledge of the
labels.
In this context there is no need to shuffle the dataset because the model
is trained and tested on the whole dataset. The randomness of this benchmark
is only caused by the random selection of anomalies in the SA dataset.
"""

from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
from sklearn.preprocessing import LabelBinarizer

print(__doc__)

random_state = 1 # to control the random selection of anomalies in SA

# datasets
datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
# outlier detection models
models = [('LOF', LocalOutlierFactor(n_neighbors=20, contamination='auto')),
          ('IF', IsolationForest(n_jobs=-1, random_state=random_state,
                                 behaviour="new", contamination='auto'))]

plt.figure(figsize=(5, len(datasets) * 3))
for dataset_idx, dataset_name in enumerate(datasets):
    plt.subplot(len(datasets), 1, dataset_idx + 1)
    # loading and vectorization
    print('loading data:', dataset_name)
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'forestcover':
        dataset = fetch_covtype()
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)

    print('vectorizing data')

    if dataset_name == 'SF':
        # one-hot encode the categorical feature in column 1
        lb = LabelBinarizer()
        x1 = lb.fit_transform(X[:, 1].astype(str))
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != b'normal.').astype(int)

    if dataset_name == 'SA':
        # one-hot encode the categorical features in columns 1 to 3
        lb = LabelBinarizer()
        x1 = lb.fit_transform(X[:, 1].astype(str))
        x2 = lb.fit_transform(X[:, 2].astype(str))
        x3 = lb.fit_transform(X[:, 3].astype(str))
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != b'normal.').astype(int)

    if dataset_name in ('http', 'smtp'):
        y = (y != b'normal.').astype(int)

    X = X.astype(float)

    print('Estimator processing...')
    for model_name, model in models:
        tstart = time()
        model.fit(X)
        fit_time = time() - tstart
        if model_name == 'LOF':
            # negative_outlier_factor_ is lower for abnormal points;
            # negate it so that a higher scoring means more abnormal
            scoring = -model.negative_outlier_factor_
        if model_name == 'IF':
            # the model is already fitted above; negate decision_function
            # so that a higher scoring means more abnormal
            scoring = -model.decision_function(X)

        fpr, tpr, thresholds = roc_curve(y, scoring)
        AUC = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1,
                 label='%s: ROC for %s (area = %0.3f, train-time: %0.2fs)'
                       % (model_name, dataset_name, AUC, fit_time))

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.legend(loc="lower right")
    if dataset_idx == 0:
        plt.title('Receiver operating characteristic')
    if dataset_idx == len(datasets) - 1:
        plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

plt.show()
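
A note on the scoring convention used in the benchmark above: both LOF's negative_outlier_factor_ and IsolationForest's decision_function follow scikit-learn's "higher value means more normal" convention, while roc_curve expects higher scores for the positive (anomalous, y == 1) class, hence the negations. A minimal, self-contained sketch with toy data (the dataset and parameters below are illustrative only, not part of the benchmark):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(0)
X = np.r_[rng.normal(size=(100, 2)),          # inliers
          rng.uniform(-6, 6, size=(10, 2))]   # outliers
y = np.r_[np.zeros(100), np.ones(10)]         # 1 marks an anomaly

lof = LocalOutlierFactor(n_neighbors=20, contamination='auto').fit(X)
# negative_outlier_factor_ is close to -1 for normal points and much lower
# for outliers, so it is negated to get "higher = more anomalous" scores
scoring = -lof.negative_outlier_factor_
fpr, tpr, _ = roc_curve(y, scoring)
print("LOF AUC on the training data: %0.3f" % auc(fpr, tpr))
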
106 changes: 87 additions & 19 deletions examples/plot_anomaly_comparison.py
@@ -3,9 +3,10 @@
Comparing anomaly detection algorithms for outlier detection on toy datasets
============================================================================

This example shows characteristics of different anomaly detection algorithms
on 2D datasets. Datasets contain one or two modes (regions of high density)
to illustrate the ability of algorithms to cope with multimodal data.
This example shows characteristics and compares the performance of different
anomaly detection algorithms on 2D datasets. Datasets contain one or two
modes (regions of high density) to illustrate the ability of algorithms to
cope with multimodal data.

For each dataset, 15% of samples are generated as random uniform noise. This
proportion is the value given to the nu parameter of the OneClassSVM and the
@@ -14,6 +15,13 @@
except for Local Outlier Factor (LOF) as it has no predict method to be applied
on new data when it is used for outlier detection.

Two performance measures are reported for each algorithm. First,
:func:`sklearn.metrics.accuracy_score` measures the agreement between the
ground truth labels and the predicted labels. Second,
:func:`sklearn.metrics.roc_auc_score` computes the area under the receiver
operating characteristic (ROC) curve, or AUC. The last column of the figure
shows the ROC curves of all algorithms except LOF, which is excluded for the
reason given above; a short sketch of both metrics follows this docstring.

The :class:`sklearn.svm.OneClassSVM` is known to be sensitive to outliers and
thus does not perform very well for outlier detection. This estimator is best
suited for novelty detection when the training set is not contaminated by
@@ -51,10 +59,6 @@
the problem is completely unsupervised so model selection can be a challenge.
"""

# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Albert Thomas <albert.thomas@telecom-paristech.fr>
# License: BSD 3 clause

import time

import numpy as np
@@ -67,6 +71,10 @@
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

print(__doc__)

matplotlib.rcParams['contour.negative_linestyle'] = 'solid'
@@ -81,9 +89,9 @@
anomaly_algorithms = [
("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
gamma=0.1)),
gamma="scale")),
("Isolation Forest", IsolationForest(contamination=outliers_fraction,
random_state=42)),
behaviour="new", random_state=42)),
("Local Outlier Factor", LocalOutlierFactor(
n_neighbors=35, contamination=outliers_fraction))]

@@ -96,39 +104,61 @@
**blobs_params)[0],
make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
**blobs_params)[0],
4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
4. * (make_moons(n_samples=n_inliers, noise=.05, random_state=0)[0] -
np.array([0.5, 0.25])),
14. * (np.random.RandomState(42).rand(n_samples, 2) - 0.5)]
14. * (np.random.RandomState(42).rand(n_inliers, 2) - 0.5)]

# ground truth labels: +1 for inliers, -1 for outliers
y_true = np.concatenate([np.ones(n_inliers), -np.ones(n_outliers)], axis=0)

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 150),
np.linspace(-7, 7, 150))

plt.figure(figsize=(len(anomaly_algorithms) * 2 + 3, 12.5))
plt.figure(figsize=((len(anomaly_algorithms) + 1) * 2.2 + 4,
                    len(datasets) * 2.2 + 1))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)

plot_num = 1
rng = np.random.RandomState(42)

for i_dataset, X in enumerate(datasets):
# Add outliers
X = np.concatenate([X, rng.uniform(low=-6, high=6,
size=(n_outliers, 2))], axis=0)


# per-algorithm ROC data: AUC, FPR, TPR, and the index of the threshold
# closest to the predict() cutoff (decision_function == 0)
list_AUC = []
list_fpr = []
list_tpr = []
list_thresh = []

algo_index = 0
for name, algorithm in anomaly_algorithms:
t0 = time.time()
algorithm.fit(X)
t1 = time.time()
plt.subplot(len(datasets), len(anomaly_algorithms), plot_num)
if i_dataset == 0:
plt.title(name, size=18)


# fit the data and tag outliers
if name == "Local Outlier Factor":
y_pred = algorithm.fit_predict(X)
else:
y_pred = algorithm.fit(X).predict(X)

# store the ROC data; LOF is skipped because it does not implement
# decision_function when used for outlier detection
if name != "Local Outlier Factor":
    probas_ = algorithm.decision_function(X)  # already fitted above
    AUC = roc_auc_score(y_true, probas_)
    fpr, tpr, thresholds = roc_curve(y_true, probas_)
    # index of the threshold closest to the predict() cutoff (0)
    thresh_index = np.argmin(np.abs(thresholds))
    list_AUC.append(AUC)
    list_fpr.append(fpr)
    list_tpr.append(tpr)
    list_thresh.append(thresh_index)

acc = accuracy_score(y_true, y_pred)  # accuracy
plt.subplot(len(datasets), len(anomaly_algorithms)+1, plot_num)
if i_dataset == 0:
plt.title(str(algo_index + 1) + ") " + name, size=15, weight="bold")

# plot the levels lines and the points
if name != "Local Outlier Factor": # LOF does not implement predict
Expand All @@ -143,9 +173,47 @@
plt.ylim(-7, 7)
plt.xticks(())
plt.yticks(())

plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')

# show prediction accuracy
bbox_props = dict(boxstyle="square", fc="white", ec="none", alpha=0.8)
plt.text(.03, .89, ("acc %.3f" % acc).lstrip("0"),
transform=plt.gca().transAxes, size=15,
horizontalalignment="left", bbox=bbox_props)
plot_num += 1

algo_index += 1

# plot the ROC curves and show AUC scores
plt.subplot(len(datasets), len(anomaly_algorithms) + 1, plot_num)

if i_dataset == 0:
plt.title("ROC", size=15, color="black", weight="bold")

# legend entry for the "x" markers: each marks the ROC point whose
# threshold is closest to 0, i.e. the cutoff used by predict()
plt.scatter([], [], marker="x", color="black", label="thresholds")

for algo_index in range(len(anomaly_algorithms)-1): # exclude LOF

if i_dataset == 0:
plt.plot(list_fpr[algo_index], list_tpr[algo_index],
label="algo " + str(algo_index + 1) + ")"
+ (" AUC %.2f" % list_AUC[algo_index]).lstrip("0"))
else:
plt.plot(list_fpr[algo_index], list_tpr[algo_index],
label=str(algo_index + 1) + ")"
+ (" %.2f" % list_AUC[algo_index]).lstrip("0"))

plt.scatter(
list_fpr[algo_index][list_thresh[algo_index]],
list_tpr[algo_index][list_thresh[algo_index]],
s=40, marker="x", color="black")

plt.plot(np.array([0, 1]), np.array([0, 1]), linestyle="--", color="black")
plt.legend()
plt.tick_params(labelleft=False, labelbottom=False, direction="in")
plot_num += 1

plt.show()
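
A remark on the "x" markers added to the ROC curves: for the detectors used here, predict() flags a sample as an outlier exactly when decision_function is below 0, so the ROC point whose threshold is closest to 0 is the operating point at which the accuracy shown in each panel is measured. A small sketch of that relationship with toy data (IsolationForest as a stand-in, illustrative parameters):

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve

rng = np.random.RandomState(0)
X = np.r_[rng.normal(size=(200, 2)),                   # inliers
          rng.uniform(low=-6, high=6, size=(30, 2))]   # outliers
y_true = np.r_[np.ones(200), -np.ones(30)]             # +1 inlier, -1 outlier

clf = IsolationForest(contamination=0.13, random_state=0).fit(X)
scores = clf.decision_function(X)      # predict() thresholds this at 0
fpr, tpr, thresholds = roc_curve(y_true, scores)
i = np.argmin(np.abs(thresholds))      # ROC point closest to the 0 cutoff
print("operating point of predict(): FPR=%.2f, TPR=%.2f (threshold=%.3f)"
      % (fpr[i], tpr[i], thresholds[i]))
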