
[WIP] Performance comparison (ROC) plots for anomaly detection methods #16378


Closed · wants to merge 15 commits
106 changes: 106 additions & 0 deletions benchmarks/bench_lof_if.py
@@ -0,0 +1,106 @@
"""
=====================
LOF and IF benchmarks
=====================
A test of LocalOutlierFactor (LOF) and IsolationForest (IF) on classical
anomaly detection datasets. Note that LocalOutlierFactor is not meant to
predict on a test set and its performance is assessed in an outlier detection
context:
1. The model is trained on the whole dataset which is assumed to contain
outliers.
2. The ROC curve is computed on the same dataset using the knowledge of the
labels.
In this context there is no need to shuffle the dataset because the model
is trained and tested on the whole dataset. The randomness of this benchmark
is only caused by the random selection of anomalies in the SA dataset.
"""

from time import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
from sklearn.preprocessing import LabelBinarizer

print(__doc__)

random_state = 1 # to control the random selection of anomalies in SA

# datasets
datasets = ['http', 'smtp', 'SA', 'SF', 'forestcover']
# outlier detection models
models = [('LOF', LocalOutlierFactor(n_neighbors=20, contamination='auto')),
          ('IF', IsolationForest(n_jobs=-1, random_state=random_state,
                                 behaviour="new", contamination='auto'))]

plt.figure(figsize=(5, len(datasets) * 3))
for dataset_idx, dataset_name in enumerate(datasets):
    plt.subplot(len(datasets), 1, dataset_idx + 1)
    # loading and vectorization
    print('loading data:', dataset_name)
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'forestcover':
        dataset = fetch_covtype()
        X = dataset.data
        y = dataset.target
        # normal data are those with attribute 2
        # abnormal those with attribute 4
        s = (y == 2) + (y == 4)
        X = X[s, :]
        y = y[s]
        y = (y != 2).astype(int)

    print('vectorizing data')

    if dataset_name == 'SF':
        # one-hot encode the categorical feature in column 1
        lb = LabelBinarizer()
        x1 = lb.fit_transform(X[:, 1].astype(str))
        X = np.c_[X[:, :1], x1, X[:, 2:]]
        y = (y != b'normal.').astype(int)

    if dataset_name == 'SA':
        # one-hot encode the categorical features in columns 1 to 3
        lb = LabelBinarizer()
        x1 = lb.fit_transform(X[:, 1].astype(str))
        x2 = lb.fit_transform(X[:, 2].astype(str))
        x3 = lb.fit_transform(X[:, 3].astype(str))
        X = np.c_[X[:, :1], x1, x2, x3, X[:, 4:]]
        y = (y != b'normal.').astype(int)

    if dataset_name in ('http', 'smtp'):
        y = (y != b'normal.').astype(int)

    X = X.astype(float)

    print('Estimator processing...')
    for model_name, model in models:
        tstart = time()
        model.fit(X)
        fit_time = time() - tstart
        if model_name == 'LOF':
            # negative_outlier_factor_ is lower for abnormal points;
            # negate it so that a higher scoring means more abnormal
            scoring = -model.negative_outlier_factor_
        if model_name == 'IF':
            # the model is already fitted above; negate decision_function
            # so that a higher scoring means more abnormal
            scoring = -model.decision_function(X)

        fpr, tpr, thresholds = roc_curve(y, scoring)
        AUC = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1,
                 label='%s: ROC for %s (area = %0.3f, train-time: %0.2fs)'
                       % (model_name, dataset_name, AUC, fit_time))

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.legend(loc="lower right")
    if dataset_idx == 0:
        plt.title('Receiver operating characteristic')
    if dataset_idx == len(datasets) - 1:
        plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

plt.show()
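
A note on the scoring convention used in the benchmark above: both LOF's negative_outlier_factor_ and IsolationForest's decision_function follow scikit-learn's "higher value means more normal" convention, while roc_curve expects higher scores for the positive (anomalous, y == 1) class, hence the negations. A minimal, self-contained sketch with toy data (the dataset and parameters below are illustrative only, not part of the benchmark):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import roc_curve, auc

rng = np.random.RandomState(0)
X = np.r_[rng.normal(size=(100, 2)),          # inliers
          rng.uniform(-6, 6, size=(10, 2))]   # outliers
y = np.r_[np.zeros(100), np.ones(10)]         # 1 marks an anomaly

lof = LocalOutlierFactor(n_neighbors=20, contamination='auto').fit(X)
# negative_outlier_factor_ is close to -1 for normal points and much lower
# for outliers, so it is negated to get "higher = more anomalous" scores
scoring = -lof.negative_outlier_factor_
fpr, tpr, _ = roc_curve(y, scoring)
print("LOF AUC on the training data: %0.3f" % auc(fpr, tpr))
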
106 changes: 87 additions & 19 deletions examples/plot_anomaly_comparison.py
@@ -3,9 +3,10 @@
Comparing anomaly detection algorithms for outlier detection on toy datasets
============================================================================

This example shows characteristics of different anomaly detection algorithms
on 2D datasets. Datasets contain one or two modes (regions of high density)
to illustrate the ability of algorithms to cope with multimodal data.
This example shows characteristics and compares the performance of different
anomaly detection algorithms on 2D datasets. Datasets contain one or two
modes (regions of high density) to illustrate the ability of algorithms to
cope with multimodal data.

For each dataset, 15% of samples are generated as random uniform noise. This
proportion is the value given to the nu parameter of the OneClassSVM and the
@@ -14,6 +15,13 @@
except for Local Outlier Factor (LOF) as it has no predict method to be applied
on new data when it is used for outlier detection.

Two performance measures are reported for each algorithm. First,
:func:`sklearn.metrics.accuracy_score` measures the agreement between the
ground truth labels and the predicted labels. Second,
:func:`sklearn.metrics.roc_auc_score` computes the area under the receiver
operating characteristic (ROC) curve, or AUC. The last column of the figure
shows the ROC curves of all algorithms except LOF, which is excluded for the
reason given above; a short sketch of both metrics follows this docstring.

The :class:`sklearn.svm.OneClassSVM` is known to be sensitive to outliers and
thus does not perform very well for outlier detection. This estimator is best
suited for novelty detection when the training set is not contaminated by
@@ -51,10 +59,6 @@
the problem is completely unsupervised so model selection can be a challenge.
"""

# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Albert Thomas <albert.thomas@telecom-paristech.fr>
# License: BSD 3 clause

import time

import numpy as np
@@ -67,6 +71,10 @@
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

print(__doc__)

matplotlib.rcParams['contour.negative_linestyle'] = 'solid'
@@ -81,9 +89,9 @@
anomaly_algorithms = [
("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
gamma=0.1)),
gamma="scale")),
("Isolation Forest", IsolationForest(contamination=outliers_fraction,
random_state=42)),
behaviour="new", random_state=42)),
("Local Outlier Factor", LocalOutlierFactor(
n_neighbors=35, contamination=outliers_fraction))]

@@ -96,39 +104,61 @@
**blobs_params)[0],
make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
**blobs_params)[0],
4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
4. * (make_moons(n_samples=n_inliers, noise=.05, random_state=0)[0] -
np.array([0.5, 0.25])),
14. * (np.random.RandomState(42).rand(n_samples, 2) - 0.5)]
14. * (np.random.RandomState(42).rand(n_inliers, 2) - 0.5)]

# ground truth labels: +1 for inliers, -1 for outliers
y_true = np.concatenate([np.ones(n_inliers), -np.ones(n_outliers)], axis=0)

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 150),
np.linspace(-7, 7, 150))

plt.figure(figsize=(len(anomaly_algorithms) * 2 + 3, 12.5))
plt.figure(figsize=((len(anomaly_algorithms) + 1) * 2.2 + 4,
                    len(datasets) * 2.2 + 1))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)

plot_num = 1
rng = np.random.RandomState(42)

for i_dataset, X in enumerate(datasets):
# Add outliers
X = np.concatenate([X, rng.uniform(low=-6, high=6,
size=(n_outliers, 2))], axis=0)


# per-algorithm ROC data: AUC, FPR, TPR, and the index of the threshold
# closest to the predict() cutoff (decision_function == 0)
list_AUC = []
list_fpr = []
list_tpr = []
list_thresh = []

algo_index = 0
for name, algorithm in anomaly_algorithms:
t0 = time.time()
algorithm.fit(X)
t1 = time.time()
plt.subplot(len(datasets), len(anomaly_algorithms), plot_num)
if i_dataset == 0:
plt.title(name, size=18)


# fit the data and tag outliers
if name == "Local Outlier Factor":
y_pred = algorithm.fit_predict(X)
else:
y_pred = algorithm.fit(X).predict(X)

# store the ROC data; LOF is skipped because it does not implement
# decision_function when used for outlier detection
if name != "Local Outlier Factor":
    probas_ = algorithm.decision_function(X)  # already fitted above
    AUC = roc_auc_score(y_true, probas_)
    fpr, tpr, thresholds = roc_curve(y_true, probas_)
    # index of the threshold closest to the predict() cutoff (0)
    thresh_index = np.argmin(np.abs(thresholds))
    list_AUC.append(AUC)
    list_fpr.append(fpr)
    list_tpr.append(tpr)
    list_thresh.append(thresh_index)

acc = accuracy_score(y_true, y_pred)  # accuracy
plt.subplot(len(datasets), len(anomaly_algorithms)+1, plot_num)
if i_dataset == 0:
plt.title(str(algo_index + 1) + ") " + name, size=15, weight="bold")

# plot the levels lines and the points
if name != "Local Outlier Factor": # LOF does not implement predict
Expand All @@ -143,9 +173,47 @@
plt.ylim(-7, 7)
plt.xticks(())
plt.yticks(())

plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')

# show prediction accuracy
bbox_props = dict(boxstyle="square", fc="white", ec="none", alpha=0.8)
plt.text(.03, .89, ("acc %.3f" % acc).lstrip("0"),
transform=plt.gca().transAxes, size=15,
horizontalalignment="left", bbox=bbox_props)
plot_num += 1

algo_index += 1

# plot the ROC curves and show AUC scores
plt.subplot(len(datasets), len(anomaly_algorithms) + 1, plot_num)

if i_dataset == 0:
plt.title("ROC", size=15, color="black", weight="bold")

# legend entry for the "x" markers: each marks the ROC point whose
# threshold is closest to 0, i.e. the cutoff used by predict()
plt.scatter([], [], marker="x", color="black", label="thresholds")

for algo_index in range(len(anomaly_algorithms)-1): # exclude LOF

if i_dataset == 0:
plt.plot(list_fpr[algo_index], list_tpr[algo_index],
label="algo " + str(algo_index + 1) + ")"
+ (" AUC %.2f" % list_AUC[algo_index]).lstrip("0"))
else:
plt.plot(list_fpr[algo_index], list_tpr[algo_index],
label=str(algo_index + 1) + ")"
+ (" %.2f" % list_AUC[algo_index]).lstrip("0"))

plt.scatter(
list_fpr[algo_index][list_thresh[algo_index]],
list_tpr[algo_index][list_thresh[algo_index]],
s=40, marker="x", color="black")

plt.plot(np.array([0, 1]), np.array([0, 1]), linestyle="--", color="black")
plt.legend()
plt.tick_params(labelleft=False, labelbottom=False, direction="in")
plot_num += 1

plt.show()
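
A remark on the "x" markers added to the ROC curves: for the detectors used here, predict() flags a sample as an outlier exactly when decision_function is below 0, so the ROC point whose threshold is closest to 0 is the operating point at which the accuracy shown in each panel is measured. A small sketch of that relationship with toy data (IsolationForest as a stand-in, illustrative parameters):

import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve

rng = np.random.RandomState(0)
X = np.r_[rng.normal(size=(200, 2)),                   # inliers
          rng.uniform(low=-6, high=6, size=(30, 2))]   # outliers
y_true = np.r_[np.ones(200), -np.ones(30)]             # +1 inlier, -1 outlier

clf = IsolationForest(contamination=0.13, random_state=0).fit(X)
scores = clf.decision_function(X)      # predict() thresholds this at 0
fpr, tpr, thresholds = roc_curve(y_true, scores)
i = np.argmin(np.abs(thresholds))      # ROC point closest to the 0 cutoff
print("operating point of predict(): FPR=%.2f, TPR=%.2f (threshold=%.3f)"
      % (fpr[i], tpr[i], thresholds[i]))
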