Skip to content

[MRG+1] BUG: Fix fetch_kddcup99 for Python 3 #5946

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 13, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions sklearn/datasets/kddcup99.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

from .base import get_data_home
from .base import Bunch
from ..externals import joblib
from ..externals import joblib, six
from ..utils import check_random_state
from ..utils import shuffle as shuffle_method

Expand All @@ -40,8 +40,8 @@


def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
percent10=False):
"""Load and return the kddcup 99 dataset (regression).
percent10=True, download_if_missing=True):
"""Load and return the kddcup 99 dataset (classification).

The KDD Cup '99 dataset was created by processing the tcpdump portions
of the 1998 DARPA Intrusion Detection System (IDS) Evaluation dataset,
Expand Down Expand Up @@ -93,7 +93,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,

================ ==========================================
Samples total 699691
Dimensionality 40
Dimensionality 4
Features discrete (int) or continuous (float)
Targets str, 'normal.' or name of the anomaly type
================ ==========================================
Expand All @@ -102,7 +102,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,

================ ==========================================
Samples total 619052
Dimensionality 39
Dimensionality 3
Features discrete (int) or continuous (float)
Targets str, 'normal.' or name of the anomaly type
================ ==========================================
Expand All @@ -111,7 +111,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,

================ ==========================================
Samples total 95373
Dimensionality 39
Dimensionality 3
Features discrete (int) or continuous (float)
Targets str, 'normal.' or name of the anomaly type
================ ==========================================
Expand All @@ -135,6 +135,10 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
percent10 : bool, default=False
Whether to load only 10 percent of the data.

download_if_missing : bool, default=True
If False, raise a IOError if the data is not locally available
instead of trying to download the data from the source site.

Returns
-------
data : Bunch
Expand All @@ -153,13 +157,14 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
Intrusions in Unlabeled Data (2002) by Eleazar Eskin, Andrew Arnold,
Michael Prerau, Leonid Portnoy, Sal Stolfo
"""
kddcup99 = _fetch_brute_kddcup99(shuffle=shuffle, percent10=percent10)
kddcup99 = _fetch_brute_kddcup99(shuffle=shuffle, percent10=percent10,
download_if_missing=download_if_missing)

data = kddcup99.data
target = kddcup99.target

if subset == 'SA':
s = target == 'normal.'
s = target == b'normal.'
t = np.logical_not(s)
normal_samples = data[s, :]
normal_targets = target[s]
Expand Down Expand Up @@ -187,13 +192,13 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
data[:, 5] = np.log((data[:, 5] + 0.1).astype(float))

if subset == 'http':
s = data[:, 2] == 'http'
s = data[:, 2] == b'http'
data = data[s]
target = target[s]
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]

if subset == 'smtp':
s = data[:, 2] == 'smtp'
s = data[:, 2] == b'smtp'
data = data[s]
target = target[s]
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]
Expand Down Expand Up @@ -320,6 +325,8 @@ def _fetch_brute_kddcup99(subset=None, data_home=None,
file_ = GzipFile(fileobj=f, mode='r')
Xy = []
for line in file_.readlines():
if six.PY3:
line = line.decode()
Xy.append(line.replace('\n', '').split(','))
file_.close()
print('extraction done')
Expand Down
41 changes: 41 additions & 0 deletions sklearn/datasets/tests/test_kddcup99.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Test kddcup99 loader. Only 'percent10' mode is tested, as the full data
is too big to use in unit-testing.

The test is skipped if the data wasn't previously fetched and saved to
scikit-learn data folder.
"""

import errno
from sklearn.datasets import fetch_kddcup99
from sklearn.utils.testing import assert_equal, SkipTest


def test_percent10():
try:
data = fetch_kddcup99(download_if_missing=False)
except IOError as e:
if e.errno == errno.ENOENT:
raise SkipTest("kddcup99 dataset can not be loaded.")

assert_equal(data.data.shape, (494021, 41))
assert_equal(data.target.shape, (494021,))

data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
assert_equal(data.data.shape, data_shuffled.data.shape)
assert_equal(data.target.shape, data_shuffled.target.shape)

data = fetch_kddcup99('SA')
assert_equal(data.data.shape, (100655, 41))
assert_equal(data.target.shape, (100655,))

data = fetch_kddcup99('SF')
assert_equal(data.data.shape, (73237, 4))
assert_equal(data.target.shape, (73237,))

data = fetch_kddcup99('http')
assert_equal(data.data.shape, (58725, 3))
assert_equal(data.target.shape, (58725,))

data = fetch_kddcup99('smtp')
assert_equal(data.data.shape, (9571, 3))
assert_equal(data.target.shape, (9571,))