Skip to content

Commit c9e336b

Browse files
author
Nikolay Mayorov
committed
BUG: Fixed comparison with bytes in kddcup99.py + test
1 parent 4b8dc27 commit c9e336b

File tree

2 files changed

+61
-8
lines changed

2 files changed

+61
-8
lines changed

sklearn/datasets/kddcup99.py

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040

4141

4242
def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
43-
percent10=False):
43+
percent10=False, download_if_missing=True):
4444
"""Load and return the kddcup 99 dataset (classification).
4545
4646
The KDD Cup '99 dataset was created by processing the tcpdump portions
@@ -93,7 +93,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
9393
9494
================ ==========================================
9595
Samples total 699691
96-
Dimensionality 40
96+
Dimensionality 4
9797
Features discrete (int) or continuous (float)
9898
Targets str, 'normal.' or name of the anomaly type
9999
================ ==========================================
@@ -102,7 +102,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
102102
103103
================ ==========================================
104104
Samples total 619052
105-
Dimensionality 39
105+
Dimensionality 3
106106
Features discrete (int) or continuous (float)
107107
Targets str, 'normal.' or name of the anomaly type
108108
================ ==========================================
@@ -111,7 +111,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
111111
112112
================ ==========================================
113113
Samples total 95373
114-
Dimensionality 39
114+
Dimensionality 3
115115
Features discrete (int) or continuous (float)
116116
Targets str, 'normal.' or name of the anomaly type
117117
================ ==========================================
@@ -135,6 +135,10 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
135135
percent10 : bool, default=False
136136
Whether to load only 10 percent of the data.
137137
138+
download_if_missing : bool, default=True
139+
If False, raise an IOError if the data is not locally available
140+
instead of trying to download the data from the source site.
141+
138142
Returns
139143
-------
140144
data : Bunch
@@ -153,13 +157,14 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
153157
Intrusions in Unlabeled Data (2002) by Eleazar Eskin, Andrew Arnold,
154158
Michael Prerau, Leonid Portnoy, Sal Stolfo
155159
"""
156-
kddcup99 = _fetch_brute_kddcup99(shuffle=shuffle, percent10=percent10)
160+
kddcup99 = _fetch_brute_kddcup99(shuffle=shuffle, percent10=percent10,
161+
download_if_missing=download_if_missing)
157162

158163
data = kddcup99.data
159164
target = kddcup99.target
160165

161166
if subset == 'SA':
162-
s = target == 'normal.'
167+
s = target == b'normal.'
163168
t = np.logical_not(s)
164169
normal_samples = data[s, :]
165170
normal_targets = target[s]
@@ -187,13 +192,13 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
187192
data[:, 5] = np.log((data[:, 5] + 0.1).astype(float))
188193

189194
if subset == 'http':
190-
s = data[:, 2] == 'http'
195+
s = data[:, 2] == b'http'
191196
data = data[s]
192197
target = target[s]
193198
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]
194199

195200
if subset == 'smtp':
196-
s = data[:, 2] == 'smtp'
201+
s = data[:, 2] == b'smtp'
197202
data = data[s]
198203
target = target[s]
199204
data = np.c_[data[:, 0], data[:, 4], data[:, 5]]
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""Test kddcup99 loader. Only 'percent10' mode is tested, as the full data
2+
is too big to use in unit-testing.
3+
4+
The test is skipped if the data wasn't previously fetched and saved to
5+
scikit-learn data folder.
6+
"""
7+
8+
import errno
9+
from sklearn.datasets import fetch_kddcup99
10+
from sklearn.utils.testing import assert_equal, SkipTest
11+
12+
13+
def test_percent10():
14+
try:
15+
data = fetch_kddcup99(download_if_missing=False, percent10=True)
16+
except IOError as e:
17+
if e.errno == errno.ENOENT:
18+
raise SkipTest("kddcup99 dataset can not be loaded.")
19+
20+
assert_equal(data.data.shape, (494021, 41))
21+
assert_equal(data.target.shape, (494021,))
22+
23+
data_shuffled = fetch_kddcup99(shuffle=True, random_state=0,
24+
percent10=True)
25+
assert_equal(data.data.shape, data_shuffled.data.shape)
26+
assert_equal(data.target.shape, data_shuffled.target.shape)
27+
28+
data = fetch_kddcup99('SA', percent10=True)
29+
assert_equal(data.data.shape, (100655, 41))
30+
assert_equal(data.target.shape, (100655,))
31+
32+
data = fetch_kddcup99('SF', percent10=True)
33+
assert_equal(data.data.shape, (73237, 4))
34+
assert_equal(data.target.shape, (73237,))
35+
36+
data = fetch_kddcup99('http', percent10=True)
37+
assert_equal(data.data.shape, (58725, 3))
38+
assert_equal(data.target.shape, (58725,))
39+
print(data.data.shape, data.target.shape)
40+
41+
data = fetch_kddcup99('smtp', percent10=True)
42+
assert_equal(data.data.shape, (9571, 3))
43+
assert_equal(data.target.shape, (9571,))
44+
print(data.data.shape, data.target.shape)
45+
46+
47+
if __name__ == '__main__':
48+
test_percent10()

0 commit comments

Comments
 (0)