40
40
41
41
42
42
def fetch_kddcup99 (subset = None , shuffle = False , random_state = None ,
43
- percent10 = False ):
43
+ percent10 = False , download_if_missing = True ):
44
44
"""Load and return the kddcup 99 dataset (classification).
45
45
46
46
The KDD Cup '99 dataset was created by processing the tcpdump portions
@@ -93,7 +93,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
93
93
94
94
================ ==========================================
95
95
Samples total 699691
96
- Dimensionality 40
96
+ Dimensionality 4
97
97
Features discrete (int) or continuous (float)
98
98
Targets str, 'normal.' or name of the anomaly type
99
99
================ ==========================================
@@ -102,7 +102,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
102
102
103
103
================ ==========================================
104
104
Samples total 619052
105
- Dimensionality 39
105
+ Dimensionality 3
106
106
Features discrete (int) or continuous (float)
107
107
Targets str, 'normal.' or name of the anomaly type
108
108
================ ==========================================
@@ -111,7 +111,7 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
111
111
112
112
================ ==========================================
113
113
Samples total 95373
114
- Dimensionality 39
114
+ Dimensionality 3
115
115
Features discrete (int) or continuous (float)
116
116
Targets str, 'normal.' or name of the anomaly type
117
117
================ ==========================================
@@ -135,6 +135,10 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
135
135
percent10 : bool, default=False
136
136
Whether to load only 10 percent of the data.
137
137
138
+ download_if_missing : bool, default=True
139
+ If False, raise an IOError if the data is not locally available
140
+ instead of trying to download the data from the source site.
141
+
138
142
Returns
139
143
-------
140
144
data : Bunch
@@ -153,13 +157,14 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
153
157
Intrusions in Unlabeled Data (2002) by Eleazar Eskin, Andrew Arnold,
154
158
Michael Prerau, Leonid Portnoy, Sal Stolfo
155
159
"""
156
- kddcup99 = _fetch_brute_kddcup99 (shuffle = shuffle , percent10 = percent10 )
160
+ kddcup99 = _fetch_brute_kddcup99 (shuffle = shuffle , percent10 = percent10 ,
161
+ download_if_missing = download_if_missing )
157
162
158
163
data = kddcup99 .data
159
164
target = kddcup99 .target
160
165
161
166
if subset == 'SA' :
162
- s = target == 'normal.'
167
+ s = target == b 'normal.'
163
168
t = np .logical_not (s )
164
169
normal_samples = data [s , :]
165
170
normal_targets = target [s ]
@@ -187,13 +192,13 @@ def fetch_kddcup99(subset=None, shuffle=False, random_state=None,
187
192
data [:, 5 ] = np .log ((data [:, 5 ] + 0.1 ).astype (float ))
188
193
189
194
if subset == 'http' :
190
- s = data [:, 2 ] == 'http'
195
+ s = data [:, 2 ] == b 'http'
191
196
data = data [s ]
192
197
target = target [s ]
193
198
data = np .c_ [data [:, 0 ], data [:, 4 ], data [:, 5 ]]
194
199
195
200
if subset == 'smtp' :
196
- s = data [:, 2 ] == 'smtp'
201
+ s = data [:, 2 ] == b 'smtp'
197
202
data = data [s ]
198
203
target = target [s ]
199
204
data = np .c_ [data [:, 0 ], data [:, 4 ], data [:, 5 ]]
0 commit comments