1
1
import numpy as np
2
2
import scipy .sparse as sp
3
+ import pytest
3
4
from scipy .sparse import csr_matrix
4
5
5
6
from sklearn import datasets
6
7
from sklearn .utils .testing import assert_false
7
- from sklearn .utils .testing import assert_almost_equal
8
8
from sklearn .utils .testing import assert_array_equal
9
9
from sklearn .utils .testing import assert_equal
10
10
from sklearn .utils .testing import assert_raises_regexp
14
14
from sklearn .metrics .cluster import silhouette_samples
15
15
from sklearn .metrics import pairwise_distances
16
16
from sklearn .metrics .cluster import calinski_harabaz_score
17
+ from sklearn .metrics .cluster import davies_bouldin_score
17
18
18
19
19
20
def test_silhouette ():
@@ -33,13 +34,13 @@ def test_silhouette():
33
34
assert_greater (score_precomputed , 0 )
34
35
# Test without calculating D
35
36
score_euclidean = silhouette_score (X , y , metric = 'euclidean' )
36
- assert_almost_equal (score_precomputed , score_euclidean )
37
+ pytest . approx (score_precomputed , score_euclidean )
37
38
38
39
if X is X_dense :
39
40
score_dense_without_sampling = score_precomputed
40
41
else :
41
- assert_almost_equal (score_euclidean ,
42
- score_dense_without_sampling )
42
+ pytest . approx (score_euclidean ,
43
+ score_dense_without_sampling )
43
44
44
45
# Test with sampling
45
46
score_precomputed = silhouette_score (D , y , metric = 'precomputed' ,
@@ -50,12 +51,12 @@ def test_silhouette():
50
51
random_state = 0 )
51
52
assert_greater (score_precomputed , 0 )
52
53
assert_greater (score_euclidean , 0 )
53
- assert_almost_equal (score_euclidean , score_precomputed )
54
+ pytest . approx (score_euclidean , score_precomputed )
54
55
55
56
if X is X_dense :
56
57
score_dense_with_sampling = score_precomputed
57
58
else :
58
- assert_almost_equal (score_euclidean , score_dense_with_sampling )
59
+ pytest . approx (score_euclidean , score_dense_with_sampling )
59
60
60
61
61
62
def test_cluster_size_1 ():
@@ -120,12 +121,14 @@ def test_silhouette_paper_example():
120
121
(labels2 , expected2 , score2 )]:
121
122
expected = [expected [name ] for name in names ]
122
123
# we check to 2dp because that's what's in the paper
123
- assert_almost_equal (expected , silhouette_samples (D , np .array (labels ),
124
- metric = 'precomputed' ),
125
- decimal = 2 )
126
- assert_almost_equal (score , silhouette_score (D , np .array (labels ),
127
- metric = 'precomputed' ),
128
- decimal = 2 )
124
+ pytest .approx (expected ,
125
+ silhouette_samples (D , np .array (labels ),
126
+ metric = 'precomputed' ),
127
+ abs = 1e-2 )
128
+ pytest .approx (score ,
129
+ silhouette_score (D , np .array (labels ),
130
+ metric = 'precomputed' ),
131
+ abs = 1e-2 )
129
132
130
133
131
134
def test_correct_labelsize ():
@@ -166,19 +169,27 @@ def test_non_numpy_labels():
166
169
silhouette_score (list (X ), list (y )), silhouette_score (X , y ))
167
170
168
171
169
- def test_calinski_harabaz_score ():
172
def assert_raises_on_only_one_label(func):
    """Assert that `func` raises a ValueError when all samples share one label."""
    rng = np.random.RandomState(seed=0)
    X = rng.rand(10, 2)
    single_label = np.zeros(10)
    assert_raise_message(ValueError, "Number of labels is",
                         func, X, single_label)
176
178
177
- # Assert message when all point are in different clusters
179
+
180
def assert_raises_on_all_points_same_cluster(func):
    """Assert that `func` raises a ValueError when each sample is its own cluster."""
    rng = np.random.RandomState(seed=0)
    X = rng.rand(10, 2)
    one_label_per_point = np.arange(10)
    assert_raise_message(ValueError, "Number of labels is",
                         func, X, one_label_per_point)
181
186
187
+
188
+ def test_calinski_harabaz_score ():
189
+ assert_raises_on_only_one_label (calinski_harabaz_score )
190
+
191
+ assert_raises_on_all_points_same_cluster (calinski_harabaz_score )
192
+
182
193
# Assert the value is 1. when all samples are equals
183
194
assert_equal (1. , calinski_harabaz_score (np .ones ((10 , 2 )),
184
195
[0 ] * 5 + [1 ] * 5 ))
@@ -191,5 +202,29 @@ def test_calinski_harabaz_score():
191
202
X = ([[0 , 0 ], [1 , 1 ]] * 5 + [[3 , 3 ], [4 , 4 ]] * 5 +
192
203
[[0 , 4 ], [1 , 3 ]] * 5 + [[3 , 1 ], [4 , 0 ]] * 5 )
193
204
labels = [0 ] * 10 + [1 ] * 10 + [2 ] * 10 + [3 ] * 10
194
- assert_almost_equal (calinski_harabaz_score (X , labels ),
205
+ pytest . approx (calinski_harabaz_score (X , labels ),
195
206
45 * (40 - 4 ) / (5 * (4 - 1 )))
207
+
208
+
209
def test_davies_bouldin_score():
    """Check davies_bouldin_score on degenerate labellings and known values."""
    assert_raises_on_only_one_label(davies_bouldin_score)
    assert_raises_on_all_points_same_cluster(davies_bouldin_score)

    # Assert the value is 0. when all samples are equal
    assert davies_bouldin_score(np.ones((10, 2)),
                                [0] * 5 + [1] * 5) == pytest.approx(0.0)

    # Assert the value is 0. when all the cluster means are equal
    assert davies_bouldin_score([[-1, -1], [1, 1]] * 10,
                                [0] * 10 + [1] * 10) == pytest.approx(0.0)

    # General case (with non numpy arrays)
    X = ([[0, 0], [1, 1]] * 5 + [[3, 3], [4, 4]] * 5 +
         [[0, 4], [1, 3]] * 5 + [[3, 1], [4, 0]] * 5)
    labels = [0] * 10 + [1] * 10 + [2] * 10 + [3] * 10
    # NOTE: a bare `pytest.approx(a, b)` call only builds a comparison
    # object and asserts nothing -- the comparison must be explicit.
    assert davies_bouldin_score(X, labels) == pytest.approx(2 * np.sqrt(0.5) / 3)

    # General case - a cluster that has a single sample
    X = [[0, 0], [2, 2], [3, 3], [5, 5]]
    labels = [0, 0, 1, 2]
    assert davies_bouldin_score(X, labels) == pytest.approx((5. / 4) / 3)
0 commit comments