Skip to content

Commit 4c1f7df

Browse files
committed
Moving the collision test to test_feature_hasher.py
1 parent c9452cf commit 4c1f7df

File tree

2 files changed

+12
-9
lines changed

2 files changed

+12
-9
lines changed

sklearn/feature_extraction/tests/test_feature_hasher.py

+12
Original file line number | Diff line number | Diff line change
@@ -107,3 +107,15 @@ def test_hasher_zeros():
107107
# Assert that no zeros are materialized in the output.
108108
X = FeatureHasher().transform([{'foo': 0}])
109109
assert_equal(X.data.shape, (0,))
110+
111+
112+
def test_hash_collision():
113+
# Ensure that hash collision does not produce zero elements
114+
# in the output sparse array (issue #3637)
115+
raw_X = ["ab", "ac", "ad", "ae"]
116+
fh = FeatureHasher(non_negative=False, input_type="string", n_features=1)
117+
X = fh.transform(raw_X)
118+
assert_true(len(X.data) <= 2*len(raw_X)) # we have feature collisions
119+
assert_equal((X.data == 0.).sum(), 0)
120+
121+

sklearn/feature_extraction/tests/test_text.py

-9
Original file line number | Diff line number | Diff line change
@@ -944,15 +944,6 @@ def func():
944944
assert_raise_message(exception, message, func)
945945

946946

947-
def test_hashingvectorizer_hash_collision():
948-
# Ensure that hash collision does not produce zero elements
949-
# in the output sparse array (issue #3637)
950-
text = 'investigation need records'
951-
hv = HashingVectorizer(ngram_range=(1, 2), non_negative=True)
952-
X = hv.transform([text])
953-
assert_equal((X.data == 0.).sum(), 0)
954-
955-
956947
def test_tfidfvectorizer_binary():
957948
# Non-regression test: TfidfVectorizer used to ignore its "binary" param.
958949
v = TfidfVectorizer(binary=True, use_idf=False, norm=None)

0 commit comments

Comments
 (0)