Skip to content

Commit 52d93e1

Browse files
hqkqn32 and jeremiedbb authored
Fix requires_fit tag for stateless FeatureHasher and HashingVectorizer (#31851)
Co-authored-by: Jérémie du Boisberranger <jeremie@probabl.ai>
1 parent 760edca commit 52d93e1

File tree

5 files changed

+36
-0
lines changed

5 files changed

+36
-0
lines changed
Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,4 @@
1+
- Set the tag `requires_fit=False` for the classes
2+
:class:`feature_extraction.FeatureHasher` and
3+
:class:`feature_extraction.HashingVectorizer`.
4+
By :user:`hakan çanakcı <hqkqn32>`.

sklearn/feature_extraction/_hash.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -204,4 +204,5 @@ def __sklearn_tags__(self):
204204
tags.input_tags.string = True
205205
elif self.input_type == "dict":
206206
tags.input_tags.dict = True
207+
tags.requires_fit = False
207208
return tags

sklearn/feature_extraction/tests/test_feature_hasher.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -158,3 +158,18 @@ def test_hash_collisions():
158158
alternate_sign=False, n_features=1, input_type="string"
159159
).fit_transform(X)
160160
assert Xt.data[0] == len(X[0])
161+
162+
163+
def test_feature_hasher_requires_fit_tag():
164+
"""Test that FeatureHasher has requires_fit=False tag."""
165+
hasher = FeatureHasher()
166+
tags = hasher.__sklearn_tags__()
167+
assert not tags.requires_fit
168+
169+
170+
def test_feature_hasher_transform_without_fit():
171+
"""Test that FeatureHasher can transform without fitting."""
172+
hasher = FeatureHasher(n_features=10)
173+
data = [{"dog": 1, "cat": 2}, {"dog": 2, "run": 5}]
174+
result = hasher.transform(data)
175+
assert result.shape == (2, 10)

sklearn/feature_extraction/tests/test_text.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1626,3 +1626,18 @@ def test_tfidf_vectorizer_perserve_dtype_idf(dtype):
16261626
X = [str(uuid.uuid4()) for i in range(100_000)]
16271627
vectorizer = TfidfVectorizer(dtype=dtype).fit(X)
16281628
assert vectorizer.idf_.dtype == dtype
1629+
1630+
1631+
def test_hashing_vectorizer_requires_fit_tag():
1632+
"""Test that HashingVectorizer has requires_fit=False tag."""
1633+
vectorizer = HashingVectorizer()
1634+
tags = vectorizer.__sklearn_tags__()
1635+
assert not tags.requires_fit
1636+
1637+
1638+
def test_hashing_vectorizer_transform_without_fit():
1639+
"""Test that HashingVectorizer can transform without fitting."""
1640+
vectorizer = HashingVectorizer(n_features=10)
1641+
corpus = ["This is test", "Another test"]
1642+
result = vectorizer.transform(corpus)
1643+
assert result.shape == (2, 10)

sklearn/feature_extraction/text.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -923,6 +923,7 @@ def __sklearn_tags__(self):
923923
tags = super().__sklearn_tags__()
924924
tags.input_tags.string = True
925925
tags.input_tags.two_d_array = False
926+
tags.requires_fit = False
926927
return tags
927928

928929

0 commit comments

Comments
 (0)