Skip to content
7 changes: 7 additions & 0 deletions doc/whats_new/v1.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,13 @@ Changelog
and :class:`~sklearn.decomposition.TruncatedSVD`. :pr:`21334` by
`Thomas Fan`_.

:mod:`sklearn.feature_extraction`
.................................

- |Fix| :class:`feature_extraction.FeatureHasher` now validates input parameters
in `transform` instead of `__init__`. :pr:`21573` by
:user:`Hannah Bohle <hhnnhh>` and :user:`Maren Westermann <marenwestermann>`.

- |API| :class:`decomposition.FastICA` now supports unit variance for whitening.
The default value of its `whiten` argument will change from `True`
(which behaves like `'arbitrary-variance'`) to `'unit-variance'` in version 1.3.
Expand Down
3 changes: 1 addition & 2 deletions sklearn/feature_extraction/_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,6 @@ def __init__(
dtype=np.float64,
alternate_sign=True,
):
self._validate_params(n_features, input_type)

self.dtype = dtype
self.input_type = input_type
self.n_features = n_features
Expand Down Expand Up @@ -164,6 +162,7 @@ def transform(self, raw_X):
X : sparse matrix of shape (n_samples, n_features)
Feature matrix, for use with estimators or further transformers.
"""
self._validate_params(self.n_features, self.input_type)
raw_X = iter(raw_X)
if self.input_type == "dict":
raw_X = (_iteritems(d) for d in raw_X)
Expand Down
44 changes: 25 additions & 19 deletions sklearn/feature_extraction/tests/test_feature_hasher.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@


def test_feature_hasher_dicts():
h = FeatureHasher(n_features=16)
assert "dict" == h.input_type
feature_hasher = FeatureHasher(n_features=16)
assert "dict" == feature_hasher.input_type

raw_X = [{"foo": "bar", "dada": 42, "tzara": 37}, {"foo": "baz", "gaga": "string1"}]
X1 = FeatureHasher(n_features=16).transform(raw_X)
Expand All @@ -31,10 +31,10 @@ def test_feature_hasher_strings():

it = (x for x in raw_X) # iterable

h = FeatureHasher(
feature_hasher = FeatureHasher(
n_features=n_features, input_type="string", alternate_sign=False
)
X = h.transform(it)
X = feature_hasher.transform(it)

assert X.shape[0] == len(raw_X)
assert X.shape[1] == n_features
Expand Down Expand Up @@ -74,8 +74,8 @@ def test_feature_hasher_pairs():
iter(d.items())
for d in [{"foo": 1, "bar": 2}, {"baz": 3, "quux": 4, "foo": -1}]
)
h = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = h.transform(raw_X).toarray()
feature_hasher = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = feature_hasher.transform(raw_X).toarray()
x1_nz = sorted(np.abs(x1[x1 != 0]))
x2_nz = sorted(np.abs(x2[x2 != 0]))
assert [1, 2] == x1_nz
Expand All @@ -87,15 +87,15 @@ def test_feature_hasher_pairs_with_string_values():
iter(d.items())
for d in [{"foo": 1, "bar": "a"}, {"baz": "abc", "quux": 4, "foo": -1}]
)
h = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = h.transform(raw_X).toarray()
feature_hasher = FeatureHasher(n_features=16, input_type="pair")
x1, x2 = feature_hasher.transform(raw_X).toarray()
x1_nz = sorted(np.abs(x1[x1 != 0]))
x2_nz = sorted(np.abs(x2[x2 != 0]))
assert [1, 1] == x1_nz
assert [1, 1, 4] == x2_nz

raw_X = (iter(d.items()) for d in [{"bax": "abc"}, {"bax": "abc"}])
x1, x2 = h.transform(raw_X).toarray()
x1, x2 = feature_hasher.transform(raw_X).toarray()
x1_nz = np.abs(x1[x1 != 0])
x2_nz = np.abs(x2[x2 != 0])
assert [1] == x1_nz
Expand All @@ -107,29 +107,35 @@ def test_hash_empty_input():
n_features = 16
raw_X = [[], (), iter(range(0))]

h = FeatureHasher(n_features=n_features, input_type="string")
X = h.transform(raw_X)
feature_hasher = FeatureHasher(n_features=n_features, input_type="string")
X = feature_hasher.transform(raw_X)

assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))


def test_hasher_invalid_input():
raw_X = [[], (), iter(range(0))]

feature_hasher = FeatureHasher(input_type="gobbledygook")
with pytest.raises(ValueError):
FeatureHasher(input_type="gobbledygook")
feature_hasher.transform(raw_X)
feature_hasher = FeatureHasher(n_features=-1)
with pytest.raises(ValueError):
FeatureHasher(n_features=-1)
feature_hasher.transform(raw_X)
feature_hasher = FeatureHasher(n_features=0)
with pytest.raises(ValueError):
FeatureHasher(n_features=0)
feature_hasher.transform(raw_X)
feature_hasher = FeatureHasher(n_features="ham")
with pytest.raises(TypeError):
FeatureHasher(n_features="ham")
feature_hasher.transform(raw_X)

h = FeatureHasher(n_features=np.uint16(2 ** 6))
feature_hasher = FeatureHasher(n_features=np.uint16(2 ** 6))
with pytest.raises(ValueError):
h.transform([])
feature_hasher.transform([])
with pytest.raises(Exception):
h.transform([[5.5]])
feature_hasher.transform([[5.5]])
with pytest.raises(Exception):
h.transform([[None]])
feature_hasher.transform([[None]])


def test_hasher_set_params():
Expand Down
1 change: 0 additions & 1 deletion sklearn/tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,6 @@ def test_transformers_get_feature_names_out(transformer):

VALIDATE_ESTIMATOR_INIT = [
"ColumnTransformer",
"FeatureHasher",
"FeatureUnion",
"GridSearchCV",
"HalvingGridSearchCV",
Expand Down