diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index cd1102b77ed2e..9d5c9ba8ac1ac 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -143,6 +143,13 @@ Changelog and :class:`~sklearn.decomposition.TruncatedSVD`. :pr:`21334` by `Thomas Fan`_. +:mod:`sklearn.feature_extraction` +................................. + +- |Fix| :class:`feature_extraction.FeatureHasher` now validates input parameters + in `transform` instead of `__init__`. :pr:`21573` by + :user:`Hannah Bohle <hhnnhh>` and :user:`Maren Westermann <marenwestermann>`. + - |API| :func:`decomposition.FastICA` now supports unit variance for whitening. The default value of its `whiten` argument will change from `True` (which behaves like `'arbitrary-variance'`) to `'unit-variance'` in version 1.3. diff --git a/sklearn/feature_extraction/_hash.py b/sklearn/feature_extraction/_hash.py index bd7cb0f371244..d49a55e71417d 100644 --- a/sklearn/feature_extraction/_hash.py +++ b/sklearn/feature_extraction/_hash.py @@ -100,8 +100,6 @@ def __init__( dtype=np.float64, alternate_sign=True, ): - self._validate_params(n_features, input_type) - self.dtype = dtype self.input_type = input_type self.n_features = n_features @@ -164,6 +162,7 @@ def transform(self, raw_X): X : sparse matrix of shape (n_samples, n_features) Feature matrix, for use with estimators or further transformers. 
""" + self._validate_params(self.n_features, self.input_type) raw_X = iter(raw_X) if self.input_type == "dict": raw_X = (_iteritems(d) for d in raw_X) diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py index debc65ec925b8..518a6d315e309 100644 --- a/sklearn/feature_extraction/tests/test_feature_hasher.py +++ b/sklearn/feature_extraction/tests/test_feature_hasher.py @@ -9,8 +9,8 @@ def test_feature_hasher_dicts(): - h = FeatureHasher(n_features=16) - assert "dict" == h.input_type + feature_hasher = FeatureHasher(n_features=16) + assert "dict" == feature_hasher.input_type raw_X = [{"foo": "bar", "dada": 42, "tzara": 37}, {"foo": "baz", "gaga": "string1"}] X1 = FeatureHasher(n_features=16).transform(raw_X) @@ -31,10 +31,10 @@ def test_feature_hasher_strings(): it = (x for x in raw_X) # iterable - h = FeatureHasher( + feature_hasher = FeatureHasher( n_features=n_features, input_type="string", alternate_sign=False ) - X = h.transform(it) + X = feature_hasher.transform(it) assert X.shape[0] == len(raw_X) assert X.shape[1] == n_features @@ -74,8 +74,8 @@ def test_feature_hasher_pairs(): iter(d.items()) for d in [{"foo": 1, "bar": 2}, {"baz": 3, "quux": 4, "foo": -1}] ) - h = FeatureHasher(n_features=16, input_type="pair") - x1, x2 = h.transform(raw_X).toarray() + feature_hasher = FeatureHasher(n_features=16, input_type="pair") + x1, x2 = feature_hasher.transform(raw_X).toarray() x1_nz = sorted(np.abs(x1[x1 != 0])) x2_nz = sorted(np.abs(x2[x2 != 0])) assert [1, 2] == x1_nz @@ -87,15 +87,15 @@ def test_feature_hasher_pairs_with_string_values(): iter(d.items()) for d in [{"foo": 1, "bar": "a"}, {"baz": "abc", "quux": 4, "foo": -1}] ) - h = FeatureHasher(n_features=16, input_type="pair") - x1, x2 = h.transform(raw_X).toarray() + feature_hasher = FeatureHasher(n_features=16, input_type="pair") + x1, x2 = feature_hasher.transform(raw_X).toarray() x1_nz = sorted(np.abs(x1[x1 != 0])) x2_nz = 
sorted(np.abs(x2[x2 != 0])) assert [1, 1] == x1_nz assert [1, 1, 4] == x2_nz raw_X = (iter(d.items()) for d in [{"bax": "abc"}, {"bax": "abc"}]) - x1, x2 = h.transform(raw_X).toarray() + x1, x2 = feature_hasher.transform(raw_X).toarray() x1_nz = np.abs(x1[x1 != 0]) x2_nz = np.abs(x2[x2 != 0]) assert [1] == x1_nz @@ -107,29 +107,35 @@ def test_hash_empty_input(): n_features = 16 raw_X = [[], (), iter(range(0))] - h = FeatureHasher(n_features=n_features, input_type="string") - X = h.transform(raw_X) + feature_hasher = FeatureHasher(n_features=n_features, input_type="string") + X = feature_hasher.transform(raw_X) assert_array_equal(X.A, np.zeros((len(raw_X), n_features))) def test_hasher_invalid_input(): + raw_X = [[], (), iter(range(0))] + + feature_hasher = FeatureHasher(input_type="gobbledygook") with pytest.raises(ValueError): - FeatureHasher(input_type="gobbledygook") + feature_hasher.transform(raw_X) + feature_hasher = FeatureHasher(n_features=-1) with pytest.raises(ValueError): - FeatureHasher(n_features=-1) + feature_hasher.transform(raw_X) + feature_hasher = FeatureHasher(n_features=0) with pytest.raises(ValueError): - FeatureHasher(n_features=0) + feature_hasher.transform(raw_X) + feature_hasher = FeatureHasher(n_features="ham") with pytest.raises(TypeError): - FeatureHasher(n_features="ham") + feature_hasher.transform(raw_X) - h = FeatureHasher(n_features=np.uint16(2 ** 6)) + feature_hasher = FeatureHasher(n_features=np.uint16(2 ** 6)) with pytest.raises(ValueError): - h.transform([]) + feature_hasher.transform([]) with pytest.raises(Exception): - h.transform([[5.5]]) + feature_hasher.transform([[5.5]]) with pytest.raises(Exception): - h.transform([[None]]) + feature_hasher.transform([[None]]) def test_hasher_set_params(): diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 8b0796b2b5dbc..0ebe2af37e7bc 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -424,7 +424,6 @@ def 
test_transformers_get_feature_names_out(transformer): VALIDATE_ESTIMATOR_INIT = [ "ColumnTransformer", - "FeatureHasher", "FeatureUnion", "GridSearchCV", "HalvingGridSearchCV",