diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 38febcaa86f66..d491d190fc7f1 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -284,9 +284,18 @@ Bug fixes
      left `coef_` as a list, rather than an ndarray.
      :issue:`8160` by :user:`CJ Carey `.
 
+   - Fix a bug where :class:`sklearn.feature_extraction.FeatureHasher`
+     mandatorily applied a sparse random projection to the hashed features,
+     preventing the use of
+     :class:`sklearn.feature_extraction.text.HashingVectorizer` in a
+     pipeline with :class:`sklearn.feature_extraction.text.TfidfTransformer`.
+     :issue:`7513` by :user:`Roman Yurchak `.
+
    - Fix a bug in cases where `numpy.cumsum` may be numerically unstable,
      raising an exception if instability is identified. :issue:`7376` and
      :issue:`7331` by `Joel Nothman`_ and :user:`yangarbiter`.
+
    - Fix a bug where :meth:`sklearn.base.BaseEstimator.__getstate__`
      obstructed pickling customizations of child-classes, when used in a
      multiple inheritance context.
diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx
index e0c1d1bdaece5..e39aeafa08685 100644
--- a/sklearn/feature_extraction/_hashing.pyx
+++ b/sklearn/feature_extraction/_hashing.pyx
@@ -15,7 +15,7 @@ np.import_array()
 
 @cython.boundscheck(False)
 @cython.cdivision(True)
-def transform(raw_X, Py_ssize_t n_features, dtype):
+def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
     """Guts of FeatureHasher.transform.
 
     Returns
@@ -63,7 +63,9 @@ def transform(raw_X, Py_ssize_t n_features, dtype):
             array.resize_smart(indices, len(indices) + 1)
             indices[len(indices) - 1] = abs(h) % n_features
-            value *= (h >= 0) * 2 - 1
+            # improve inner product preservation in the hashed space
+            if alternate_sign:
+                value *= (h >= 0) * 2 - 1
             values[size] = value
             size += 1
diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index 77ea749089d23..6cbf1dde0afc6 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -2,6 +2,7 @@
 # License: BSD 3 clause
 
 import numbers
+import warnings
 
 import numpy as np
 import scipy.sparse as sp
@@ -53,11 +54,17 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
         The feature_name is hashed to find the appropriate column for the
         feature. The value's sign might be flipped in the output (but see
         non_negative, below).
+    alternate_sign : boolean, optional, default True
+        When True, an alternating sign is added to the features so as to
+        approximately conserve the inner product in the hashed space even for
+        small n_features. This approach is similar to sparse random projection.
     non_negative : boolean, optional, default False
-        Whether output matrices should contain non-negative values only;
-        effectively calls abs on the matrix prior to returning it.
-        When True, output values can be interpreted as frequencies.
-        When False, output values will have expected value zero.
+        When True, an absolute value is applied to the features matrix prior to
+        returning it. When used in conjunction with alternate_sign=True, this
+        significantly reduces the inner product preservation property.
+
+        .. deprecated:: 0.19
+            This option will be removed in 0.21.
+
     Examples
     --------
@@ -77,12 +84,17 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
     """
 
     def __init__(self, n_features=(2 ** 20), input_type="dict",
-                 dtype=np.float64, non_negative=False):
+                 dtype=np.float64, alternate_sign=True, non_negative=False):
         self._validate_params(n_features, input_type)
+        if non_negative:
+            warnings.warn("the option non_negative=True has been deprecated"
+                          " in 0.19 and will be removed"
+                          " in version 0.21.", DeprecationWarning)
 
         self.dtype = dtype
         self.input_type = input_type
         self.n_features = n_features
+        self.alternate_sign = alternate_sign
         self.non_negative = non_negative
 
     @staticmethod
@@ -139,7 +151,8 @@ def transform(self, raw_X, y=None):
         elif self.input_type == "string":
             raw_X = (((f, 1) for f in x) for x in raw_X)
         indices, indptr, values = \
-            _hashing.transform(raw_X, self.n_features, self.dtype)
+            _hashing.transform(raw_X, self.n_features, self.dtype,
+                               self.alternate_sign)
         n_samples = indptr.shape[0] - 1
 
         if n_samples == 0:
@@ -148,6 +161,7 @@ def transform(self, raw_X, y=None):
         X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype,
                           shape=(n_samples, self.n_features))
         X.sum_duplicates()  # also sorts the indices
+
         if self.non_negative:
             np.abs(X.data, X.data)
         return X
diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
index c4905b9101ce2..0204910607f32 100644
--- a/sklearn/feature_extraction/tests/test_feature_hasher.py
+++ b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -4,7 +4,8 @@
 from numpy.testing import assert_array_equal
 
 from sklearn.feature_extraction import FeatureHasher
-from sklearn.utils.testing import assert_raises, assert_true, assert_equal
+from sklearn.utils.testing import (assert_raises, assert_true, assert_equal,
+                                   ignore_warnings)
 
 
 def test_feature_hasher_dicts():
@@ -106,3 +107,46 @@ def test_hasher_zeros():
     # Assert that no zeros are materialized in the output.
     X = FeatureHasher().transform([{'foo': 0}])
     assert_equal(X.data.shape, (0,))
+
+
+@ignore_warnings(category=DeprecationWarning)
+def test_hasher_alternate_sign():
+    # the last two tokens produce a hash collision that sums to 0
+    X = [["foo", "bar", "baz", "investigation need", "records"]]
+
+    Xt = FeatureHasher(alternate_sign=True, non_negative=False,
+                       input_type='string').fit_transform(X)
+    assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
+    # check that we have a collision that produces a 0 count
+    assert_true(len(Xt.data) < len(X[0]))
+    assert_true((Xt.data == 0.).any())
+
+    Xt = FeatureHasher(alternate_sign=True, non_negative=True,
+                       input_type='string').fit_transform(X)
+    assert_true((Xt.data >= 0).all())  # all counts are non-negative
+    assert_true((Xt.data == 0.).any())  # we still have a collision
+    Xt = FeatureHasher(alternate_sign=False, non_negative=True,
+                       input_type='string').fit_transform(X)
+    assert_true((Xt.data > 0).all())  # strictly positive counts
+    Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False,
+                         input_type='string').fit_transform(X)
+    # With initially positive features, the non_negative option should
+    # have no impact when alternate_sign=False
+    assert_array_equal(Xt.data, Xt_2.data)
+
+
+@ignore_warnings(category=DeprecationWarning)
+def test_hasher_negative():
+    X = [{"foo": 2, "bar": -4, "baz": -1}.items()]
+    Xt = FeatureHasher(alternate_sign=False, non_negative=False,
+                       input_type="pair").fit_transform(X)
+    assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
+    Xt = FeatureHasher(alternate_sign=False, non_negative=True,
+                       input_type="pair").fit_transform(X)
+    assert_true(Xt.data.min() > 0)
+    Xt = FeatureHasher(alternate_sign=True, non_negative=False,
+                       input_type="pair").fit_transform(X)
+    assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
+    Xt = FeatureHasher(alternate_sign=True, non_negative=True,
+                       input_type="pair").fit_transform(X)
+    assert_true(Xt.data.min() > 0)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 3cf76187350f6..500a7c744bd5f 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -404,11 +404,20 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     dtype : type, optional
         Type of the matrix returned by fit_transform() or transform().
 
-    non_negative : boolean, default=False
-        Whether output matrices should contain non-negative values only;
-        effectively calls abs on the matrix prior to returning it.
-        When True, output values can be interpreted as frequencies.
-        When False, output values will have expected value zero.
+    alternate_sign : boolean, optional, default True
+        When True, an alternating sign is added to the features so as to
+        approximately conserve the inner product in the hashed space even for
+        small n_features. This approach is similar to sparse random projection.
+
+        .. versionadded:: 0.19
+
+    non_negative : boolean, optional, default False
+        When True, an absolute value is applied to the features matrix prior to
+        returning it. When used in conjunction with alternate_sign=True, this
+        significantly reduces the inner product preservation property.
+
+        .. deprecated:: 0.19
+            This option will be removed in 0.21.
 
     See also
     --------
@@ -420,8 +429,8 @@ def __init__(self, input='content', encoding='utf-8',
                  lowercase=True, preprocessor=None, tokenizer=None,
                  stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                  ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20),
-                 binary=False, norm='l2', non_negative=False,
-                 dtype=np.float64):
+                 binary=False, norm='l2', alternate_sign=True,
+                 non_negative=False, dtype=np.float64):
         self.input = input
         self.encoding = encoding
         self.decode_error = decode_error
@@ -436,6 +445,7 @@ def __init__(self, input='content', encoding='utf-8',
         self.ngram_range = ngram_range
         self.binary = binary
         self.norm = norm
+        self.alternate_sign = alternate_sign
         self.non_negative = non_negative
         self.dtype = dtype
 
@@ -496,6 +506,7 @@ def transform(self, X, y=None):
 
     def _get_hasher(self):
         return FeatureHasher(n_features=self.n_features,
                              input_type='string', dtype=self.dtype,
+                             alternate_sign=self.alternate_sign,
                              non_negative=self.non_negative)
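
Below is a minimal usage sketch of the new alternate_sign parameter on
FeatureHasher, assuming the patch above is applied. The corpus, the small
n_features value, and the printed values are illustrative only; which columns
collide and which signs are drawn depends on the hash function.

from sklearn.feature_extraction import FeatureHasher

corpus = [['cat', 'dog', 'cat']]

# Default behaviour: each hashed feature receives a pseudo-random +/-1 sign,
# which is what approximately preserves inner products in the hashed space
# (similar to a sparse random projection). Distinct tokens that collide in
# the same column may partially cancel out.
hasher = FeatureHasher(n_features=8, input_type='string')
X = hasher.fit_transform(corpus)
print(X.toarray())  # entries may be negative, e.g. -2.0 for 'cat'

# alternate_sign=False yields plain collision-summed counts, which stay
# non-negative as long as the input values are non-negative.
hasher = FeatureHasher(n_features=8, input_type='string',
                       alternate_sign=False)
print(hasher.fit_transform(corpus).toarray())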
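The whats_new entry above cites the HashingVectorizer + TfidfTransformer
pipeline as the use case this change unblocks; here is a sketch of it under
the same assumption that the patch is applied. Passing norm=None to the
hashing step is a choice made here so that normalization is deferred to the
TF-IDF step; it is not required by the patch itself.

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer

docs = ['the quick brown fox', 'the lazy dog', 'quick quick fox']

# With alternate_sign=False the hashed term counts are non-negative, so they
# are valid input for TfidfTransformer, which expects count-like features.
pipeline = make_pipeline(
    HashingVectorizer(n_features=2 ** 10, alternate_sign=False, norm=None),
    TfidfTransformer())
X = pipeline.fit_transform(docs)
print(X.shape)               # (3, 1024)
print((X.data >= 0).all())   # True: TF-IDF weights stay non-negative

Before this patch, the sign flipping in FeatureHasher was unconditional, so
the only way to obtain non-negative output was non_negative=True, which takes
absolute values after collisions have already mixed signs and thus distorts
the counts.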