From d0ff000c7e8762242d997eabec950893a4605a61 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Mon, 3 Oct 2016 18:44:26 +0200
Subject: [PATCH 01/13] HashingVectorizer: optionally disable alternate signs

---
 sklearn/feature_extraction/_hashing.pyx | 5 +++--
 sklearn/feature_extraction/hashing.py   | 9 ++++++---
 sklearn/feature_extraction/text.py      | 2 +-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx
index 39c2b10378132..4a8d00a20f38b 100644
--- a/sklearn/feature_extraction/_hashing.pyx
+++ b/sklearn/feature_extraction/_hashing.pyx
@@ -15,7 +15,7 @@ np.import_array()

 @cython.boundscheck(False)
 @cython.cdivision(True)
-def transform(raw_X, Py_ssize_t n_features, dtype):
+def transform(raw_X, Py_ssize_t n_features, dtype, char alternate_sign):
     """Guts of FeatureHasher.transform.

     Returns
@@ -63,7 +63,8 @@ def transform(raw_X, Py_ssize_t n_features, dtype):
             array.resize_smart(indices, len(indices) + 1)
             indices[len(indices) - 1] = abs(h) % n_features
-            value *= (h >= 0) * 2 - 1
+            if alternate_sign:  # counter the effect of hash collision (issue #7513)
+                value *= (h >= 0) * 2 - 1
             values[size] = value
             size += 1
diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index 77ea749089d23..b3331e5252839 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -79,7 +79,9 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
     def __init__(self, n_features=(2 ** 20), input_type="dict",
                  dtype=np.float64, non_negative=False):
         self._validate_params(n_features, input_type)
-
+        if non_negative not in [True, False, 'total']:
+            raise ValueError("Invalid value for non_negative must be one of"
+                             " True, False, 'total' ")
         self.dtype = dtype
         self.input_type = input_type
         self.n_features = n_features
@@ -139,7 +141,8 @@ def transform(self, raw_X, y=None):
         elif self.input_type == "string":
             raw_X = (((f, 1) for f in x) for x in raw_X)
         indices, indptr, values = \
-            _hashing.transform(raw_X, self.n_features, self.dtype)
+            _hashing.transform(raw_X, self.n_features, self.dtype,
+                               self.non_negative != 'total')

         n_samples = indptr.shape[0] - 1
         if n_samples == 0:
@@ -148,6 +151,6 @@ def transform(self, raw_X, y=None):
         X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype,
                           shape=(n_samples, self.n_features))
         X.sum_duplicates()  # also sorts the indices
-        if self.non_negative:
+        if self.non_negative is True:  # if non_negative == 'total', X > 0 anyway
             np.abs(X.data, X.data)
         return X
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index f5b548a5278cd..7443c706d782e 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -405,7 +405,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     dtype : type, optional
         Type of the matrix returned by fit_transform() or transform().

-    non_negative : boolean, default=False
+    non_negative : boolean or 'total', default=False
         Whether output matrices should contain non-negative values only;
         effectively calls abs on the matrix prior to returning it.
         When True, output values can be interpreted as frequencies.
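For context between patches: a minimal standalone sketch of the signed-hashing
scheme that the hunk above makes optional. Each token is hashed to a bucket,
and the sign of the hash decides whether its count is added or subtracted, so
colliding tokens cancel in expectation instead of inflating a shared bucket.
Python's built-in hash() stands in here for the MurmurHash3 function the
Cython code uses, so bucket positions are illustrative only.

    def hashed_counts(tokens, n_features=16, alternate_sign=True):
        # Accumulate token counts into a fixed-size vector of hash buckets.
        x = [0] * n_features
        for token in tokens:
            h = hash(token)
            value = 1
            if alternate_sign:
                # same expression as in _hashing.pyx: maps sign(h) to +1/-1
                value *= (h >= 0) * 2 - 1
            x[abs(h) % n_features] += value
        return x

    print(hashed_counts(["foo", "bar", "baz"]))                        # mixed signs
    print(hashed_counts(["foo", "bar", "baz"], alternate_sign=False))  # counts >= 0

With the flip disabled (the non_negative='total' path introduced above), every
collision adds up and biases inner products between hashed vectors upward;
with it enabled, collision terms have expected value zero.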
From 0d9919f47e9563a5842a150a1741e7f8b7eaec42 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Mon, 3 Oct 2016 19:07:11 +0200
Subject: [PATCH 02/13] HashingVectorizer updating documentation

---
 sklearn/feature_extraction/hashing.py | 13 +++++++------
 sklearn/feature_extraction/text.py    | 11 ++++++-----
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index b3331e5252839..a9a7a35c6a1a0 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -53,11 +53,12 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
         The feature_name is hashed to find the appropriate column for the
         feature. The value's sign might be flipped in the output (but see
         non_negative, below).
-    non_negative : boolean, optional, default False
-        Whether output matrices should contain non-negative values only;
-        effectively calls abs on the matrix prior to returning it.
-        When True, output values can be interpreted as frequencies.
-        When False, output values will have expected value zero.
+    non_negative : boolean or 'total', optional, default False
+        When True or False, an alternating sign is added to the counts as to
+        approximately conserve the inner product in the hashed space.
+        When True, an absolute value is additionally applied to the result
+        prior to returning it.
+        When 'total' all counts are positive which disables collision handling.

     Examples
     --------
@@ -151,6 +152,6 @@ def transform(self, raw_X, y=None):
         X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype,
                           shape=(n_samples, self.n_features))
         X.sum_duplicates()  # also sorts the indices
-        if self.non_negative is True:  # if non_negative == 'total', X > 0 anyway
+        if self.non_negative is True:  # if non_negative == 'total', X>0 anyway
             np.abs(X.data, X.data)
         return X
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 7443c706d782e..6bc6cdbafb692 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -405,11 +405,12 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     dtype : type, optional
         Type of the matrix returned by fit_transform() or transform().

-    non_negative : boolean or 'total', default=False
-        Whether output matrices should contain non-negative values only;
-        effectively calls abs on the matrix prior to returning it.
-        When True, output values can be interpreted as frequencies.
-        When False, output values will have expected value zero.
+    non_negative : boolean or 'total', optional, default=False
+        When True or False, an alternating sign is added to the counts as to
+        approximately conserve the inner product in the hashed space.
+        When True, an absolute value is additionally applied to the result
+        prior to returning it.
+        When 'total' all counts are positive which disables collision handling.
     See also
     --------

From 28d055f7b039c2271e36d923175f19657b85394b Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Mon, 3 Oct 2016 19:24:26 +0200
Subject: [PATCH 03/13] Adding unit tests for non_negative

---
 sklearn/feature_extraction/hashing.py |  2 +-
 .../tests/test_feature_hasher.py      | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index a9a7a35c6a1a0..04dbfb98b8657 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -82,7 +82,7 @@ def __init__(self, n_features=(2 ** 20), input_type="dict",
         self._validate_params(n_features, input_type)
         if non_negative not in [True, False, 'total']:
             raise ValueError("Invalid value for non_negative must be one of"
-                             " True, False, 'total' ")
+                             " True, False, 'total'.")
         self.dtype = dtype
         self.input_type = input_type
         self.n_features = n_features
diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
index c4905b9101ce2..72c6c2318d408 100644
--- a/sklearn/feature_extraction/tests/test_feature_hasher.py
+++ b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -106,3 +106,21 @@ def test_hasher_zeros():
     # Assert that no zeros are materialized in the output.
     X = FeatureHasher().transform([{'foo': 0}])
     assert_equal(X.data.shape, (0,))
+
+
+def test_hasher_non_negative():
+    raw_X = [["foo", "bar", "baz"]]
+
+    def it():  # iterable
+        return (x for x in raw_X)
+
+    X = FeatureHasher(non_negative=False,
+                      input_type='string').fit_transform(it())
+    assert_true((X.data > 0).any() and (X.data < 0).any())
+    X = FeatureHasher(non_negative=True,
+                      input_type='string').fit_transform(it())
+    assert_true((X.data >= 0).all())  # zeros are acceptable
+    X = FeatureHasher(non_negative='total',
+                      input_type='string').fit_transform(it())
+    assert_true((X.data > 0).all())  # strictly positive counts
+    assert_raises(ValueError, FeatureHasher, non_negative=None)

From 06779c030cf8cbc46541c0bea38ef1d2a02ca8fa Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Wed, 2 Nov 2016 13:11:58 +0100
Subject: [PATCH 04/13] Addressing review comments

---
 sklearn/feature_extraction/_hashing.pyx |  5 +++--
 sklearn/feature_extraction/hashing.py   | 23 ++++++++++++++++++-----
 .../tests/test_feature_hasher.py        |  9 +++------
 sklearn/feature_extraction/text.py      | 14 +++++++++-----
 4 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx
index 4a8d00a20f38b..465f8548d2338 100644
--- a/sklearn/feature_extraction/_hashing.pyx
+++ b/sklearn/feature_extraction/_hashing.pyx
@@ -15,7 +15,7 @@ np.import_array()

 @cython.boundscheck(False)
 @cython.cdivision(True)
-def transform(raw_X, Py_ssize_t n_features, dtype, char alternate_sign):
+def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign):
     """Guts of FeatureHasher.transform.

     Returns
@@ -63,7 +63,8 @@ def transform(raw_X, Py_ssize_t n_features, dtype, char alternate_sign):
             array.resize_smart(indices, len(indices) + 1)
             indices[len(indices) - 1] = abs(h) % n_features
-            if alternate_sign:  # counter the effect of hash collision (issue #7513)
+            if alternate_sign:  # improve inner product preservation
+                # in the hashed space (issue #7513)
                 value *= (h >= 0) * 2 - 1
             values[size] = value
             size += 1
diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index 04dbfb98b8657..0e4e594858ddb 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -2,6 +2,7 @@
 # License: BSD 3 clause

 import numbers
+import warnings

 import numpy as np
 import scipy.sparse as sp
@@ -54,11 +55,15 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
         feature. The value's sign might be flipped in the output (but see
         non_negative, below).
     non_negative : boolean or 'total', optional, default False
-        When True or False, an alternating sign is added to the counts as to
-        approximately conserve the inner product in the hashed space.
-        When True, an absolute value is additionally applied to the result
-        prior to returning it.
-        When 'total' all counts are positive which disables collision handling.
+        When False, an alternating sign is added to the features as to
+        approximately conserve the inner product in the hashed space even for
+        small n_features in a similar approach to the sparse random projection.
+        When True, the behaviour is identical to the one with (False) with an
+        additional absolute value applied to the result prior to returning it.
+        This significantly reduces the inner product preservation property and
+        is deprecated as of 0.19.
+        When 'total' all counts are positive and for small n_features values
+        the inner product will not be preserved in the hashed space.

     Examples
     --------
@@ -83,6 +88,12 @@ def __init__(self, n_features=(2 ** 20), input_type="dict",
         if non_negative not in [True, False, 'total']:
             raise ValueError("Invalid value for non_negative must be one of"
                              " True, False, 'total'.")
+        if non_negative in ['total', True]:
+            warnings.warn("the option non_negative=True has been deprecated"
+                          " in 0.19. As of 0.21 non_negative='total' would be"
+                          " renamed to non_negative=True.",
+                          DeprecationWarning)
+
         self.dtype = dtype
         self.input_type = input_type
         self.n_features = n_features
@@ -153,5 +164,7 @@ def transform(self, raw_X, y=None):
                           shape=(n_samples, self.n_features))
         X.sum_duplicates()  # also sorts the indices
         if self.non_negative is True:  # if non_negative == 'total', X>0 anyway
+            # this is deprecated as of 0.19
+            # and should be removed in 0.21
             np.abs(X.data, X.data)
         return X
diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
index 72c6c2318d408..ef9706bc52150 100644
--- a/sklearn/feature_extraction/tests/test_feature_hasher.py
+++ b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -111,16 +111,13 @@ def test_hasher_zeros():
 def test_hasher_non_negative():
     raw_X = [["foo", "bar", "baz"]]

-    def it():  # iterable
-        return (x for x in raw_X)
-
     X = FeatureHasher(non_negative=False,
-                      input_type='string').fit_transform(it())
+                      input_type='string').fit_transform(raw_X)
     assert_true((X.data > 0).any() and (X.data < 0).any())
     X = FeatureHasher(non_negative=True,
-                      input_type='string').fit_transform(it())
+                      input_type='string').fit_transform(raw_X)
     assert_true((X.data >= 0).all())  # zeros are acceptable
     X = FeatureHasher(non_negative='total',
-                      input_type='string').fit_transform(it())
+                      input_type='string').fit_transform(raw_X)
     assert_true((X.data > 0).all())  # strictly positive counts
     assert_raises(ValueError, FeatureHasher, non_negative=None)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 6bc6cdbafb692..b2060757cc44c 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -406,11 +406,15 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     dtype : type, optional
         Type of the matrix returned by fit_transform() or transform().

     non_negative : boolean or 'total', optional, default=False
-        When True or False, an alternating sign is added to the counts as to
-        approximately conserve the inner product in the hashed space.
-        When True, an absolute value is additionally applied to the result
-        prior to returning it.
-        When 'total' all counts are positive which disables collision handling.
+        When False, an alternating sign is added to the features as to
+        approximately conserve the inner product in the hashed space even for
+        small n_features in a similar approach to the sparse random projection.
+        When True, the behaviour is identical to the one with (False) with an
+        additional absolute value applied to the result prior to returning it.
+        This significantly reduces the inner product preservation property and
+        is deprecated as of 0.19.
+        When 'total' all counts are positive and for small n_features values
+        the inner product will not be preserved in the hashed space.

     See also
     --------

From 9329ba7329ec36fe9267ec7c0a3a7fa6734d71fb Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Wed, 2 Nov 2016 13:36:26 +0100
Subject: [PATCH 05/13] Addressing more review comments

---
 sklearn/feature_extraction/hashing.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index 0e4e594858ddb..419c68bb2e5f0 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -88,11 +88,11 @@ def __init__(self, n_features=(2 ** 20), input_type="dict",
         self._validate_params(n_features, input_type)
         if non_negative not in [True, False, 'total']:
             raise ValueError("Invalid value for non_negative must be one of"
                              " True, False, 'total'.")
-        if non_negative in ['total', True]:
+        if non_negative is True:
             warnings.warn("the option non_negative=True has been deprecated"
-                          " in 0.19. As of 0.21 non_negative='total' would be"
-                          " renamed to non_negative=True.",
-                          DeprecationWarning)
+                          " in 0.19. From version 0.21, non_negative=True"
+                          " will be interpreted as"
+                          " non_negative='total'.", DeprecationWarning)

         self.dtype = dtype
         self.input_type = input_type

From 34f08a79c3402c37ed17f5c6471a29dc4ad60a9a Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Wed, 2 Nov 2016 13:44:45 +0100
Subject: [PATCH 06/13] Missed a few review comments

---
 .../tests/test_feature_hasher.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
index ef9706bc52150..9d1b2a976b955 100644
--- a/sklearn/feature_extraction/tests/test_feature_hasher.py
+++ b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -109,15 +109,15 @@ def test_hasher_zeros():

 def test_hasher_non_negative():
-    raw_X = [["foo", "bar", "baz"]]
-
-    X = FeatureHasher(non_negative=False,
-                      input_type='string').fit_transform(raw_X)
-    assert_true((X.data > 0).any() and (X.data < 0).any())
-    X = FeatureHasher(non_negative=True,
-                      input_type='string').fit_transform(raw_X)
-    assert_true((X.data >= 0).all())  # zeros are acceptable
-    X = FeatureHasher(non_negative='total',
-                      input_type='string').fit_transform(raw_X)
-    assert_true((X.data > 0).all())  # strictly positive counts
+    X = [["foo", "bar", "baz"]]
+
+    Xt = FeatureHasher(non_negative=False,
+                       input_type='string').fit_transform(X)
+    assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
+    Xt = FeatureHasher(non_negative=True,
+                       input_type='string').fit_transform(X)
+    assert_true((Xt.data >= 0).all())  # zeros are acceptable
+    Xt = FeatureHasher(non_negative='total',
+                       input_type='string').fit_transform(X)
+    assert_true((Xt.data > 0).all())  # strictly positive counts
     assert_raises(ValueError, FeatureHasher, non_negative=None)

From 2b50a903b3c917216c1276099f08cf8a244dbb3e Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Wed, 2 Nov 2016 14:11:38 +0100
Subject: [PATCH 07/13] Adding an example that produces a hash collision

---
 sklearn/feature_extraction/hashing.py |  5 ++---
 .../tests/test_feature_hasher.py      | 18 ++++++++++++------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index 419c68bb2e5f0..69d905574d55a 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -163,8 +163,7 @@ def transform(self, raw_X, y=None):
         X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype,
                           shape=(n_samples, self.n_features))
         X.sum_duplicates()  # also sorts the indices
-        if self.non_negative is True:  # if non_negative == 'total', X>0 anyway
-            # this is deprecated as of 0.19
-            # and should be removed in 0.21
+
+        if self.non_negative in [True, 'total']:
             np.abs(X.data, X.data)
         return X
diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
index 9d1b2a976b955..adef4f57b0820 100644
--- a/sklearn/feature_extraction/tests/test_feature_hasher.py
+++ b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -109,15 +109,21 @@ def test_hasher_zeros():

 def test_hasher_non_negative():
-    X = [["foo", "bar", "baz"]]
+    X = [["foo", "bar", "baz", "investigation need", "records"]]
+    # last two tokens produce a hash collision that sums as 0

     Xt = FeatureHasher(non_negative=False,
-                       input_type='string').fit_transform(X)
+                      input_type='string').fit_transform(X)
     assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
+    # check that we have a collision that produces a 0 count
+    assert_true(len(Xt.data) < len(X[0]))
+    assert_true((Xt.data == 0.).any())
+
     Xt = FeatureHasher(non_negative=True,
-                       input_type='string').fit_transform(X)
-    assert_true((Xt.data >= 0).all())  # zeros are acceptable
+                      input_type='string').fit_transform(X)
+    assert_true((Xt.data >= 0).all())  # all counts are positive
+    assert_true((Xt.data == 0.).any())  # we still have a collision
     Xt = FeatureHasher(non_negative='total',
-                       input_type='string').fit_transform(X)
-    assert_true((Xt.data > 0).all())  # strictly positive counts
+                      input_type='string').fit_transform(X)
+    assert_true((Xt.data > 0).all())  # strictly positive counts
     assert_raises(ValueError, FeatureHasher, non_negative=None)

From 1364bf3e992ed5b3fec05555091e3f0591aaad6d Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Sun, 6 Nov 2016 22:17:35 +0100
Subject: [PATCH 08/13] Adding an alternate_sign parameter to the FeatureHasher

---
 sklearn/feature_extraction/hashing.py | 35 +++++++++----------
 .../tests/test_feature_hasher.py      | 12 ++++----
 sklearn/feature_extraction/text.py    | 25 ++++++-------
 3 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index 69d905574d55a..9c4b6d3b618ad 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -54,16 +54,16 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
         The feature_name is hashed to find the appropriate column for the
         feature. The value's sign might be flipped in the output (but see
         non_negative, below).
-    non_negative : boolean or 'total', optional, default False
-        When False, an alternating sign is added to the features as to
+    alternate_sign : boolean, optional, default True
+        When True, an alternating sign is added to the features as to
         approximately conserve the inner product in the hashed space even for
-        small n_features in a similar approach to the sparse random projection.
-        When True, the behaviour is identical to the one with (False) with an
-        additional absolute value applied to the result prior to returning it.
-        This significantly reduces the inner product preservation property and
-        is deprecated as of 0.19.
-        When 'total' all counts are positive and for small n_features values
-        the inner product will not be preserved in the hashed space.
+        small n_features. This approach is similar to sparse random projection.
+    non_negative : boolean, optional, default False
+        When True, an absolute value is applied to the features matrix prior to
+        returning it. When used in conjunction with alternate_sign=True, this
+        significantly reduces the inner product preservation property.
+        This option is deprecated as of 0.19.
+

     Examples
     --------
@@ -83,20 +83,17 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
     """
     def __init__(self, n_features=(2 ** 20), input_type="dict",
-                 dtype=np.float64, non_negative=False):
+                 dtype=np.float64, alternate_sign=True, non_negative=False):
         self._validate_params(n_features, input_type)
-        if non_negative not in [True, False, 'total']:
-            raise ValueError("Invalid value for non_negative must be one of"
-                             " True, False, 'total'.")
-        if non_negative is True:
+        if non_negative:
             warnings.warn("the option non_negative=True has been deprecated"
-                          " in 0.19. From version 0.21, non_negative=True"
-                          " will be interpreted as"
-                          " non_negative='total'.", DeprecationWarning)
+                          " in 0.19 and will be removed"
+                          " in version 0.21.", DeprecationWarning)

         self.dtype = dtype
         self.input_type = input_type
         self.n_features = n_features
+        self.alternate_sign = alternate_sign
         self.non_negative = non_negative

     @staticmethod
@@ -144,7 +141,7 @@ def transform(self, raw_X, y=None):
         elif self.input_type == "string":
             raw_X = (((f, 1) for f in x) for x in raw_X)
         indices, indptr, values = \
             _hashing.transform(raw_X, self.n_features, self.dtype,
-                               self.non_negative != 'total')
+                               self.alternate_sign)

         n_samples = indptr.shape[0] - 1
         if n_samples == 0:
@@ -162,6 +159,6 @@ def transform(self, raw_X, y=None):
         X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype,
                           shape=(n_samples, self.n_features))
         X.sum_duplicates()  # also sorts the indices

-        if self.non_negative in [True, 'total']:
+        if self.non_negative:
             np.abs(X.data, X.data)
         return X
diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
index adef4f57b0820..e51de84ebba9b 100644
--- a/sklearn/feature_extraction/tests/test_feature_hasher.py
+++ b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -112,18 +112,22 @@ def test_hasher_non_negative():
     X = [["foo", "bar", "baz", "investigation need", "records"]]
     # last two tokens produce a hash collision that sums as 0

-    Xt = FeatureHasher(non_negative=False,
+    Xt = FeatureHasher(alternate_sign=True, non_negative=False,
                       input_type='string').fit_transform(X)
     assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
     # check that we have a collision that produces a 0 count
     assert_true(len(Xt.data) < len(X[0]))
     assert_true((Xt.data == 0.).any())

-    Xt = FeatureHasher(non_negative=True,
+    Xt = FeatureHasher(alternate_sign=True, non_negative=True,
                       input_type='string').fit_transform(X)
     assert_true((Xt.data >= 0).all())  # all counts are positive
     assert_true((Xt.data == 0.).any())  # we still have a collision
-    Xt = FeatureHasher(non_negative='total',
+    Xt = FeatureHasher(alternate_sign=False, non_negative=True,
                       input_type='string').fit_transform(X)
     assert_true((Xt.data > 0).all())  # strictly positive counts
-    assert_raises(ValueError, FeatureHasher, non_negative=None)
+    Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False,
+                         input_type='string').fit_transform(X)
+    # With initially positive features, the non_negative option should
+    # have no impact when alternate_sign=False
+    assert_equal(Xt.data, Xt_2.data)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index b2060757cc44c..b47a55d25c72b 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -29,7 +29,6 @@
 from ..preprocessing import normalize
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
-from ..utils import deprecated
 from ..utils.fixes import frombuffer_empty, bincount
 from ..utils.validation import check_is_fitted

@@ -405,16 +404,16 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     dtype : type, optional
         Type of the matrix returned by fit_transform() or transform().

-    non_negative : boolean or 'total', optional, default=False
-        When False, an alternating sign is added to the features as to
+    alternate_sign : boolean, optional, default True
+        When True, an alternating sign is added to the features as to
         approximately conserve the inner product in the hashed space even for
-        small n_features in a similar approach to the sparse random projection.
-        When True, the behaviour is identical to the one with (False) with an
-        additional absolute value applied to the result prior to returning it.
-        This significantly reduces the inner product preservation property and
-        is deprecated as of 0.19.
-        When 'total' all counts are positive and for small n_features values
-        the inner product will not be preserved in the hashed space.
+        small n_features. This approach is similar to sparse random projection.
+
+    non_negative : boolean, optional, default False
+        When True, an absolute value is applied to the features matrix prior to
+        returning it. When used in conjunction with alternate_sign=True, this
+        significantly reduces the inner product preservation property.
+        This option is deprecated as of 0.19.

     See also
     --------
@@ -426,8 +425,8 @@ def __init__(self, input='content', encoding='utf-8',
                  lowercase=True, preprocessor=None, tokenizer=None,
                  stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
                  ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20),
-                 binary=False, norm='l2', non_negative=False,
-                 dtype=np.float64):
+                 binary=False, norm='l2', alternate_sign=True,
+                 non_negative=False, dtype=np.float64):
         self.input = input
         self.encoding = encoding
         self.decode_error = decode_error
@@ -442,6 +441,7 @@ def __init__(self, input='content', encoding='utf-8',
         self.ngram_range = ngram_range
         self.binary = binary
         self.norm = norm
+        self.alternate_sign = alternate_sign
         self.non_negative = non_negative
         self.dtype = dtype

@@ -502,6 +502,7 @@ def transform(self, X, y=None):
     def _get_hasher(self):
         return FeatureHasher(n_features=self.n_features,
                              input_type='string', dtype=self.dtype,
+                             alternate_sign=self.alternate_sign,
                              non_negative=self.non_negative)

From b960a19c48f4e0991f96cfc7737268abb9753c26 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Sun, 6 Nov 2016 22:48:53 +0100
Subject: [PATCH 09/13] Adding a mixed-sign test and deprecated directives

---
 sklearn/feature_extraction/hashing.py |  3 ++-
 .../tests/test_feature_hasher.py      | 24 ++++++++++++++++++++----
 sklearn/feature_extraction/text.py    |  4 +++-
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/sklearn/feature_extraction/hashing.py b/sklearn/feature_extraction/hashing.py
index 9c4b6d3b618ad..6cbf1dde0afc6 100644
--- a/sklearn/feature_extraction/hashing.py
+++ b/sklearn/feature_extraction/hashing.py
@@ -62,7 +62,8 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
         When True, an absolute value is applied to the features matrix prior to
         returning it. When used in conjunction with alternate_sign=True, this
         significantly reduces the inner product preservation property.
-        This option is deprecated as of 0.19.
+        .. deprecated:: 0.19
+           This option will be removed in 0.21.

     Examples
diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
index e51de84ebba9b..c76513d29bfc6 100644
--- a/sklearn/feature_extraction/tests/test_feature_hasher.py
+++ b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -108,11 +108,11 @@ def test_hasher_zeros():
     assert_equal(X.data.shape, (0,))


-def test_hasher_non_negative():
+def test_hasher_alternate_sign():
     X = [["foo", "bar", "baz", "investigation need", "records"]]
-    # last two tokens produce a hash collision that sums as 0
+    # the last two tokens produce a hash collision that sums as 0

-    Xt = FeatureHasher(alternate_sign=True, non_negative=False,
+    Xt = FeatureHasher(alternate_sign=True, non_negative=False,
                        input_type='string').fit_transform(X)
     assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
     # check that we have a collision that produces a 0 count
@@ -130,4 +130,20 @@ def test_hasher_non_negative():
                        input_type='string').fit_transform(X)
     # With initially positive features, the non_negative option should
     # have no impact when alternate_sign=False
-    assert_equal(Xt.data, Xt_2.data)
+    assert_array_equal(Xt.data, Xt_2.data)
+
+
+def test_hasher_negative():
+    X = [{"foo": 2, "bar": -4, "baz": -1}.items()]
+    Xt = FeatureHasher(alternate_sign=False, non_negative=False,
+                       input_type="pair").fit_transform(X)
+    assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
+    Xt = FeatureHasher(alternate_sign=False, non_negative=True,
+                       input_type="pair").fit_transform(X)
+    assert_true(Xt.data.min() > 0)
+    Xt = FeatureHasher(alternate_sign=True, non_negative=False,
+                       input_type="pair").fit_transform(X)
+    assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
+    Xt = FeatureHasher(alternate_sign=True, non_negative=True,
+                       input_type="pair").fit_transform(X)
+    assert_true(Xt.data.min() > 0)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index b47a55d25c72b..8d1b79fd668fc 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -413,7 +413,9 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
         When True, an absolute value is applied to the features matrix prior to
         returning it. When used in conjunction with alternate_sign=True, this
         significantly reduces the inner product preservation property.
-        This option is deprecated as of 0.19.
+
+        .. deprecated:: 0.19
+           This option will be removed in 0.21.

     See also
     --------

From b390f4cef2817f895010f4bd339229046aebea96 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Mon, 23 Jan 2017 16:40:38 +0100
Subject: [PATCH 10/13] Added an entry in what's new

---
 doc/whats_new.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 9d76f5377c0e1..51d6aca5f326b 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -198,6 +198,12 @@ Bug fixes
      left `coef_` as a list, rather than an ndarray.
      :issue:`8160` by :user:`CJ Carey `.

+   - Fix a bug where `sklearn.feature_extraction.FeatureHasher` mandatorily
+     applied a sparse random projection to the hashed features, preventing
+     the use of `sklearn.feature_extraction.text.HashingVectorizer` in a
+     pipeline with `sklearn.feature_extraction.text.TfidfTransformer`.
+     :issue:`3637`, :issue:`7513` by `Roman Yurchal `.
+

 API changes summary
 -------------------

From a85f1abf441ec8b008796231af8487c62a2730fc Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Fri, 27 Jan 2017 14:11:57 +0100
Subject: [PATCH 11/13] Addressing review comments

---
 doc/whats_new.rst                       | 2 +-
 sklearn/feature_extraction/_hashing.pyx | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index 51d6aca5f326b..185662ee82411 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -202,7 +202,7 @@ Bug fixes
      applied a sparse random projection to the hashed features, preventing
      the use of `sklearn.feature_extraction.text.HashingVectorizer` in a
      pipeline with `sklearn.feature_extraction.text.TfidfTransformer`.
-     :issue:`3637`, :issue:`7513` by `Roman Yurchal `.
+     :issue:`7513` by `Roman Yurchal `.

 API changes summary
 -------------------
diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx
index 465f8548d2338..3305e6be2bbd9 100644
--- a/sklearn/feature_extraction/_hashing.pyx
+++ b/sklearn/feature_extraction/_hashing.pyx
@@ -63,8 +63,8 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign):
             array.resize_smart(indices, len(indices) + 1)
             indices[len(indices) - 1] = abs(h) % n_features
-            if alternate_sign:  # improve inner product preservation
-                # in the hashed space (issue #7513)
+            # improve inner product preservation in the hashed space
+            if alternate_sign:
                 value *= (h >= 0) * 2 - 1
             values[size] = value
             size += 1

From f3e0d25cf7b1883bd6bb204f35d5384b0c3e1081 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Thu, 30 Mar 2017 17:28:16 +0200
Subject: [PATCH 12/13] Fixed rst formatting

---
 doc/whats_new.rst | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index ea9e743fc4c64..2204b467cd336 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -240,11 +240,12 @@ Bug fixes
      :issue:`8160` by :user:`CJ Carey `.

-   - Fix a bug where `sklearn.feature_extraction.FeatureHasher` mandatorily
-     applied a sparse random projection to the hashed features, preventing
-     the use of `sklearn.feature_extraction.text.HashingVectorizer` in a
-     pipeline with `sklearn.feature_extraction.text.TfidfTransformer`.
-     :issue:`7513` by `Roman Yurchal `.
+   - Fix a bug where :class:`sklearn.feature_extraction.FeatureHasher`
+     mandatorily applied a sparse random projection to the hashed features,
+     preventing the use of
+     :class:`sklearn.feature_extraction.text.HashingVectorizer` in a
+     pipeline with :class:`sklearn.feature_extraction.text.TfidfTransformer`.
+     :issue:`7513` by :user:`Roman Yurchak `.

    - Fix a bug in cases where `numpy.cumsum` may be numerically unstable,
      raising an exception if instability is identified. :issue:`7376` and

From c51b45bf9276c1426818ddef1070b8c559ec7a50 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Thu, 8 Jun 2017 08:34:10 +0200
Subject: [PATCH 13/13] Fixing review comments

---
 sklearn/feature_extraction/_hashing.pyx                 | 2 +-
 sklearn/feature_extraction/tests/test_feature_hasher.py | 2 +-
 sklearn/feature_extraction/text.py                      | 2 ++
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx
index 3305e6be2bbd9..eaca9dc78aea7 100644
--- a/sklearn/feature_extraction/_hashing.pyx
+++ b/sklearn/feature_extraction/_hashing.pyx
@@ -15,7 +15,7 @@ np.import_array()

 @cython.boundscheck(False)
 @cython.cdivision(True)
-def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign):
+def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
     """Guts of FeatureHasher.transform.

     Returns
diff --git a/sklearn/feature_extraction/tests/test_feature_hasher.py b/sklearn/feature_extraction/tests/test_feature_hasher.py
index c76513d29bfc6..6ee58bf46ac60 100644
--- a/sklearn/feature_extraction/tests/test_feature_hasher.py
+++ b/sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -109,8 +109,8 @@ def test_hasher_zeros():

 def test_hasher_alternate_sign():
-    X = [["foo", "bar", "baz", "investigation need", "records"]]
     # the last two tokens produce a hash collision that sums as 0
+    X = [["foo", "bar", "baz", "investigation need", "records"]]

     Xt = FeatureHasher(alternate_sign=True, non_negative=False,
                        input_type='string').fit_transform(X)
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 8d1b79fd668fc..9eed61cfec5bd 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -409,6 +409,8 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
         approximately conserve the inner product in the hashed space even for
         small n_features. This approach is similar to sparse random projection.

+        .. versionadded:: 0.19
+
     non_negative : boolean, optional, default False
         When True, an absolute value is applied to the features matrix prior to
         returning it. When used in conjunction with alternate_sign=True, this
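
Taken together, the series yields the following usage; a hedged sketch
assuming a scikit-learn build with all thirteen patches applied (the input
texts and n_features value are arbitrary illustrative choices):

    from sklearn.feature_extraction.text import HashingVectorizer

    texts = ["the quick brown fox", "jumped over the lazy dog"]

    # Default after this series: alternate_sign=True, i.e. signed hashing,
    # which approximately preserves inner products between documents.
    X_signed = HashingVectorizer(n_features=2 ** 8).fit_transform(texts)

    # Plain non-negative counts, e.g. for a downstream TfidfTransformer;
    # this disables the collision-cancelling sign flip.
    X_counts = HashingVectorizer(n_features=2 ** 8, alternate_sign=False,
                                 norm=None).fit_transform(texts)
    assert (X_counts.data >= 0).all()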