
[MRG+1] FIX Hash collisions in the FeatureHasher #7565

Merged · 17 commits · Jun 8, 2017
9 changes: 9 additions & 0 deletions doc/whats_new.rst
@@ -284,9 +284,18 @@ Bug fixes
left `coef_` as a list, rather than an ndarray.
:issue:`8160` by :user:`CJ Carey <perimosocordiae>`.


- Fix a bug where :class:`sklearn.feature_extraction.FeatureHasher`
unconditionally applied a sparse random projection to the hashed
features, preventing the use of
:class:`sklearn.feature_extraction.text.HashingVectorizer` in a
pipeline with :class:`sklearn.feature_extraction.text.TfidfTransformer`
(a usage sketch follows this hunk).
:issue:`7513` by :user:`Roman Yurchak <rth>`.

- Fix a bug in cases where `numpy.cumsum` may be numerically unstable,
raising an exception if instability is identified. :issue:`7376` and
:issue:`7331` by `Joel Nothman`_ and :user:`yangarbiter`.

- Fix a bug where :meth:`sklearn.base.BaseEstimator.__getstate__`
obstructed pickling customizations of child-classes, when used in a
multiple inheritance context.
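The pipeline unblocked by this fix can be sketched as follows; this is a minimal example assuming scikit-learn 0.19 with the `alternate_sign` flag from this PR (the corpus and `n_features` value are illustrative):

```python
# Minimal sketch of the HashingVectorizer -> TfidfTransformer pipeline
# this fix enables; assumes scikit-learn >= 0.19 (this PR merged).
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline

docs = ["the cat sat on the mat", "the dog sat on the log"]

pipe = make_pipeline(
    # alternate_sign=False keeps hashed counts non-negative, which
    # TfidfTransformer requires; norm=None leaves normalization to tf-idf
    HashingVectorizer(n_features=2 ** 8, alternate_sign=False, norm=None),
    TfidfTransformer(),
)
X = pipe.fit_transform(docs)
print(X.shape)  # (2, 256)
```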
6 changes: 4 additions & 2 deletions sklearn/feature_extraction/_hashing.pyx
@@ -15,7 +15,7 @@ np.import_array()

@cython.boundscheck(False)
@cython.cdivision(True)
def transform(raw_X, Py_ssize_t n_features, dtype):
def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
"""Guts of FeatureHasher.transform.

Returns
@@ -63,7 +63,9 @@ def transform(raw_X, Py_ssize_t n_features, dtype):

array.resize_smart(indices, len(indices) + 1)
indices[len(indices) - 1] = abs(h) % n_features
value *= (h >= 0) * 2 - 1
# improve inner product preservation in the hashed space
if alternate_sign:
value *= (h >= 0) * 2 - 1
values[size] = value
size += 1

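For readers unfamiliar with Cython, here is a pure-Python sketch of the kernel above; `hash32` is a hypothetical stand-in for the signed 32-bit hash the real extension uses:

```python
# Pure-Python sketch of the transform kernel above (not the actual
# implementation); hash32 stands in for the real signed 32-bit hash.
def hashed_row(pairs, n_features, alternate_sign=True, hash32=hash):
    row = [0.0] * n_features
    for feature, value in pairs:
        h = hash32(feature)
        if alternate_sign:
            # flip the sign from the hash so colliding features cancel
            # in expectation, approximately preserving inner products
            value *= (h >= 0) * 2 - 1
        row[abs(h) % n_features] += value
    return row

# e.g. hashed_row([("foo", 1.0), ("bar", 2.0)], n_features=8)
```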
26 changes: 20 additions & 6 deletions sklearn/feature_extraction/hashing.py
@@ -2,6 +2,7 @@
# License: BSD 3 clause

import numbers
import warnings

import numpy as np
import scipy.sparse as sp
@@ -53,11 +54,17 @@ class FeatureHasher(BaseEstimator, TransformerMixin):
The feature_name is hashed to find the appropriate column for the
feature. The value's sign might be flipped in the output (but see
non_negative, below).
alternate_sign : boolean, optional, default True
When True, an alternating sign is added to the features so as to
approximately conserve the inner product in the hashed space, even for
small n_features. This approach is similar to sparse random projection.
non_negative : boolean, optional, default False
Whether output matrices should contain non-negative values only;
effectively calls abs on the matrix prior to returning it.
When True, output values can be interpreted as frequencies.
When False, output values will have expected value zero.
When True, an absolute value is applied to the features matrix prior to
returning it. When used in conjunction with alternate_sign=True, this
significantly weakens the preservation of inner products in the hashed
space.
.. deprecated:: 0.19
This option will be removed in 0.21.


Examples
--------
@@ -77,12 +84,17 @@
"""

def __init__(self, n_features=(2 ** 20), input_type="dict",
dtype=np.float64, non_negative=False):
dtype=np.float64, alternate_sign=True, non_negative=False):
self._validate_params(n_features, input_type)
if non_negative:
warnings.warn("the option non_negative=True has been deprecated"
" in 0.19 and will be removed"
" in version 0.21.", DeprecationWarning)

self.dtype = dtype
self.input_type = input_type
self.n_features = n_features
self.alternate_sign = alternate_sign
self.non_negative = non_negative

@staticmethod
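A quick sketch of the deprecation path added to `__init__` above (the warning check is illustrative, not part of this PR's test suite):

```python
# Sketch: constructing a FeatureHasher with the deprecated flag should
# emit the DeprecationWarning added in __init__ above.
import warnings
from sklearn.feature_extraction import FeatureHasher

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    FeatureHasher(non_negative=True)

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```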
@@ -139,7 +151,8 @@ def transform(self, raw_X, y=None):
elif self.input_type == "string":
raw_X = (((f, 1) for f in x) for x in raw_X)
indices, indptr, values = \
_hashing.transform(raw_X, self.n_features, self.dtype)
_hashing.transform(raw_X, self.n_features, self.dtype,
self.alternate_sign)
n_samples = indptr.shape[0] - 1

if n_samples == 0:
@@ -148,6 +161,7 @@
X = sp.csr_matrix((values, indices, indptr), dtype=self.dtype,
shape=(n_samples, self.n_features))
X.sum_duplicates() # also sorts the indices

if self.non_negative:
np.abs(X.data, X.data)
return X
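The `input_type` branches dispatched in `transform` above can be exercised like so (a sketch; the feature names and counts are illustrative):

```python
# Sketch of the input_type modes handled in transform() above.
from sklearn.feature_extraction import FeatureHasher

# dict mode (default): mapping of feature name -> value
X1 = FeatureHasher(n_features=16).transform([{"cat": 2, "dog": 1}])

# pair mode: iterables of (feature, value) pairs
X2 = FeatureHasher(n_features=16, input_type="pair").transform(
    [[("cat", 2), ("dog", 1)]])

# string mode: each token implicitly carries value 1, matching the
# (f, 1) generator expression in the code above
X3 = FeatureHasher(n_features=16, input_type="string").transform(
    [["cat", "cat", "dog"]])

assert (X1 != X2).nnz == 0  # dict and pair encodings agree
```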
46 changes: 45 additions & 1 deletion sklearn/feature_extraction/tests/test_feature_hasher.py
@@ -4,7 +4,8 @@
from numpy.testing import assert_array_equal

from sklearn.feature_extraction import FeatureHasher
from sklearn.utils.testing import assert_raises, assert_true, assert_equal
from sklearn.utils.testing import (assert_raises, assert_true, assert_equal,
ignore_warnings)


def test_feature_hasher_dicts():
@@ -106,3 +107,46 @@ def test_hasher_zeros():
# Assert that no zeros are materialized in the output.
X = FeatureHasher().transform([{'foo': 0}])
assert_equal(X.data.shape, (0,))


@ignore_warnings(category=DeprecationWarning)
def test_hasher_alternate_sign():
# the last two tokens produce a hash collision that sums to 0
X = [["foo", "bar", "baz", "investigation need", "records"]]

Xt = FeatureHasher(alternate_sign=True, non_negative=False,
input_type='string').fit_transform(X)
assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
# check that we have a collision that produces a 0 count
assert_true(len(Xt.data) < len(X[0]))
assert_true((Xt.data == 0.).any())

Xt = FeatureHasher(alternate_sign=True, non_negative=True,
input_type='string').fit_transform(X)

Member: This will raise a deprecation warning, right? Can you wrap it
using ignore_warnings(DeprecationWarning), maybe?
assert_true((Xt.data >= 0).all()) # all counts are positive
assert_true((Xt.data == 0.).any()) # we still have a collision
Xt = FeatureHasher(alternate_sign=False, non_negative=True,
input_type='string').fit_transform(X)
assert_true((Xt.data > 0).all()) # strictly positive counts
Xt_2 = FeatureHasher(alternate_sign=False, non_negative=False,
input_type='string').fit_transform(X)
# With initially positive features, the non_negative option should
# have no impact when alternate_sign=False
assert_array_equal(Xt.data, Xt_2.data)


@ignore_warnings(category=DeprecationWarning)
def test_hasher_negative():
X = [{"foo": 2, "bar": -4, "baz": -1}.items()]
Xt = FeatureHasher(alternate_sign=False, non_negative=False,
input_type="pair").fit_transform(X)
assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
Xt = FeatureHasher(alternate_sign=False, non_negative=True,
input_type="pair").fit_transform(X)
assert_true(Xt.data.min() > 0)
Xt = FeatureHasher(alternate_sign=True, non_negative=False,
input_type="pair").fit_transform(X)
assert_true(Xt.data.min() < 0 and Xt.data.max() > 0)
Xt = FeatureHasher(alternate_sign=True, non_negative=True,
input_type="pair").fit_transform(X)
assert_true(Xt.data.min() > 0)
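Token pairs like "investigation need" / "records" used in the tests above can be found by brute force; a sketch using only the public API and the explicit zeros that `sum_duplicates` leaves behind:

```python
# Brute-force sketch: find two tokens whose hashes land in the same
# bucket with opposite signs, cancelling to an explicit zero.
from itertools import combinations
from sklearn.feature_extraction import FeatureHasher

def find_cancelling_pair(tokens, n_features=2 ** 20):
    hasher = FeatureHasher(n_features=n_features, input_type="string")
    for a, b in combinations(tokens, 2):
        X = hasher.transform([[a, b]])
        # sum_duplicates leaves a stored 0 when +1 and -1 collide
        if (X.data == 0.).any():
            return a, b
    return None
```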
25 changes: 18 additions & 7 deletions sklearn/feature_extraction/text.py
@@ -404,11 +404,20 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
dtype : type, optional
Type of the matrix returned by fit_transform() or transform().

non_negative : boolean, default=False
Whether output matrices should contain non-negative values only;
effectively calls abs on the matrix prior to returning it.
When True, output values can be interpreted as frequencies.
When False, output values will have expected value zero.
alternate_sign : boolean, optional, default True
When True, an alternating sign is added to the features so as to
approximately conserve the inner product in the hashed space, even for
small n_features. This approach is similar to sparse random projection.
Member: you need a new feature (or feature added?) tag?

Member: (has been addressed below)

.. versionadded:: 0.19

non_negative : boolean, optional, default False
When True, an absolute value is applied to the features matrix prior to
returning it. When used in conjunction with alternate_sign=True, this
significantly weakens the preservation of inner products in the hashed
space.

.. deprecated:: 0.19
This option will be removed in 0.21.

See also
--------
@@ -420,8 +429,8 @@ def __init__(self, input='content', encoding='utf-8',
lowercase=True, preprocessor=None, tokenizer=None,
stop_words=None, token_pattern=r"(?u)\b\w\w+\b",
ngram_range=(1, 1), analyzer='word', n_features=(2 ** 20),
binary=False, norm='l2', non_negative=False,
dtype=np.float64):
binary=False, norm='l2', alternate_sign=True,
non_negative=False, dtype=np.float64):
self.input = input
self.encoding = encoding
self.decode_error = decode_error
@@ -436,6 +445,7 @@ def __init__(self, input='content', encoding='utf-8',
self.ngram_range = ngram_range
self.binary = binary
self.norm = norm
self.alternate_sign = alternate_sign
self.non_negative = non_negative
self.dtype = dtype

@@ -496,6 +506,7 @@ def transform(self, X, y=None):
def _get_hasher(self):
return FeatureHasher(n_features=self.n_features,
input_type='string', dtype=self.dtype,
alternate_sign=self.alternate_sign,
non_negative=self.non_negative)


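Finally, a short sketch of what the new flag changes at the vectorizer level, since `_get_hasher` above now forwards `alternate_sign` (the input text is illustrative):

```python
# Sketch: compare default signed output with alternate_sign=False.
from sklearn.feature_extraction.text import HashingVectorizer

text = ["hash the tokens then sum the buckets"]

X_signed = HashingVectorizer(n_features=2 ** 10, norm=None).transform(text)
X_counts = HashingVectorizer(n_features=2 ** 10, norm=None,
                             alternate_sign=False).transform(text)

print(X_signed.data)  # entries may be negative under sign alternation
assert (X_counts.data >= 0).all()  # plain counts, safe for tf-idf
```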