MRG feature hashing transformer #909

Merged: 7 commits, Nov 18, 2012
1 change: 1 addition & 0 deletions .gitattributes
@@ -4,6 +4,7 @@
/sklearn/cluster/_k_means.c -diff
/sklearn/datasets/_svmlight_format.c -diff
/sklearn/ensemble/_gradient_boosting.c -diff
/sklearn/feature_extraction/_hashing.c -diff
/sklearn/linear_model/cd_fast.c -diff
/sklearn/linear_model/sgd_fast.c -diff
/sklearn/neighbors/ball_tree.c -diff
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -281,6 +281,7 @@ Samples generator
:template: class.rst

feature_extraction.DictVectorizer
feature_extraction.FeatureHasher

From images
-----------
86 changes: 86 additions & 0 deletions doc/modules/feature_extraction.rst
@@ -102,6 +102,92 @@ memory the ``DictVectorizer`` class uses a ``scipy.sparse`` matrix by
default instead of a ``numpy.ndarray``.


.. _feature_hashing:

Feature hashing
===============

.. currentmodule:: sklearn.feature_extraction

The class :class:`FeatureHasher` is a high-speed, low-memory vectorizer that
uses a technique known as
`feature hashing <https://en.wikipedia.org/wiki/Feature_hashing>`_,
or the "hashing trick".
Instead of building a hash table of the features encountered in training,
as the other vectorizers in this module do, instances of :class:`FeatureHasher`
apply a hash function to the features
to determine their column index in sample matrices directly.
The result is increased speed and reduced memory usage,
at the expense of inspectability;
the hasher does not remember what the input features looked like
and has no ``inverse_transform`` method.

Since the hash function might cause collisions between (unrelated) features,
a signed hash function is used and the sign of the hash value
determines the sign of the value stored in the output matrix for a feature;
this way, collisions are likely to cancel out rather than accumulate error,
and the expected mean of any output feature's value is zero.
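
As an illustration, here is a minimal sketch of this signed hashing scheme
(a toy stand-in only: it uses Python's built-in ``hash``, whereas the actual
implementation uses MurmurHash)::

    def hashing_trick(pairs, n_features):
        # Toy version of the scheme described above, not FeatureHasher itself.
        x = [0] * n_features
        for feature, value in pairs:
            h = hash(feature)                # signed hash of the feature name
            index = abs(h) % n_features      # column index from the hash
            sign = 1 if h >= 0 else -1       # sign taken from the hash value
            x[index] += sign * value         # colliding features may cancel out
        return x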

If ``non_negative=True`` is passed to the constructor,
the absolute value is taken.
This undoes some of the collision handling,
but allows the output to be passed to estimators like :class:`MultinomialNB`
or ``chi2`` feature selectors that expect non-negative inputs.
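
For example, a small sketch (the feature dicts here are made up)::

    from sklearn.feature_extraction import FeatureHasher

    hasher = FeatureHasher(n_features=10, non_negative=True)
    X = hasher.transform([{'dog': 1, 'cat': 2}, {'dog': 2, 'run': 5}])
    print((X.toarray() >= 0).all())   # True: no negative values in the output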

:class:`FeatureHasher` accepts mappings
(like Python's ``dict`` and its variants in the ``collections`` module),
``(feature, value)`` pairs, or strings,
depending on the constructor parameter ``input_type``.
Mappings are treated as lists of ``(feature, value)`` pairs,
while single strings have an implicit value of 1.
If a feature occurs multiple times in a sample, the values will be summed.
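
For instance, the same sample can be expressed in each of the three input
formats (a sketch; the feature names are made up)::

    from sklearn.feature_extraction import FeatureHasher

    # input_type='dict' (the default): each sample is a mapping
    FeatureHasher().transform([{'dog': 1, 'cat': 2}])

    # input_type='pair': each sample is an iterable of (feature, value) pairs
    FeatureHasher(input_type='pair').transform([[('dog', 1), ('cat', 2)]])

    # input_type='string': each sample is an iterable of strings; each string
    # counts as (string, 1), and the repeated 'cat' entries are summed
    FeatureHasher(input_type='string').transform([['dog', 'cat', 'cat']])
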
Feature hashing can be employed in document classification,
but unlike :class:`text.CountVectorizer`,
:class:`FeatureHasher` does not do word
splitting or any other preprocessing except Unicode-to-UTF-8 encoding.
The output from :class:`FeatureHasher` is always a ``scipy.sparse`` matrix
in the CSR format.

As an example, consider a word-level natural language processing task
that needs features extracted from ``(token, part_of_speech)`` pairs.
One could use a Python generator function to extract features::

def token_features(token, part_of_speech):
    if token.isdigit():
        yield "numeric"
    else:
        yield "token={}".format(token.lower())
        yield "token,pos={},{}".format(token, part_of_speech)
    if token[0].isupper():
        yield "uppercase_initial"
    if token.isupper():
        yield "all_uppercase"
    yield "pos={}".format(part_of_speech)

Then, the ``raw_X`` to be fed to ``FeatureHasher.transform``
can be constructed using::

raw_X = (token_features(tok, pos_tagger(tok)) for tok in corpus)

and fed to a hasher with::

hasher = FeatureHasher(input_type="string")
X = hasher.transform(raw_X)

to get a ``scipy.sparse`` matrix ``X``.

Note the use of a generator expression,
which introduces laziness into the feature extraction:
tokens are only processed on demand from the hasher.


.. topic:: References:

* Kilian Weinberger, Anirban Dasgupta, John Langford, Alex Smola and
Josh Attenberg (2009). `Feature hashing for large scale multitask learning
<http://alex.smola.org/papers/2009/Weinbergeretal09.pdf>`_. Proc. ICML.


.. _text_feature_extraction:

Text feature extraction
7 changes: 6 additions & 1 deletion doc/whats_new.rst
@@ -58,7 +58,12 @@ Changelog
- New estimator :class:`preprocessing.OneHotEncoder` to compute
binary encodings of categorical features by `Andreas Müller`_.

- Faster implementation of :func:`metrics.precision_recall_curve` by
Conrad Lee.

- New :class:`feature_extraction.FeatureHasher`, implementing the
"hashing trick" for fast, low-memory feature extraction from string data
by `Lars Buitinck`_.


API changes summary
101 changes: 101 additions & 0 deletions examples/hashing_vs_dict_vectorizer.py
@@ -0,0 +1,101 @@
"""Compares FeatureHasher and DictVectorizer by using both to vectorize
text documents.

The example demonstrates syntax and speed only; it doesn't actually do
anything useful with the extracted vectors. See the example scripts
{document_classification_20newsgroups,clustering}.py for actual learning
on text documents.

A discrepancy between the number of terms reported for DictVectorizer and
for FeatureHasher is to be expected due to hash collisions.
"""

# Author: Lars Buitinck <L.J.Buitinck@uva.nl>
# License: 3-clause BSD

from __future__ import print_function
from collections import defaultdict
import re
import sys
from time import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import DictVectorizer, FeatureHasher


def n_nonzero_columns(X):
    """Returns the number of non-zero columns in a CSR matrix X."""
    return len(np.unique(X.nonzero()[1]))


def tokens(doc):
    """Extract tokens from doc.

    This uses a simple regex to break strings into tokens. For a more
    principled approach, see CountVectorizer or TfidfVectorizer.
    """
    return (tok.lower() for tok in re.findall(r"\w+", doc))


def token_freqs(doc):
    """Extract a dict mapping tokens from doc to their frequencies."""
    freq = defaultdict(int)
    for tok in tokens(doc):
        freq[tok] += 1
    return freq


categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'misc.forsale',
    'rec.autos',
    'sci.space',
    'talk.religion.misc',
]
# Uncomment the following line to use a larger set (11k+ documents)
# categories = None

print(__doc__)
print("Usage: %s [n_features_for_hashing]" % sys.argv[0])
print(" The default number of features is 2**18.")
print()

try:
    n_features = int(sys.argv[1])
except IndexError:
    n_features = 2 ** 18
except ValueError:
    print("not a valid number of features: %r" % sys.argv[1])
    sys.exit(1)

print("Loading 20 newsgroups training data")
raw_data = fetch_20newsgroups(subset='train', categories=categories).data
print("%d documents" % len(raw_data))
print()

print("DictVectorizer")
t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
print("done in %fs" % (time() - t0))
print("Found %d unique terms" % len(vectorizer.get_feature_names()))
print()

print("FeatureHasher on frequency dicts")
t0 = time()
hasher = FeatureHasher(n_features=n_features)
X = hasher.transform(token_freqs(d) for d in raw_data)
print("done in %fs" % (time() - t0))
print("Found %d unique terms" % n_nonzero_columns(X))
print()

print("FeatureHasher on raw tokens")
t0 = time()
hasher = FeatureHasher(n_features=n_features, input_type="string")
X = hasher.transform(tokens(d) for d in raw_data)
print("done in %fs" % (time() - t0))
print("Found %d unique terms" % n_nonzero_columns(X))
1 change: 1 addition & 0 deletions sklearn/feature_extraction/__init__.py
@@ -5,6 +5,7 @@
"""

from .dict_vectorizer import DictVectorizer
from .hashing import FeatureHasher
from .image import img_to_graph, grid_to_graph
from . import text
