scikit-learn · mannby · Jan 19, 2016 · Jan 20, 2016 · Jan 28, 2016
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
@@ -30,7 +30,7 @@
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
 from ..utils import deprecated
-from ..utils.fixes import frombuffer_empty, bincount
+from ..utils.fixes import frombuffer_empty, bincount, sp_version
 from ..utils.validation import check_is_fitted
 
 __all__ = ['CountVectorizer',
@@ -744,8 +744,19 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             vocabulary.default_factory = vocabulary.__len__
 
         analyze = self.build_analyzer()
-        j_indices = _make_int_array()
-        indptr = _make_int_array()
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # NOTE: long on Windows is only 32 bits
+            # j_indices stores feature indices, likely to be < 2^31
+            j_indices = _make_long_array()
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_long_array()
+        else:
+            # Sparse arrays only support 32-bit integers
+            # j_indices stores feature indices, likely to be < 2^31
+            j_indices = _make_int_array()
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             for feature in analyze(doc):
@@ -763,8 +774,16 @@ def _count_vocab(self, raw_documents, fixed_vocab):
                 raise ValueError("empty vocabulary; perhaps the documents only"
                                  " contain stop words")
 
-        j_indices = frombuffer_empty(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.intc)
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # int_ == "l" (long)
+            # NOTE: long on Windows is only 32 bits
+            j_indices = frombuffer_empty(j_indices, dtype=np.int_)
+            indptr = np.frombuffer(indptr, dtype=np.int_)
+        else:
+            # Sparse arrays only support 32-bit integers
+            j_indices = frombuffer_empty(j_indices, dtype=np.intc)
+            indptr = np.frombuffer(indptr, dtype=np.intc)
         values = np.ones(len(j_indices))
 
         X = sp.csr_matrix((values, j_indices, indptr),
@@ -909,6 +928,15 @@ def _make_int_array():
     """Construct an array.array of a type suitable for scipy.sparse indices."""
     return array.array(str("i"))
 
+def _make_long_array():
+    """Construct an array.array of a type suitable for large scipy.sparse indices.
+
+    scipy 0.14 and later can construct sparse matrices with 64 bit integer indices.
+
+    NOTE: long on Windows is only 32 bits
+    """
+    return array.array(str("l"))
+
 
 class TfidfTransformer(BaseEstimator, TransformerMixin):
     """Transform a count matrix to a normalized tf or tf-idf representation