
[MRG+1] Support for 64 bit sparse array indices in text vectorizers #9147


Merged 7 commits on Nov 29, 2017
2 changes: 2 additions & 0 deletions doc/whats_new/_contributors.rst
@@ -151,3 +151,5 @@
 .. _Arthur Mensch: https://amensch.fr
 
 .. _Joris Van den Bossche: https://github.com/jorisvandenbossche
+
+.. _Roman Yurchak: https://github.com/rth
8 changes: 8 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -211,8 +211,16 @@ Feature Extraction
   throw an exception if ``max_patches`` was greater than or equal to the number
   of all possible patches rather than simply returning the number of possible
   patches. :issue:`10100` by :user:`Varun Agrawal <varunagrawal>`
 
+- Fixed a bug in :class:`feature_extraction.text.CountVectorizer`,
+  :class:`feature_extraction.text.TfidfVectorizer`,
+  :class:`feature_extraction.text.HashingVectorizer` to support 64 bit sparse
+  array indexing necessary to process large datasets with more than 2·10⁹ tokens
+  (words or n-grams). :issue:`9147` by :user:`Claes-Fredrik Mannby <mannby>`
+  and `Roman Yurchak`_.



API changes summary
-------------------

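To see what this entry enables in practice, here is a minimal sketch (the corpus is a toy stand-in; the 64-bit path only actually kicks in once the total number of stored entries exceeds 2**31, i.e. roughly the 2·10⁹ tokens mentioned above):

```python
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer

# Toy corpus; a real trigger needs more than ~2e9 tokens in total,
# which previously overflowed the int32 CSR indices.
docs = ["the quick brown fox jumps over the lazy dog"] * 1000

X = HashingVectorizer(n_features=2**20).transform(docs)

# Small inputs keep int32 indices; the vectorizers upcast to int64 only
# when indptr[-1] exceeds 2**31 (and scipy >= 0.14 is available).
print(X.indptr.dtype, X.indices.dtype)  # int32 int32 for this toy input
```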
30 changes: 27 additions & 3 deletions sklearn/feature_extraction/_hashing.pyx
@@ -1,6 +1,7 @@
 # Author: Lars Buitinck
 # License: BSD 3 clause
 
+import sys
 import array
 from cpython cimport array
 cimport cython
@@ -9,6 +10,7 @@ cimport numpy as np
 import numpy as np
 
 from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32
+from sklearn.utils.fixes import sp_version
 
 np.import_array()

@@ -33,12 +35,20 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
     cdef array.array indices
     cdef array.array indptr
     indices = array.array("i")
-    indptr = array.array("i", [0])
+    if sys.version_info >= (3, 3):
+        indices_array_dtype = "q"
+        indices_np_dtype = np.longlong
+    else:
+        # On Windows with PY2.7 long int would still correspond to 32 bit.
+        indices_array_dtype = "l"
+        indices_np_dtype = np.int_

Reviewer: Can you just use np.intp for all cases here?

Member Author (@rth): The issue is that I don't know the array.array dtype corresponding to np.intp. Both need to match in all cases since we are using np.frombuffer.

Reviewer: np.dtype(np.intp).char will give you that.
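
As a standalone illustration of the reviewer's tip (a sketch, not part of the diff): np.dtype(np.intp).char yields the array.array typecode whose buffer layout matches np.intp, so both sides of an np.frombuffer round trip agree.

```python
import array
import numpy as np

# Typecode matching np.intp on this platform
# (e.g. "l" on 64-bit Linux, "q" on 64-bit Windows under Python 3).
typecode = np.dtype(np.intp).char

# Grow a Python array, then reinterpret its raw buffer with the matching
# NumPy dtype, as transform() does with np.frombuffer.
indptr = array.array(typecode, [0, 5, 12])
indptr_a = np.frombuffer(indptr, dtype=np.intp)
print(typecode, indptr_a)  # e.g. l [ 0  5 12]
```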


+    indptr = array.array(indices_array_dtype, [0])
 
     # Since Python array does not understand Numpy dtypes, we grow the indices
     # and values arrays ourselves. Use a Py_ssize_t capacity for safety.
     cdef Py_ssize_t capacity = 8192  # arbitrary
-    cdef np.int32_t size = 0
+    cdef np.int64_t size = 0
     cdef np.ndarray values = np.empty(capacity, dtype=dtype)
 
     for x in raw_X:
@@ -79,4 +89,18 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
     indptr[len(indptr) - 1] = size
 
     indices_a = np.frombuffer(indices, dtype=np.int32)
-    return (indices_a, np.frombuffer(indptr, dtype=np.int32), values[:size])
+    indptr_a = np.frombuffer(indptr, dtype=indices_np_dtype)
+
+    if indptr[-1] > 2147483648:  # = 2**31

Reviewer (Member): It would sort of be nice if this were refactored somewhere, but I can't think of somewhere both pleasant and useful to keep it shared.

+        if sp_version < (0, 14):
+            raise ValueError(('sparse CSR array has {} non-zero '
+                              'elements and requires 64 bit indexing, '
+                              'which is unsupported with scipy {}. '
+                              'Please upgrade to scipy >=0.14')
+                             .format(indptr[-1],
+                                     '.'.join(map(str, sp_version))))
+        # both indices and indptr have the same dtype in CSR arrays
+        indices_a = indices_a.astype(np.int64)
+    else:
+        indptr_a = indptr_a.astype(np.int32)
+
+    return (indices_a, indptr_a, values[:size])
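
The overflow check above can also be read in isolation; here is a hedged plain-Python sketch of the same logic (the helper name _pick_index_dtype and its arguments are illustrative, not part of the PR):

```python
import numpy as np

def _pick_index_dtype(nnz, sp_version):
    """Illustrative helper: choose the CSR index dtype from the nnz count.

    sp_version is the scipy version as a tuple of ints, as in
    sklearn.utils.fixes.sp_version. Indices wider than int32 are only
    supported by scipy >= 0.14, so older scipy must raise rather than
    silently overflow.
    """
    if nnz > 2**31:  # an int32 indptr would overflow past this point
        if sp_version < (0, 14):
            raise ValueError(
                'sparse CSR array has {} non-zero elements and requires '
                '64 bit indexing, which is unsupported with scipy {}. '
                'Please upgrade to scipy >=0.14'
                .format(nnz, '.'.join(map(str, sp_version))))
        return np.int64
    return np.int32

print(_pick_index_dtype(10, (0, 19)))         # <class 'numpy.int32'>
print(_pick_index_dtype(2**31 + 1, (0, 19)))  # <class 'numpy.int64'>
```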
20 changes: 17 additions & 3 deletions sklearn/feature_extraction/text.py
@@ -30,6 +30,7 @@
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
 from ..utils.validation import check_is_fitted
+from ..utils.fixes import sp_version
 
 __all__ = ['CountVectorizer',
            'ENGLISH_STOP_WORDS',
@@ -784,7 +785,8 @@ def _count_vocab(self, raw_documents, fixed_vocab):
 
         analyze = self.build_analyzer()
         j_indices = []
-        indptr = _make_int_array()
+        indptr = []

Member Author (@rth), Nov 22, 2017: I checked (in the above-linked benchmark) that switching from an array.array to a list doesn't have any negative performance impact here. j_indices is already a list and that's where most of the data is (since j_indices is much longer than indptr); indptr doesn't really matter. This simplifies things with respect to typing (and possible overflows) though. A micro-benchmark sketch in this spirit follows this hunk.

         values = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
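
A rough sketch of the kind of comparison described above (timings are machine-dependent; this is not the PR's linked benchmark):

```python
import array
import timeit

n = 10**6  # appends to time; indptr is usually far shorter than this

t_list = timeit.timeit(
    "ind = [0]\nfor i in range(n): ind.append(i)",
    globals={"n": n}, number=10)
t_array = timeit.timeit(
    "ind = array.array('i', [0])\nfor i in range(n): ind.append(i)",
    globals={"n": n, "array": array}, number=10)

# Both are O(1) amortized appends; the difference is small, and indptr
# holds one entry per document while j_indices holds one per token.
print("list: %.3fs  array.array: %.3fs" % (t_list, t_array))
```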
@@ -811,8 +813,20 @@ def _count_vocab(self, raw_documents, fixed_vocab):
raise ValueError("empty vocabulary; perhaps the documents only"
" contain stop words")

j_indices = np.asarray(j_indices, dtype=np.intc)
indptr = np.frombuffer(indptr, dtype=np.intc)
if indptr[-1] > 2147483648: # = 2**31 - 1
if sp_version >= (0, 14):
indices_dtype = np.int64
else:
raise ValueError(('sparse CSR array has {} non-zero '
'elements and requires 64 bit indexing, '
' which is unsupported with scipy {}. '
'Please upgrade to scipy >=0.14')
.format(indptr[-1], '.'.join(sp_version)))

else:
indices_dtype = np.int32
j_indices = np.asarray(j_indices, dtype=indices_dtype)
indptr = np.asarray(indptr, dtype=indices_dtype)
values = np.frombuffer(values, dtype=np.intc)

X = sp.csr_matrix((values, j_indices, indptr),
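
To round this off, a small hedged sketch of the final construction step with explicit 64-bit index arrays (toy buffers standing in for the vectorizer's output):

```python
import numpy as np
import scipy.sparse as sp

# Toy stand-ins for the vectorizer's buffers.
values = np.array([1, 2, 1], dtype=np.intc)
j_indices = np.asarray([0, 3, 1], dtype=np.int64)
indptr = np.asarray([0, 2, 3], dtype=np.int64)

# scipy >= 0.14 accepts int64 index arrays; note that scipy may downcast
# them back to int32 when every value fits, so int64 indices only persist
# for matrices with more than 2**31 - 1 stored elements.
X = sp.csr_matrix((values, j_indices, indptr), shape=(2, 4))
print(X.indptr.dtype, X.indices.dtype)
```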