From d0787ded159162fea28d493ff1ad10e1eb3c7de1 Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Tue, 19 Jan 2016 13:08:27 -0800
Subject: [PATCH 1/6] Support new scipy sparse array indices, which can now
 be > 2^31 (< 2^63).

This is needed for very large training sets. Feature indices (based on the
number of distinct features) are unlikely to need more than 4 bytes per
value, however.
---
 sklearn/feature_extraction/text.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 417aeef2f8bc2..83d7be6a08e97 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -784,8 +784,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
 
         j_indices = []
-        indptr = _make_int_array()
-        values = _make_int_array()
+        indptr = _make_long_array()
         indptr.append(0)
         for doc in raw_documents:
             feature_counter = {}
@@ -967,6 +966,10 @@ def _make_int_array():
     """Construct an array.array of a type suitable for scipy.sparse indices."""
     return array.array(str("i"))
 
+def _make_long_array():
+    """Construct an array.array of a type suitable for scipy.sparse indices (which now support 64-bit signed integers)."""
+    return array.array(str("l"))
+
 
 class TfidfTransformer(BaseEstimator, TransformerMixin):
     """Transform a count matrix to a normalized tf or tf-idf representation

From 200fac06a2f904b5716182add02609f07acba88c Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Wed, 20 Jan 2016 14:13:12 -0800
Subject: [PATCH 2/6] Also use a wider integer dtype when converting indptr
 to a NumPy array.

---
 sklearn/feature_extraction/text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 83d7be6a08e97..95709e3292b93 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -811,7 +811,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
                              " contain stop words")
 
         j_indices = np.asarray(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.intc)
+        indptr = np.frombuffer(indptr, dtype=np.int_)
         values = np.frombuffer(values, dtype=np.intc)
 
         X = sp.csr_matrix((values, j_indices, indptr),
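The motivation for these first two patches is easy to see with Python's array module, which the vectorizer uses to accumulate indptr. A minimal sketch (illustrative, not part of the patch series; itemsizes assume a typical 64-bit Unix build):

import array

int_arr = array.array("i")   # C int: 4 bytes, max value 2**31 - 1
long_arr = array.array("l")  # C long: 8 bytes on most 64-bit Unix platforms
print(int_arr.itemsize, long_arr.itemsize)

# Cumulative token counts past 2**31 - 1 simply cannot be stored:
try:
    int_arr.append(2**31)
except OverflowError as exc:
    print("OverflowError:", exc)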
From 1278bbc6c4b7e7a74f9db39cc5d9faaa25953933 Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Thu, 28 Jan 2016 11:28:20 -0800
Subject: [PATCH 3/6] Use long for both arrays if scipy >= 0.14. Tweak
 comments

---
 sklearn/feature_extraction/text.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 95709e3292b93..8c84832d24509 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -784,7 +784,15 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
 
         j_indices = []
-        indptr = _make_long_array()
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # NOTE: long on Windows is only 32 bits
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_long_array()
+        else:
+            # Sparse arrays only support 32-bit integers
+            # j_indices stores feature indices, likely to be < 2^31
+            indptr = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             feature_counter = {}
@@ -967,7 +975,12 @@ def _make_int_array():
     return array.array(str("i"))
 
 def _make_long_array():
-    """Construct an array.array of a type suitable for scipy.sparse indices (which now support 64-bit signed integers)."""
+    """Construct an array.array of a type suitable for large scipy.sparse indices.
+
+    scipy 0.14 and later can construct sparse matrices with 64 bit integer indices.
+
+    NOTE: long on Windows is only 32 bits
+    """
     return array.array(str("l"))
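The NOTE added here is worth spelling out, since it is what ultimately motivates the switch to the "q" typecode in PATCH 5/6. A minimal check (illustrative, not part of the patch series):

import array

# "l" maps to the platform's C long: 8 bytes on 64-bit Linux/macOS,
# but only 4 bytes on Windows, even in 64-bit builds.
print(array.array("l").itemsize)
# "q" (available since Python 3.3) is always 8 bytes.
print(array.array("q").itemsize)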
From 564f8b77e4e6a0ef4e5ec9e942f2169c17b48254 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Thu, 15 Jun 2017 14:34:07 +0000
Subject: [PATCH 4/6] Reuse the _make_int_array function

---
 sklearn/feature_extraction/text.py | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 8c84832d24509..9c1c4776855b3 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -784,15 +784,9 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
 
         j_indices = []
-        if sp_version >= (0, 14):
-            # We can use 64-bit indices
-            # NOTE: long on Windows is only 32 bits
-            # indptr stores indices into j_indices, which can be large
-            indptr = _make_long_array()
-        else:
-            # Sparse arrays only support 32-bit integers
-            # j_indices stores feature indices, likely to be < 2^31
-            indptr = _make_int_array()
+        # indptr stores indices into j_indices, which can be large
+        indptr = _make_int_array(dtype='l')
+        values = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             feature_counter = {}
@@ -970,18 +964,12 @@ def get_feature_names(self):
                                              key=itemgetter(1))]
 
 
-def _make_int_array():
-    """Construct an array.array of a type suitable for scipy.sparse indices."""
-    return array.array(str("i"))
-
-def _make_long_array():
-    """Construct an array.array of a type suitable for large scipy.sparse indices.
-
-    scipy 0.14 and later can construct sparse matrices with 64 bit integer indices.
+def _make_int_array(dtype='i'):
+    """Construct an array.array of a type suitable for scipy.sparse indices.
 
     NOTE: long on Windows is only 32 bits
     """
-    return array.array(str("l"))
+    return array.array(str(dtype))
 
 
 class TfidfTransformer(BaseEstimator, TransformerMixin):

From f6a7d0df4917ecee5fbf14d75478bfdd8935dd37 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Wed, 22 Nov 2017 16:35:40 +0100
Subject: [PATCH 5/6] Rewrite the 64 bit index support of CSR arrays

---
 sklearn/feature_extraction/_hashing.pyx | 30 ++++++++++++++++++++++---
 sklearn/feature_extraction/text.py      | 30 ++++++++++++++++---------
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx
index e39aeafa08685..c462dd8a24719 100644
--- a/sklearn/feature_extraction/_hashing.pyx
+++ b/sklearn/feature_extraction/_hashing.pyx
@@ -1,6 +1,7 @@
 # Author: Lars Buitinck
 # License: BSD 3 clause
 
+import sys
 import array
 from cpython cimport array
 cimport cython
@@ -9,6 +10,7 @@ cimport numpy as np
 import numpy as np
 
 from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32
+from sklearn.utils.fixes import sp_version
 
 np.import_array()
 
@@ -33,12 +35,20 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
     cdef array.array indices
     cdef array.array indptr
     indices = array.array("i")
-    indptr = array.array("i", [0])
+    if sys.version_info >= (3, 3):
+        indices_array_dtype = "q"
+        indices_np_dtype = np.longlong
+    else:
+        # On Windows with PY2.7 long int would still correspond to 32 bit.
+        indices_array_dtype = "l"
+        indices_np_dtype = np.int_
+
+    indptr = array.array(indices_array_dtype, [0])
 
     # Since Python array does not understand Numpy dtypes, we grow the indices
     # and values arrays ourselves. Use a Py_ssize_t capacity for safety.
     cdef Py_ssize_t capacity = 8192  # arbitrary
-    cdef np.int32_t size = 0
+    cdef np.int64_t size = 0
     cdef np.ndarray values = np.empty(capacity, dtype=dtype)
 
     for x in raw_X:
@@ -79,4 +89,18 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
     indptr[len(indptr) - 1] = size
 
     indices_a = np.frombuffer(indices, dtype=np.int32)
-    return (indices_a, np.frombuffer(indptr, dtype=np.int32), values[:size])
+    indptr_a = np.frombuffer(indptr, dtype=indices_np_dtype)
+
+    if indptr[-1] > 2147483647:  # = 2**31 - 1
+        if sp_version < (0, 14):
+            raise ValueError(('sparse CSR array has {} non-zero '
+                              'elements and requires 64 bit indexing, '
+                              'which is unsupported with scipy {}. '
+                              'Please upgrade to scipy >=0.14')
+                             .format(indptr[-1], '.'.join(map(str, sp_version))))
+        # both indices and indptr have the same dtype in CSR arrays
+        indices_a = indices_a.astype(np.int64)
+    else:
+        indptr_a = indptr_a.astype(np.int32)
+
+    return (indices_a, indptr_a, values[:size])
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 9c1c4776855b3..a1e0845abe9ac 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -30,6 +30,7 @@
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
 from ..utils.validation import check_is_fitted
+from ..utils.fixes import sp_version
 
 __all__ = ['CountVectorizer',
            'ENGLISH_STOP_WORDS',
@@ -784,8 +785,8 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
         j_indices = []
-        # indptr stores indices into j_indices, which can be large
-        indptr = _make_int_array(dtype='l')
+        indptr = []
+
         values = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             feature_counter = {}
@@ -812,8 +813,20 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        j_indices = np.asarray(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.int_)
+        if indptr[-1] > 2147483647:  # = 2**31 - 1
+            if sp_version >= (0, 14):
+                indices_dtype = np.int64
+            else:
+                raise ValueError(('sparse CSR array has {} non-zero '
+                                  'elements and requires 64 bit indexing, '
+                                  'which is unsupported with scipy {}. '
+                                  'Please upgrade to scipy >=0.14')
+                                 .format(indptr[-1], '.'.join(map(str, sp_version))))
+
+        else:
+            indices_dtype = np.int32
+        j_indices = np.asarray(j_indices, dtype=indices_dtype)
+        indptr = np.asarray(indptr, dtype=indices_dtype)
         values = np.frombuffer(values, dtype=np.intc)
 
         X = sp.csr_matrix((values, j_indices, indptr),
@@ -964,12 +977,9 @@ def get_feature_names(self):
                                              key=itemgetter(1))]
 
 
-def _make_int_array(dtype='i'):
-    """Construct an array.array of a type suitable for scipy.sparse indices.
-
-    NOTE: long on Windows is only 32 bits
-    """
-    return array.array(str(dtype))
+def _make_int_array():
+    """Construct an array.array of a type suitable for scipy.sparse indices."""
+    return array.array(str("i"))
 
 
 class TfidfTransformer(BaseEstimator, TransformerMixin):
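The core of this rewrite, in both files, is the same dtype-selection step: accumulate indptr in a wide container, then widen the index arrays to int64 only when the non-zero count actually requires it. As a standalone sketch (build_csr is a hypothetical helper for illustration, not scikit-learn API):

import numpy as np
import scipy.sparse as sp

def build_csr(values, j_indices, indptr, shape):
    # Pick the narrowest index dtype that can hold the largest index;
    # scipy >= 0.14 accepts int64 index arrays for CSR matrices.
    if indptr[-1] > np.iinfo(np.int32).max:
        indices_dtype = np.int64
    else:
        indices_dtype = np.int32
    return sp.csr_matrix(
        (np.asarray(values, dtype=np.intc),
         np.asarray(j_indices, dtype=indices_dtype),
         np.asarray(indptr, dtype=indices_dtype)),
        shape=shape)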
From b230cfdd2ada3bd1d20667d0dd16080435886c6f Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Wed, 29 Nov 2017 16:27:52 +0100
Subject: [PATCH 6/6] Add what's new entry

---
 doc/whats_new/_contributors.rst | 2 ++
 doc/whats_new/v0.20.rst         | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst
index fd12247043e00..3dbcac5340585 100644
--- a/doc/whats_new/_contributors.rst
+++ b/doc/whats_new/_contributors.rst
@@ -151,3 +151,5 @@
 .. _Arthur Mensch: https://amensch.fr
 
 .. _Joris Van den Bossche: https://github.com/jorisvandenbossche
+
+.. _Roman Yurchak: https://github.com/rth
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index a916a9bbd644a..9fe6c753ab27c 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -197,6 +197,15 @@ Neighbors
    warning when no neighbors are found for samples.
    :issue:`9655` by :user:`Andreas Bjerre-Nielsen `.
 
+Feature extraction and preprocessing
+
+- Fixed a bug in :class:`feature_extraction.text.CountVectorizer`,
+  :class:`feature_extraction.text.TfidfVectorizer`, and
+  :class:`feature_extraction.text.HashingVectorizer` to support 64 bit sparse
+  array indexing, necessary to process large datasets with more than 2·10⁹
+  tokens (words or n-grams). :issue:`9147` by :user:`Claes-Fredrik Mannby `
+  and `Roman Yurchak`_.
+
 API changes summary
 -------------------
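End to end, the effect of the series can be checked on a toy input (illustrative only; a corpus that actually triggers the 64-bit path needs more than 2**31 - 1 stored elements):

from sklearn.feature_extraction.text import CountVectorizer

X = CountVectorizer().fit_transform(["small corpus", "stays thirty two bit"])
# Small inputs keep 32-bit indices; the index arrays are only widened to
# int64 when indptr[-1] would overflow a signed 32-bit integer.
print(X.indices.dtype, X.indptr.dtype)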