diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst index fd12247043e00..3dbcac5340585 100644 --- a/doc/whats_new/_contributors.rst +++ b/doc/whats_new/_contributors.rst @@ -151,3 +151,5 @@ .. _Arthur Mensch: https://amensch.fr .. _Joris Van den Bossche: https://github.com/jorisvandenbossche + +.. _Roman Yurchak: https://github.com/rth diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index a397657ab1daa..3e49be9cf1d71 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -211,8 +211,16 @@ Feature Extraction throw an exception if ``max_patches`` was greater than or equal to the number of all possible patches rather than simply returning the number of possible patches. :issue:`10100` by :user:`Varun Agrawal <varunagrawal>` + +- Fixed a bug in :class:`feature_extraction.text.CountVectorizer`, + :class:`feature_extraction.text.TfidfVectorizer`, + :class:`feature_extraction.text.HashingVectorizer` to support 64 bit sparse + array indexing necessary to process large datasets with more than 2·10⁹ tokens + (words or n-grams). :issue:`9147` by :user:`Claes-Fredrik Mannby <mannby>` + and `Roman Yurchak`_. 
+ API changes summary ------------------- diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx index e39aeafa08685..c462dd8a24719 100644 --- a/sklearn/feature_extraction/_hashing.pyx +++ b/sklearn/feature_extraction/_hashing.pyx @@ -1,6 +1,7 @@ # Author: Lars Buitinck # License: BSD 3 clause +import sys import array from cpython cimport array cimport cython @@ -9,6 +10,7 @@ cimport numpy as np import numpy as np from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32 +from sklearn.utils.fixes import sp_version np.import_array() @@ -33,12 +35,20 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1): cdef array.array indices cdef array.array indptr indices = array.array("i") - indptr = array.array("i", [0]) + if sys.version_info >= (3, 3): + indices_array_dtype = "q" + indices_np_dtype = np.longlong + else: + # On Windows with PY2.7 long int would still correspond to 32 bit. + indices_array_dtype = "l" + indices_np_dtype = np.int_ + + indptr = array.array(indices_array_dtype, [0]) # Since Python array does not understand Numpy dtypes, we grow the indices # and values arrays ourselves. Use a Py_ssize_t capacity for safety. cdef Py_ssize_t capacity = 8192 # arbitrary - cdef np.int32_t size = 0 + cdef np.int64_t size = 0 cdef np.ndarray values = np.empty(capacity, dtype=dtype) for x in raw_X: @@ -79,4 +89,18 @@ indptr[len(indptr) - 1] = size indices_a = np.frombuffer(indices, dtype=np.int32) - return (indices_a, np.frombuffer(indptr, dtype=np.int32), values[:size]) + indptr_a = np.frombuffer(indptr, dtype=indices_np_dtype) + + if indptr[-1] > 2147483647: # = 2**31 - 1, max value of int32 + if sp_version < (0, 14): + raise ValueError(('sparse CSR array has {} non-zero ' 'elements and requires 64 bit indexing, ' 'which is unsupported with scipy {}. 
' 'Please upgrade to scipy >=0.14') .format(indptr[-1], '.'.join(map(str, sp_version)))) + # both indices and indptr have the same dtype in CSR arrays + indices_a = indices_a.astype(np.int64) + else: + indptr_a = indptr_a.astype(np.int32) + + return (indices_a, indptr_a, values[:size]) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 417aeef2f8bc2..a1e0845abe9ac 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -30,6 +30,7 @@ from .hashing import FeatureHasher from .stop_words import ENGLISH_STOP_WORDS from ..utils.validation import check_is_fitted +from ..utils.fixes import sp_version __all__ = ['CountVectorizer', 'ENGLISH_STOP_WORDS', @@ -784,7 +785,8 @@ def _count_vocab(self, raw_documents, fixed_vocab): analyze = self.build_analyzer() j_indices = [] - indptr = _make_int_array() + indptr = [] + values = _make_int_array() indptr.append(0) for doc in raw_documents: @@ -811,8 +813,20 @@ raise ValueError("empty vocabulary; perhaps the documents only" " contain stop words") - j_indices = np.asarray(j_indices, dtype=np.intc) - indptr = np.frombuffer(indptr, dtype=np.intc) + if indptr[-1] > 2147483647: # = 2**31 - 1, max value of int32 + if sp_version >= (0, 14): + indices_dtype = np.int64 + else: + raise ValueError(('sparse CSR array has {} non-zero ' 'elements and requires 64 bit indexing, ' 'which is unsupported with scipy {}. ' 'Please upgrade to scipy >=0.14') .format(indptr[-1], '.'.join(map(str, sp_version)))) + + else: + indices_dtype = np.int32 + j_indices = np.asarray(j_indices, dtype=indices_dtype) + indptr = np.asarray(indptr, dtype=indices_dtype) values = np.frombuffer(values, dtype=np.intc) X = sp.csr_matrix((values, j_indices, indptr),