From d0787ded159162fea28d493ff1ad10e1eb3c7de1 Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Tue, 19 Jan 2016 13:08:27 -0800
Subject: [PATCH 1/6] Support new scipy sparse array indices, which can now
 be > 2^31 (< 2^63).

This is needed for very large training sets. Feature indices (based on the
number of distinct features) are unlikely to need more than 4 bytes per
value, however.
---
 sklearn/feature_extraction/text.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 417aeef2f8bc2..83d7be6a08e97 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -784,8 +784,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
 
         j_indices = []
-        indptr = _make_int_array()
-        values = _make_int_array()
+        indptr = _make_long_array()
         indptr.append(0)
         for doc in raw_documents:
             feature_counter = {}
@@ -967,6 +966,10 @@ def _make_int_array():
     """Construct an array.array of a type suitable for scipy.sparse indices."""
     return array.array(str("i"))
 
+def _make_long_array():
+    """Construct an array.array of a type suitable for scipy.sparse indices (which now support 64-bit signed integers)."""
+    return array.array(str("l"))
+
 
 class TfidfTransformer(BaseEstimator, TransformerMixin):
     """Transform a count matrix to a normalized tf or tf-idf representation

From 200fac06a2f904b5716182add02609f07acba88c Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Wed, 20 Jan 2016 14:13:12 -0800
Subject: [PATCH 2/6] Also use a wider integer dtype when converting indptr
 to a NumPy array.

---
 sklearn/feature_extraction/text.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 83d7be6a08e97..95709e3292b93 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -811,7 +811,7 @@ def _count_vocab(self, raw_documents, fixed_vocab):
                              " contain stop words")
 
         j_indices = np.asarray(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.intc)
+        indptr = np.frombuffer(indptr, dtype=np.int_)
         values = np.frombuffer(values, dtype=np.intc)
 
         X = sp.csr_matrix((values, j_indices, indptr),
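The motivation for these first two patches is easy to see with Python's array module, which the vectorizer uses to accumulate indptr. A minimal sketch (illustrative, not part of the patch series; itemsizes assume a typical 64-bit Unix build):

import array

int_arr = array.array("i")   # C int: 4 bytes, max value 2**31 - 1
long_arr = array.array("l")  # C long: 8 bytes on most 64-bit Unix platforms
print(int_arr.itemsize, long_arr.itemsize)

# Cumulative token counts past 2**31 - 1 simply cannot be stored:
try:
    int_arr.append(2**31)
except OverflowError as exc:
    print("OverflowError:", exc)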
From 1278bbc6c4b7e7a74f9db39cc5d9faaa25953933 Mon Sep 17 00:00:00 2001
From: Claes-Fredrik Mannby
Date: Thu, 28 Jan 2016 11:28:20 -0800
Subject: [PATCH 3/6] Use long for both arrays if scipy >= 0.14. Tweak
 comments

---
 sklearn/feature_extraction/text.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 95709e3292b93..8c84832d24509 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -784,7 +784,15 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
 
         j_indices = []
-        indptr = _make_long_array()
+        if sp_version >= (0, 14):
+            # We can use 64-bit indices
+            # NOTE: long on Windows is only 32 bits
+            # indptr stores indices into j_indices, which can be large
+            indptr = _make_long_array()
+        else:
+            # Sparse arrays only support 32-bit integers
+            # j_indices stores feature indices, likely to be < 2^31
+            indptr = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             feature_counter = {}
@@ -967,7 +975,12 @@ def _make_int_array():
     return array.array(str("i"))
 
 def _make_long_array():
-    """Construct an array.array of a type suitable for scipy.sparse indices (which now support 64-bit signed integers)."""
+    """Construct an array.array of a type suitable for large scipy.sparse indices.
+
+    scipy 0.14 and later can construct sparse matrices with 64 bit integer indices.
+
+    NOTE: long on Windows is only 32 bits
+    """
     return array.array(str("l"))
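The NOTE added here is worth spelling out, since it is what ultimately motivates the switch to the "q" typecode in PATCH 5/6. A minimal check (illustrative, not part of the patch series):

import array

# "l" maps to the platform's C long: 8 bytes on 64-bit Linux/macOS,
# but only 4 bytes on Windows, even in 64-bit builds.
print(array.array("l").itemsize)
# "q" (available since Python 3.3) is always 8 bytes.
print(array.array("q").itemsize)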
From 564f8b77e4e6a0ef4e5ec9e942f2169c17b48254 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Thu, 15 Jun 2017 14:34:07 +0000
Subject: [PATCH 4/6] Reuse the _make_int_array function

---
 sklearn/feature_extraction/text.py | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 8c84832d24509..9c1c4776855b3 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -784,15 +784,9 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
 
         j_indices = []
-        if sp_version >= (0, 14):
-            # We can use 64-bit indices
-            # NOTE: long on Windows is only 32 bits
-            # indptr stores indices into j_indices, which can be large
-            indptr = _make_long_array()
-        else:
-            # Sparse arrays only support 32-bit integers
-            # j_indices stores feature indices, likely to be < 2^31
-            indptr = _make_int_array()
+        # indptr stores indices into j_indices, which can be large
+        indptr = _make_int_array(dtype='l')
+        values = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             feature_counter = {}
@@ -970,18 +964,12 @@ def get_feature_names(self):
                                              key=itemgetter(1))]
 
 
-def _make_int_array():
-    """Construct an array.array of a type suitable for scipy.sparse indices."""
-    return array.array(str("i"))
-
-def _make_long_array():
-    """Construct an array.array of a type suitable for large scipy.sparse indices.
-
-    scipy 0.14 and later can construct sparse matrices with 64 bit integer indices.
+def _make_int_array(dtype='i'):
+    """Construct an array.array of a type suitable for scipy.sparse indices.
 
     NOTE: long on Windows is only 32 bits
     """
-    return array.array(str("l"))
+    return array.array(str(dtype))
 
 
 class TfidfTransformer(BaseEstimator, TransformerMixin):

From f6a7d0df4917ecee5fbf14d75478bfdd8935dd37 Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Wed, 22 Nov 2017 16:35:40 +0100
Subject: [PATCH 5/6] Rewrite the 64 bit index support of CSR arrays

---
 sklearn/feature_extraction/_hashing.pyx | 30 ++++++++++++++++++++++---
 sklearn/feature_extraction/text.py      | 30 ++++++++++++++++---------
 2 files changed, 47 insertions(+), 13 deletions(-)

diff --git a/sklearn/feature_extraction/_hashing.pyx b/sklearn/feature_extraction/_hashing.pyx
index e39aeafa08685..c462dd8a24719 100644
--- a/sklearn/feature_extraction/_hashing.pyx
+++ b/sklearn/feature_extraction/_hashing.pyx
@@ -1,6 +1,7 @@
 # Author: Lars Buitinck
 # License: BSD 3 clause
 
+import sys
 import array
 from cpython cimport array
 cimport cython
@@ -9,6 +10,7 @@ cimport numpy as np
 import numpy as np
 
 from sklearn.utils.murmurhash cimport murmurhash3_bytes_s32
+from sklearn.utils.fixes import sp_version
 
 np.import_array()
 
@@ -33,12 +35,20 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
     cdef array.array indices
     cdef array.array indptr
     indices = array.array("i")
-    indptr = array.array("i", [0])
+    if sys.version_info >= (3, 3):
+        indices_array_dtype = "q"
+        indices_np_dtype = np.longlong
+    else:
+        # On Windows with PY2.7 long int would still correspond to 32 bit.
+        indices_array_dtype = "l"
+        indices_np_dtype = np.int_
+
+    indptr = array.array(indices_array_dtype, [0])
 
     # Since Python array does not understand Numpy dtypes, we grow the indices
     # and values arrays ourselves. Use a Py_ssize_t capacity for safety.
     cdef Py_ssize_t capacity = 8192  # arbitrary
-    cdef np.int32_t size = 0
+    cdef np.int64_t size = 0
     cdef np.ndarray values = np.empty(capacity, dtype=dtype)
 
     for x in raw_X:
@@ -79,4 +89,18 @@ def transform(raw_X, Py_ssize_t n_features, dtype, bint alternate_sign=1):
     indptr[len(indptr) - 1] = size
 
     indices_a = np.frombuffer(indices, dtype=np.int32)
-    return (indices_a, np.frombuffer(indptr, dtype=np.int32), values[:size])
+    indptr_a = np.frombuffer(indptr, dtype=indices_np_dtype)
+
+    if indptr[-1] > 2147483647:  # = 2**31 - 1
+        if sp_version < (0, 14):
+            raise ValueError(('sparse CSR array has {} non-zero '
+                              'elements and requires 64 bit indexing, '
+                              'which is unsupported with scipy {}. '
+                              'Please upgrade to scipy >=0.14')
+                             .format(indptr[-1], '.'.join(map(str, sp_version))))
+        # both indices and indptr have the same dtype in CSR arrays
+        indices_a = indices_a.astype(np.int64)
+    else:
+        indptr_a = indptr_a.astype(np.int32)
+
+    return (indices_a, indptr_a, values[:size])
diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py
index 9c1c4776855b3..a1e0845abe9ac 100644
--- a/sklearn/feature_extraction/text.py
+++ b/sklearn/feature_extraction/text.py
@@ -30,6 +30,7 @@
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
 from ..utils.validation import check_is_fitted
+from ..utils.fixes import sp_version
 
 __all__ = ['CountVectorizer',
            'ENGLISH_STOP_WORDS',
@@ -784,8 +785,8 @@ def _count_vocab(self, raw_documents, fixed_vocab):
         analyze = self.build_analyzer()
         j_indices = []
-        # indptr stores indices into j_indices, which can be large
-        indptr = _make_int_array(dtype='l')
+        indptr = []
+
         values = _make_int_array()
         indptr.append(0)
         for doc in raw_documents:
             feature_counter = {}
@@ -812,8 +813,20 @@ def _count_vocab(self, raw_documents, fixed_vocab):
             raise ValueError("empty vocabulary; perhaps the documents only"
                              " contain stop words")
 
-        j_indices = np.asarray(j_indices, dtype=np.intc)
-        indptr = np.frombuffer(indptr, dtype=np.int_)
+        if indptr[-1] > 2147483647:  # = 2**31 - 1
+            if sp_version >= (0, 14):
+                indices_dtype = np.int64
+            else:
+                raise ValueError(('sparse CSR array has {} non-zero '
+                                  'elements and requires 64 bit indexing, '
+                                  'which is unsupported with scipy {}. '
+                                  'Please upgrade to scipy >=0.14')
+                                 .format(indptr[-1], '.'.join(map(str, sp_version))))
+
+        else:
+            indices_dtype = np.int32
+        j_indices = np.asarray(j_indices, dtype=indices_dtype)
+        indptr = np.asarray(indptr, dtype=indices_dtype)
         values = np.frombuffer(values, dtype=np.intc)
 
         X = sp.csr_matrix((values, j_indices, indptr),
@@ -964,12 +977,9 @@ def get_feature_names(self):
                                              key=itemgetter(1))]
 
 
-def _make_int_array(dtype='i'):
-    """Construct an array.array of a type suitable for scipy.sparse indices.
-
-    NOTE: long on Windows is only 32 bits
-    """
-    return array.array(str(dtype))
+def _make_int_array():
+    """Construct an array.array of a type suitable for scipy.sparse indices."""
+    return array.array(str("i"))
 
 
 class TfidfTransformer(BaseEstimator, TransformerMixin):
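The core of this rewrite, in both files, is the same dtype-selection step: accumulate indptr in a wide container, then widen the index arrays to int64 only when the non-zero count actually requires it. As a standalone sketch (build_csr is a hypothetical helper for illustration, not scikit-learn API):

import numpy as np
import scipy.sparse as sp

def build_csr(values, j_indices, indptr, shape):
    # Pick the narrowest index dtype that can hold the largest index;
    # scipy >= 0.14 accepts int64 index arrays for CSR matrices.
    if indptr[-1] > np.iinfo(np.int32).max:
        indices_dtype = np.int64
    else:
        indices_dtype = np.int32
    return sp.csr_matrix(
        (np.asarray(values, dtype=np.intc),
         np.asarray(j_indices, dtype=indices_dtype),
         np.asarray(indptr, dtype=indices_dtype)),
        shape=shape)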
From b230cfdd2ada3bd1d20667d0dd16080435886c6f Mon Sep 17 00:00:00 2001
From: Roman Yurchak
Date: Wed, 29 Nov 2017 16:27:52 +0100
Subject: [PATCH 6/6] Add what's new entry

---
 doc/whats_new/_contributors.rst | 2 ++
 doc/whats_new/v0.20.rst         | 9 +++++++++
 2 files changed, 11 insertions(+)

diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst
index fd12247043e00..3dbcac5340585 100644
--- a/doc/whats_new/_contributors.rst
+++ b/doc/whats_new/_contributors.rst
@@ -151,3 +151,5 @@
 .. _Arthur Mensch: https://amensch.fr
 
 .. _Joris Van den Bossche: https://github.com/jorisvandenbossche
+
+.. _Roman Yurchak: https://github.com/rth
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index a916a9bbd644a..9fe6c753ab27c 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -197,6 +197,15 @@ Neighbors
    warning when no neighbors are found for samples.
    :issue:`9655` by :user:`Andreas Bjerre-Nielsen `.
 
+Feature extraction and preprocessing
+
+- Fixed a bug in :class:`feature_extraction.text.CountVectorizer`,
+  :class:`feature_extraction.text.TfidfVectorizer`, and
+  :class:`feature_extraction.text.HashingVectorizer` to support 64 bit sparse
+  array indexing, necessary to process large datasets with more than 2·10⁹
+  tokens (words or n-grams). :issue:`9147` by :user:`Claes-Fredrik Mannby `
+  and `Roman Yurchak`_.
+
 API changes summary
 -------------------
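End to end, the effect of the series can be checked on a toy input (illustrative only; a corpus that actually triggers the 64-bit path needs more than 2**31 - 1 stored elements):

from sklearn.feature_extraction.text import CountVectorizer

X = CountVectorizer().fit_transform(["small corpus", "stays thirty two bit"])
# Small inputs keep 32-bit indices; the index arrays are only widened to
# int64 when indptr[-1] would overflow a signed 32-bit integer.
print(X.indices.dtype, X.indptr.dtype)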