Skip to content

[MRG] Support new scipy sparse array indices, which can now be > 2^31 (Was pull request #6194) #6473

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 33 additions & 5 deletions sklearn/feature_extraction/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from .hashing import FeatureHasher
from .stop_words import ENGLISH_STOP_WORDS
from ..utils import deprecated
from ..utils.fixes import frombuffer_empty, bincount
from ..utils.fixes import frombuffer_empty, bincount, sp_version
from ..utils.validation import check_is_fitted

__all__ = ['CountVectorizer',
Expand Down Expand Up @@ -744,8 +744,19 @@ def _count_vocab(self, raw_documents, fixed_vocab):
vocabulary.default_factory = vocabulary.__len__

analyze = self.build_analyzer()
j_indices = _make_int_array()
indptr = _make_int_array()
if sp_version >= (0, 14):
# We can use 64-bit indices
# NOTE: long on Windows is only 32 bits
# j_indices stores feature indices, likely to be < 2^31
j_indices = _make_long_array()
# indptr stores indices into j_indices, which can be large
indptr = _make_long_array()
else:
# Sparse arrays only support 32-bit integers
# j_indices stores feature indices, likely to be < 2^31
j_indices = _make_int_array()
# indptr stores indices into j_indices, which can be large
indptr = _make_int_array()
indptr.append(0)
for doc in raw_documents:
for feature in analyze(doc):
Expand All @@ -763,8 +774,16 @@ def _count_vocab(self, raw_documents, fixed_vocab):
raise ValueError("empty vocabulary; perhaps the documents only"
" contain stop words")

j_indices = frombuffer_empty(j_indices, dtype=np.intc)
indptr = np.frombuffer(indptr, dtype=np.intc)
if sp_version >= (0, 14):
# We can use 64-bit indices
# int_ == "l" (long)
# NOTE: long on Windows is only 32 bits
j_indices = frombuffer_empty(j_indices, dtype=np.int_)
indptr = np.frombuffer(indptr, dtype=np.int_)
else:
# Sparse arrays only support 32-bit integers
j_indices = frombuffer_empty(j_indices, dtype=np.intc)
indptr = np.frombuffer(indptr, dtype=np.intc)
values = np.ones(len(j_indices))

X = sp.csr_matrix((values, j_indices, indptr),
Expand Down Expand Up @@ -909,6 +928,15 @@ def _make_int_array():
"""Construct an array.array of a type suitable for scipy.sparse indices."""
return array.array(str("i"))

def _make_long_array():
"""Construct an array.array of a type suitable for large scipy.sparse indices.

scipy 0.14 and later can construct sparse matrices with 64 bit integer indices.

NOTE: long on Windows is only 32 bits
"""
return array.array(str("l"))


class TfidfTransformer(BaseEstimator, TransformerMixin):
"""Transform a count matrix to a normalized tf or tf-idf representation
Expand Down