@@ -29,8 +29,9 @@
 from ..preprocessing import normalize
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
+from ..utils import deprecated
 from ..utils.fixes import frombuffer_empty, bincount
-from ..utils.validation import check_is_fitted, check_array
+from ..utils.validation import check_is_fitted
 
 __all__ = ['CountVectorizer',
            'ENGLISH_STOP_WORDS',
@@ -158,7 +159,8 @@ def _char_wb_ngrams(self, text_document):
         """Whitespace sensitive char-n-gram tokenization.
 
         Tokenize text_document into a sequence of character n-grams
-        excluding any whitespace (operating only inside word boundaries)"""
+        operating only inside word boundaries. n-grams at the edges
+        of words are padded with space."""
         # normalize white spaces
         text_document = self._white_spaces.sub(" ", text_document)
 
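The padding behavior described in the new docstring can be seen by building the 'char_wb' analyzer through the public API; a minimal sketch (the example string and ngram_range are illustrative, not part of this commit):

```python
from sklearn.feature_extraction.text import CountVectorizer

# build the analyzer that ultimately calls _char_wb_ngrams
analyzer = CountVectorizer(analyzer='char_wb',
                           ngram_range=(5, 5)).build_analyzer()

# each word is padded with a space on both sides before n-grams are
# extracted, so n-grams at word edges contain the padding space
print(analyzer('jumpy fox'))
# [' jump', 'jumpy', 'umpy ', ' fox ']
```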
@@ -353,7 +355,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     analyzer : string, {'word', 'char', 'char_wb'} or callable
         Whether the feature should be made of word or character n-grams.
         Option 'char_wb' creates character n-grams only from text inside
-        word boundaries.
+        word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
         out of the raw, unprocessed input.
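For HashingVectorizer the analyzer choice composes with stateless feature hashing; a short sketch assuming the parameters shown (the n_features value is arbitrary):

```python
from sklearn.feature_extraction.text import HashingVectorizer

# char_wb n-grams (including their space padding) are hashed into a
# fixed-size feature space, so no vocabulary is stored
vectorizer = HashingVectorizer(analyzer='char_wb', ngram_range=(2, 3),
                               n_features=2 ** 8)
X = vectorizer.transform(['jumpy fox'])
print(X.shape)  # (1, 256)
```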
@@ -552,7 +554,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     analyzer : string, {'word', 'char', 'char_wb'} or callable
         Whether the feature should be made of word or character n-grams.
         Option 'char_wb' creates character n-grams only from text inside
-        word boundaries.
+        word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.
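With CountVectorizer the padded edge n-grams show up directly in the learned vocabulary; a minimal sketch with an illustrative one-word corpus:

```python
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(3, 3))
vectorizer.fit(['fox'])

# 'fox' is padded to ' fox ', so the edge 3-grams keep the space
print(vectorizer.get_feature_names())  # [' fo', 'fox', 'ox ']
```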
@@ -1022,8 +1024,7 @@ def fit(self, X, y=None):
             a matrix of term/token counts
         """
         if not sp.issparse(X):
-            X = sp.csc_matrix(X, dtype=np.float64)
-        X = check_array(X, accept_sparse=["csc", "csr"])
+            X = sp.csc_matrix(X)
         if self.use_idf:
             n_samples, n_features = X.shape
             df = _document_frequency(X)
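With the forced dtype=np.float64 removed (and the check_array validation step dropped), a dense input now keeps its own dtype when fit converts it to CSC. A sketch of the new behavior, with illustrative data:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

# dense float32 term counts; fit now builds the CSC matrix with the
# input's own dtype instead of up-casting to float64 before the
# document frequencies are computed
counts = np.array([[1, 0, 2],
                   [0, 1, 1]], dtype=np.float32)
transformer = TfidfTransformer().fit(counts)
```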
@@ -1056,19 +1057,18 @@ def transform(self, X, copy=True):
         -------
         vectors : sparse matrix, [n_samples, n_features]
         """
-        X = check_array(X, accept_sparse=["csr"], copy=copy,
-                        dtype=[np.float64, np.float32])
+        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
+            # preserve float family dtype
+            X = sp.csr_matrix(X, copy=copy)
+        else:
+            # convert counts or binary occurrences to floats
+            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)
 
         n_samples, n_features = X.shape
 
         if self.sublinear_tf:
-            if sp.issparse(X):
-                np.log(X.data, X.data)
-                X.data += 1
-            else:
-                mask = X != 0
-                X[mask] = np.log(X[mask])
-                X[mask] += 1
+            np.log(X.data, X.data)
+            X.data += 1
 
         if self.use_idf:
             check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')
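The new transform branches can be exercised directly: float inputs keep their dtype, and sublinear tf is applied in place on the CSR data array, which is safe now that X is always sparse at that point. A sketch using norm=None and use_idf=False so the raw tf values stay visible:

```python
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfTransformer

counts = sp.csr_matrix(np.array([[3, 0, 1],
                                 [0, 2, 0]], dtype=np.float32))

# float input takes the first branch and keeps its float32 dtype
tfidf = TfidfTransformer(norm=None, use_idf=False).fit(counts)
print(tfidf.transform(counts).dtype)  # float32

# sublinear_tf rewrites each stored count c as 1 + log(c) on X.data
tfidf = TfidfTransformer(norm=None, use_idf=False,
                         sublinear_tf=True).fit(counts)
print(tfidf.transform(counts).data)  # [1+log(3), 1+log(1), 1+log(2)]
```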