Commit f1e679c

revert aggressive input validation changes
1 parent 6ee218d commit f1e679c

2 files changed: +18 -19 lines changed


sklearn/dummy.py

Lines changed: 3 additions & 4 deletions
@@ -10,7 +10,7 @@
 
 from .base import BaseEstimator, ClassifierMixin, RegressorMixin
 from .utils import check_random_state
-from .utils.validation import check_array, check_X_y
+from .utils.validation import check_array
 from .utils.validation import check_consistent_length
 from .utils.validation import check_is_fitted
 from .utils.random import random_choice_csc

@@ -117,7 +117,6 @@ def fit(self, X, y, sample_weight=None):
 
         self.sparse_output_ = sp.issparse(y)
 
-        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])
         check_consistent_length(X, y)
 
         if not self.sparse_output_:

@@ -398,8 +397,8 @@ def fit(self, X, y, sample_weight=None):
                              "'mean', 'median', 'quantile' or 'constant'"
                              % self.strategy)
 
-        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
-                         multi_output=True)
+        y = check_array(y, ensure_2d=False)
+
         if len(y) == 0:
             raise ValueError("y must not be empty.")
 
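The net effect of the dummy.py change is that fit() no longer coerces X into a numeric array: DummyClassifier.fit only checks that X and y have consistent lengths, and DummyRegressor.fit validates y alone. A minimal sketch of that behaviour, assuming a scikit-learn build with this revert applied (data values are illustrative):

    from sklearn.dummy import DummyClassifier, DummyRegressor

    # X is never passed through check_array in fit, so non-numeric inputs
    # such as a plain list of documents are accepted again.
    X_text = ["first doc", "second doc", "third doc"]
    clf = DummyClassifier(strategy="most_frequent").fit(X_text, [0, 1, 0])

    # DummyRegressor.fit now only runs check_array(y, ensure_2d=False) on y.
    reg = DummyRegressor(strategy="mean").fit(X_text, [1.0, 2.0, 3.0])
    print(reg.constant_)   # mean of y; X itself was never converted
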
sklearn/feature_extraction/text.py

Lines changed: 15 additions & 15 deletions
@@ -29,8 +29,9 @@
 from ..preprocessing import normalize
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
+from ..utils import deprecated
 from ..utils.fixes import frombuffer_empty, bincount
-from ..utils.validation import check_is_fitted, check_array
+from ..utils.validation import check_is_fitted
 
 __all__ = ['CountVectorizer',
            'ENGLISH_STOP_WORDS',

@@ -158,7 +159,8 @@ def _char_wb_ngrams(self, text_document):
         """Whitespace sensitive char-n-gram tokenization.
 
         Tokenize text_document into a sequence of character n-grams
-        excluding any whitespace (operating only inside word boundaries)"""
+        operating only inside word boundaries. n-grams at the edges
+        of words are padded with space."""
         # normalize white spaces
         text_document = self._white_spaces.sub(" ", text_document)
 

@@ -353,7 +355,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     analyzer : string, {'word', 'char', 'char_wb'} or callable
         Whether the feature should be made of word or character n-grams.
         Option 'char_wb' creates character n-grams only from text inside
-        word boundaries.
+        word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
         out of the raw, unprocessed input.

@@ -552,7 +554,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     analyzer : string, {'word', 'char', 'char_wb'} or callable
         Whether the feature should be made of word or character n-grams.
         Option 'char_wb' creates character n-grams only from text inside
-        word boundaries.
+        word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
         out of the raw, unprocessed input.

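To make the 'char_wb' wording above concrete, here is a small hedged sketch (the vectorizer settings and input text are illustrative, not taken from this commit):

    from sklearn.feature_extraction.text import CountVectorizer

    vec = CountVectorizer(analyzer='char_wb', ngram_range=(3, 3))
    vec.fit(["jumpy fox"])
    # n-grams never cross a word boundary; the ones at the edges of each
    # word are padded with a single space, e.g. ' ju', 'py ', ' fo', 'ox '
    print(sorted(vec.vocabulary_))
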
@@ -1022,8 +1024,7 @@ def fit(self, X, y=None):
             a matrix of term/token counts
         """
         if not sp.issparse(X):
-            X = sp.csc_matrix(X, dtype=np.float64)
-        X = check_array(X, accept_sparse=["csc", "csr"])
+            X = sp.csc_matrix(X)
         if self.use_idf:
             n_samples, n_features = X.shape
             df = _document_frequency(X)

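For context on the surrounding lines: _document_frequency counts, for each column (term), the number of documents with a non-zero entry. A minimal sketch of that computation for a CSC count matrix (values illustrative, not from this commit):

    import numpy as np
    import scipy.sparse as sp

    counts = sp.csc_matrix(np.array([[3, 0, 1],
                                     [2, 0, 0]]))
    # per-column non-zero count == number of documents containing each term
    df = np.diff(counts.indptr)
    print(df)   # [2 0 1]
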
@@ -1056,19 +1057,18 @@ def transform(self, X, copy=True):
         -------
         vectors : sparse matrix, [n_samples, n_features]
         """
-        X = check_array(X, accept_sparse=["csr"], copy=copy,
-                        dtype=[np.float64, np.float32])
+        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
+            # preserve float family dtype
+            X = sp.csr_matrix(X, copy=copy)
+        else:
+            # convert counts or binary occurrences to floats
+            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)
 
         n_samples, n_features = X.shape
 
         if self.sublinear_tf:
-            if sp.issparse(X):
-                np.log(X.data, X.data)
-                X.data += 1
-            else:
-                mask = X != 0
-                X[mask] = np.log(X[mask])
-                X[mask] += 1
+            np.log(X.data, X.data)
+            X.data += 1
 
         if self.use_idf:
             check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')
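
A short worked sketch of the sublinear-tf step retained above (values illustrative): once X is a float CSR matrix, every stored non-zero count c is replaced in place by 1 + log(c), while implicit zeros stay untouched:

    import numpy as np
    import scipy.sparse as sp

    X = sp.csr_matrix(np.array([[3.0, 0.0, 1.0]]))   # term counts as floats
    np.log(X.data, X.data)   # in-place log over the stored non-zero entries
    X.data += 1              # count c -> 1 + log(c)
    print(X.toarray())       # [[2.0986...  0.  1.]]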
