@@ -29,8 +29,9 @@
 from ..preprocessing import normalize
 from .hashing import FeatureHasher
 from .stop_words import ENGLISH_STOP_WORDS
+from ..utils import deprecated
 from ..utils.fixes import frombuffer_empty, bincount
-from ..utils.validation import check_is_fitted, check_array
+from ..utils.validation import check_is_fitted
 
 __all__ = ['CountVectorizer',
            'ENGLISH_STOP_WORDS',
@@ -158,7 +159,8 @@ def _char_wb_ngrams(self, text_document):
         """Whitespace sensitive char-n-gram tokenization.
 
         Tokenize text_document into a sequence of character n-grams
-        excluding any whitespace (operating only inside word boundaries)"""
+        operating only inside word boundaries. n-grams at the edges
+        of words are padded with space."""
         # normalize white spaces
         text_document = self._white_spaces.sub(" ", text_document)
 
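The padding behavior described in the new docstring can be seen by building the 'char_wb' analyzer through the public API; a minimal sketch (the example string and ngram_range are illustrative, not part of this commit):

```python
from sklearn.feature_extraction.text import CountVectorizer

# build the analyzer that ultimately calls _char_wb_ngrams
analyzer = CountVectorizer(analyzer='char_wb',
                           ngram_range=(5, 5)).build_analyzer()

# each word is padded with a space on both sides before n-grams are
# extracted, so n-grams at word edges contain the padding space
print(analyzer('jumpy fox'))
# [' jump', 'jumpy', 'umpy ', ' fox ']
```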
@@ -353,7 +355,7 @@ class HashingVectorizer(BaseEstimator, VectorizerMixin):
     analyzer : string, {'word', 'char', 'char_wb'} or callable
         Whether the feature should be made of word or character n-grams.
         Option 'char_wb' creates character n-grams only from text inside
-        word boundaries.
+        word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
         out of the raw, unprocessed input.
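For HashingVectorizer the analyzer choice composes with stateless feature hashing; a short sketch assuming the parameters shown (the n_features value is arbitrary):

```python
from sklearn.feature_extraction.text import HashingVectorizer

# char_wb n-grams (including their space padding) are hashed into a
# fixed-size feature space, so no vocabulary is stored
vectorizer = HashingVectorizer(analyzer='char_wb', ngram_range=(2, 3),
                               n_features=2 ** 8)
X = vectorizer.transform(['jumpy fox'])
print(X.shape)  # (1, 256)
```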
@@ -552,7 +554,7 @@ class CountVectorizer(BaseEstimator, VectorizerMixin):
     analyzer : string, {'word', 'char', 'char_wb'} or callable
         Whether the feature should be made of word or character n-grams.
         Option 'char_wb' creates character n-grams only from text inside
-        word boundaries.
+        word boundaries; n-grams at the edges of words are padded with space.
 
         If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.
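With CountVectorizer the padded edge n-grams show up directly in the learned vocabulary; a minimal sketch with an illustrative one-word corpus:

```python
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(3, 3))
vectorizer.fit(['fox'])

# 'fox' is padded to ' fox ', so the edge 3-grams keep the space
print(vectorizer.get_feature_names())  # [' fo', 'fox', 'ox ']
```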
@@ -1022,8 +1024,7 @@ def fit(self, X, y=None):
             a matrix of term/token counts
         """
         if not sp.issparse(X):
-            X = sp.csc_matrix(X, dtype=np.float64)
-        X = check_array(X, accept_sparse=["csc", "csr"])
+            X = sp.csc_matrix(X)
         if self.use_idf:
             n_samples, n_features = X.shape
             df = _document_frequency(X)
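With the forced dtype=np.float64 removed (and the check_array validation step dropped), a dense input now keeps its own dtype when fit converts it to CSC. A sketch of the new behavior, with illustrative data:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

# dense float32 term counts; fit now builds the CSC matrix with the
# input's own dtype instead of up-casting to float64 before the
# document frequencies are computed
counts = np.array([[1, 0, 2],
                   [0, 1, 1]], dtype=np.float32)
transformer = TfidfTransformer().fit(counts)
```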
@@ -1056,19 +1057,18 @@ def transform(self, X, copy=True):
         -------
         vectors : sparse matrix, [n_samples, n_features]
         """
-        X = check_array(X, accept_sparse=["csr"], copy=copy,
-                        dtype=[np.float64, np.float32])
+        if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
+            # preserve float family dtype
+            X = sp.csr_matrix(X, copy=copy)
+        else:
+            # convert counts or binary occurrences to floats
+            X = sp.csr_matrix(X, dtype=np.float64, copy=copy)
 
         n_samples, n_features = X.shape
 
         if self.sublinear_tf:
-            if sp.issparse(X):
-                np.log(X.data, X.data)
-                X.data += 1
-            else:
-                mask = X != 0
-                X[mask] = np.log(X[mask])
-                X[mask] += 1
+            np.log(X.data, X.data)
+            X.data += 1
 
         if self.use_idf:
             check_is_fitted(self, '_idf_diag', 'idf vector is not fitted')
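The new transform branches can be exercised directly: float inputs keep their dtype, and sublinear tf is applied in place on the CSR data array, which is safe now that X is always sparse at that point. A sketch using norm=None and use_idf=False so the raw tf values stay visible:

```python
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfTransformer

counts = sp.csr_matrix(np.array([[3, 0, 1],
                                 [0, 2, 0]], dtype=np.float32))

# float input takes the first branch and keeps its float32 dtype
tfidf = TfidfTransformer(norm=None, use_idf=False).fit(counts)
print(tfidf.transform(counts).dtype)  # float32

# sublinear_tf rewrites each stored count c as 1 + log(c) on X.data
tfidf = TfidfTransformer(norm=None, use_idf=False,
                         sublinear_tf=True).fit(counts)
print(tfidf.transform(counts).data)  # [1+log(3), 1+log(1), 1+log(2)]
```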