[MRG+1] Astype fix #4645

Merged (1 commit) on May 19, 2015. Changes from all commits shown below.
3 changes: 0 additions & 3 deletions doc/developers/utilities.rst

@@ -43,9 +43,6 @@ should be used when applicable.
   be sliced or indexed using safe_index. This is used to validate input for
   cross-validation.
 
-- :func:`warn_if_not_float`: Warn if input is not a floating-point value.
-  the input ``X`` is assumed to have ``X.dtype``.
-
 If your code relies on a random number generator, it should never use
 functions like ``numpy.random.random`` or ``numpy.random.normal``. This
 approach can lead to repeatability issues in unit tests. Instead, a
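The context lines above restate the project's random-number convention. As a quick illustration (not part of this PR, and noisy_identity is a hypothetical helper), the recommended pattern routes all randomness through sklearn.utils.check_random_state so results are repeatable under a fixed seed:

import numpy as np
from sklearn.utils import check_random_state

def noisy_identity(n, noise=0.1, random_state=None):
    # check_random_state accepts None, an int seed, or a RandomState instance
    rng = check_random_state(random_state)
    return np.eye(n) + noise * rng.normal(size=(n, n))

print(np.allclose(noisy_identity(3, random_state=0),
                  noisy_identity(3, random_state=0)))  # True: repeatable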
9 changes: 3 additions & 6 deletions sklearn/cluster/k_means_.py

@@ -27,6 +27,7 @@
 from ..utils import as_float_array
 from ..utils import gen_batches
 from ..utils.validation import check_is_fitted
+from ..utils.validation import FLOAT_DTYPES
 from ..utils.random import choice
 from ..externals.joblib import Parallel
 from ..externals.joblib import delayed
@@ -759,18 +760,14 @@ def _check_fit_data(self, X):
         return X
 
     def _check_test_data(self, X):
-        X = check_array(X, accept_sparse='csr')
+        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
+                        warn_on_dtype=True)
         n_samples, n_features = X.shape
         expected_n_features = self.cluster_centers_.shape[1]
         if not n_features == expected_n_features:
             raise ValueError("Incorrect number of features. "
                              "Got %d features, expected %d" % (
                                  n_features, expected_n_features))
-        if X.dtype.kind != 'f':
-            warnings.warn("Got data type %s, converted to float "
-                          "to avoid overflows" % X.dtype,
-                          RuntimeWarning, stacklevel=2)
-            X = X.astype(np.float)
 
         return X
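The user-visible effect of this hunk, sketched below (illustrative, and specific to scikit-learn of this era, since warn_on_dtype was removed in later releases): predict on integer data still converts to float, but the warning now comes from check_array as a DataConversionWarning rather than the hand-rolled RuntimeWarning:

import warnings
import numpy as np
from sklearn.cluster import KMeans

X = (np.random.RandomState(0).normal(size=(40, 2)) * 10).astype(np.uint8)
km = KMeans(n_init=1).fit(X)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    km.predict(X)  # uint8 input is converted to float64 internally
print([w.category.__name__ for w in caught])  # expect ['DataConversionWarning']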
3 changes: 2 additions & 1 deletion sklearn/cluster/tests/test_k_means.py

@@ -17,6 +17,7 @@
 from sklearn.utils.testing import assert_warns
 from sklearn.utils.testing import if_not_mac_os
 
+from sklearn.utils.validation import DataConversionWarning
 from sklearn.utils.extmath import row_norms
 from sklearn.metrics.cluster import v_measure_score
 from sklearn.cluster import KMeans, k_means
@@ -45,7 +46,7 @@ def test_kmeans_dtype():
     X = rnd.normal(size=(40, 2))
     X = (X * 10).astype(np.uint8)
     km = KMeans(n_init=1).fit(X)
-    pred_x = assert_warns(RuntimeWarning, km.predict, X)
+    pred_x = assert_warns(DataConversionWarning, km.predict, X)
     assert_array_equal(km.labels_, pred_x)
3 changes: 2 additions & 1 deletion sklearn/linear_model/stochastic_gradient.py

@@ -22,6 +22,7 @@
 from ..externals import six
 
 from .sgd_fast import plain_sgd, average_sgd
+from ..utils.fixes import astype
 from ..utils.seq_dataset import ArrayDataset, CSRDataset
 from ..utils import compute_class_weight
 from .sgd_fast import Hinge
@@ -867,7 +868,7 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate,
                      n_iter, sample_weight,
                      coef_init, intercept_init):
         X, y = check_X_y(X, y, "csr", copy=False, order='C', dtype=np.float64)
-        y = y.astype(np.float64)
+        y = astype(y, np.float64, copy=False)
 
         n_samples, n_features = X.shape
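The point of astype(y, np.float64, copy=False) is to skip the copy that y.astype(np.float64) would always make, given that check_X_y has already produced a float64 array. sklearn.utils.fixes.astype is a compatibility shim for NumPy versions whose ndarray.astype lacks the copy keyword; a hypothetical sketch of such a shim (the real one lives in sklearn/utils/fixes.py):

import numpy as np

def astype(array, dtype, copy=True):
    # Cast to dtype, but reuse the input array when no conversion is needed.
    if not copy and array.dtype == np.dtype(dtype):
        return array
    return array.astype(dtype)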
9 changes: 3 additions & 6 deletions sklearn/manifold/locally_linear.py

@@ -11,6 +11,7 @@
 from ..utils import check_random_state, check_array
 from ..utils.arpack import eigsh
 from ..utils.validation import check_is_fitted
+from ..utils.validation import FLOAT_DTYPES
 from ..neighbors import NearestNeighbors
 
 
@@ -38,14 +39,10 @@ def barycenter_weights(X, Z, reg=1e-3):
     -----
     See developers note for more information.
     """
-    X = np.asarray(X)
-    Z = np.asarray(Z)
+    X = check_array(X, dtype=FLOAT_DTYPES)
+    Z = check_array(Z, dtype=FLOAT_DTYPES, allow_nd=True)
 
     n_samples, n_neighbors = X.shape[0], Z.shape[1]
-    if X.dtype.kind == 'i':
-        X = X.astype(np.float)
-    if Z.dtype.kind == 'i':
-        Z = Z.astype(np.float)
     B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
     v = np.ones(n_neighbors, dtype=X.dtype)
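When check_array is given a list or tuple of dtypes, it keeps the input dtype if it is already in the list and otherwise converts to the first entry, so the replacement above preserves float32 input instead of the old code's blanket upcast. An illustrative check (FLOAT_DTYPES is assumed here to be roughly (np.float64, np.float32, np.float16)):

import numpy as np
from sklearn.utils import check_array

X_int = np.arange(6).reshape(3, 2)                     # integer input
X_f32 = np.arange(6, dtype=np.float32).reshape(3, 2)   # already float

print(check_array(X_int, dtype=(np.float64, np.float32)).dtype)  # float64
print(check_array(X_f32, dtype=(np.float64, np.float32)).dtype)  # float32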
10 changes: 6 additions & 4 deletions sklearn/manifold/spectral_embedding_.py

@@ -263,7 +263,9 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
         # problem.
         if not sparse.issparse(laplacian):
             warnings.warn("AMG works better for sparse matrices")
-        laplacian = laplacian.astype(np.float)  # lobpcg needs native floats
+        # lobpcg needs double precision floats
+        laplacian = check_array(laplacian, dtype=np.float64,
+                                accept_sparse=True)
         laplacian = _set_diag(laplacian, 1)
         ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
         M = ml.aspreconditioner()
@@ -276,7 +278,9 @@
             raise ValueError
 
     elif eigen_solver == "lobpcg":
-        laplacian = laplacian.astype(np.float)  # lobpcg needs native floats
+        # lobpcg needs double precision floats
+        laplacian = check_array(laplacian, dtype=np.float64,
+                                accept_sparse=True)
         if n_nodes < 5 * n_components + 1:
             # see note above under arpack why lobpcg has problems with small
             # number of nodes
@@ -286,8 +290,6 @@
             lambdas, diffusion_map = eigh(laplacian)

> Member (inline review): and here? that is odd...
> Member (inline review): We can actually remove that check as it's already done before entering the if / else blocks.

             embedding = diffusion_map.T[:n_components] * dd
         else:
-            # lobpcg needs native floats
-            laplacian = laplacian.astype(np.float)
             laplacian = _set_diag(laplacian, 1)
             # We increase the number of eigenvectors requested, as lobpcg
             # doesn't behave well in low dimension
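Note that np.float is just an alias for Python's float (i.e. float64), so the old line already produced double precision on dense input; what check_array adds is uniform handling of sparse matrices plus input validation. A quick illustrative check:

import numpy as np
import scipy.sparse as sp
from sklearn.utils import check_array

A = sp.rand(5, 5, density=0.4, format='csr', dtype=np.float32)
B = check_array(A, dtype=np.float64, accept_sparse=True)
print(type(B).__name__, B.dtype)  # still CSR, now float64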
6 changes: 4 additions & 2 deletions sklearn/naive_bayes.py

@@ -472,7 +472,8 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
             msg = "X.shape[0]=%d and y.shape[0]=%d are incompatible."
             raise ValueError(msg % (X.shape[0], y.shape[0]))
 
-        # convert to float to support sample weight consistently
+        # label_binarize() returns arrays with dtype=np.int64.
+        # We convert it to np.float64 to support sample_weight consistently
         Y = Y.astype(np.float64)
         if sample_weight is not None:
             Y *= check_array(sample_weight).T
@@ -520,7 +521,8 @@ def fit(self, X, y, sample_weight=None):
         if Y.shape[1] == 1:
             Y = np.concatenate((1 - Y, Y), axis=1)
 
-        # convert to float to support sample weight consistently;
+        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64.
+        # We convert it to np.float64 to support sample_weight consistently;
         # this means we also don't have to cast X to floating point
         Y = Y.astype(np.float64)
         if sample_weight is not None:
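A sketch of why the cast matters (illustrative, not part of the PR): the binarized label matrix is integer-typed, and in-place multiplication of an integer array by float sample weights either raises or truncates, depending on NumPy's casting rules:

import numpy as np
from sklearn.preprocessing import label_binarize

Y = label_binarize([0, 1, 2, 1], classes=[0, 1, 2])
print(Y.dtype)                    # an integer dtype (np.int64 on most builds)

Y = Y.astype(np.float64)
Y *= np.array([[0.5, 2.0, 1.0, 1.0]]).T  # per-sample weights broadcast cleanly
print(Y[:2])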
32 changes: 15 additions & 17 deletions sklearn/preprocessing/data.py

@@ -15,16 +15,14 @@
 from ..base import BaseEstimator, TransformerMixin
 from ..externals import six
 from ..utils import check_array
-from ..utils import warn_if_not_float
 from ..utils.extmath import row_norms
-from ..utils.fixes import (combinations_with_replacement as combinations_w_r,
-                           bincount)
-from ..utils.fixes import isclose
+from ..utils.fixes import combinations_with_replacement as combinations_w_r
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis,
                                  min_max_axis)
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, FLOAT_DTYPES
 
 
 zip = six.moves.zip
 map = six.moves.map
@@ -115,8 +113,9 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     scaling using the ``Transformer`` API (e.g. as part of a preprocessing
     :class:`sklearn.pipeline.Pipeline`)
     """
-    X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False)
-    warn_if_not_float(X, estimator='The scale function')
+    X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False,
+                    warn_on_dtype=True, estimator='the scale function',
+                    dtype=FLOAT_DTYPES)
     if sparse.issparse(X):
         if with_mean:
             raise ValueError(
@@ -224,8 +223,8 @@ def fit(self, X, y=None):
             The data used to compute the per-feature minimum and maximum
             used for later scaling along the features axis.
         """
-        X = check_array(X, copy=self.copy, ensure_2d=False)
-        warn_if_not_float(X, estimator=self)
+        X = check_array(X, copy=self.copy, ensure_2d=False, warn_on_dtype=True,
+                        estimator=self, dtype=FLOAT_DTYPES)
         feature_range = self.feature_range
         if feature_range[0] >= feature_range[1]:
             raise ValueError("Minimum of desired feature range must be smaller"
@@ -346,9 +345,8 @@ def fit(self, X, y=None):
             used for later scaling along the features axis.
         """
         X = check_array(X, accept_sparse='csr', copy=self.copy,
-                        ensure_2d=False)
-        if warn_if_not_float(X, estimator=self):
-            X = X.astype(np.float)
+                        ensure_2d=False, warn_on_dtype=True,
+                        estimator=self, dtype=FLOAT_DTYPES)
         if sparse.issparse(X):
             if self.with_mean:
                 raise ValueError(
@@ -379,9 +377,9 @@ def transform(self, X, y=None, copy=None):
         check_is_fitted(self, 'std_')
 
         copy = copy if copy is not None else self.copy
-        X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False)
-        if warn_if_not_float(X, estimator=self):
-            X = X.astype(np.float)
+        X = check_array(X, accept_sparse='csr', copy=copy,
+                        ensure_2d=False, warn_on_dtype=True,
+                        estimator=self, dtype=FLOAT_DTYPES)
         if sparse.issparse(X):
             if self.with_mean:
                 raise ValueError(
@@ -600,8 +598,8 @@ def normalize(X, norm='l2', axis=1, copy=True):
     else:
         raise ValueError("'%d' is not a supported axis" % axis)
 
-    X = check_array(X, sparse_format, copy=copy)
-    warn_if_not_float(X, 'The normalize function')
+    X = check_array(X, sparse_format, copy=copy, warn_on_dtype=True,
+                    estimator='the normalize function', dtype=FLOAT_DTYPES)
     if axis == 0:
         X = X.T
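The net effect on the preprocessing entry points, sketched below (again era-specific, since warn_on_dtype was later removed): integer input is converted up front and the warning names the estimator, matching the message the updated test asserts:

import warnings
import numpy as np
from sklearn.preprocessing import scale

X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    scale(X)
print(caught[0].message)
# Data with input dtype uint8 was converted to float64 by the scale function.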
9 changes: 5 additions & 4 deletions sklearn/preprocessing/tests/test_data.py

@@ -29,6 +29,7 @@
 from sklearn.preprocessing.data import MinMaxScaler
 from sklearn.preprocessing.data import add_dummy_feature
 from sklearn.preprocessing.data import PolynomialFeatures
+from sklearn.utils.validation import DataConversionWarning
 
 from sklearn import datasets
@@ -499,12 +500,12 @@ def test_warning_scaling_integers():
     X = np.array([[1, 2, 0],
                   [0, 0, 0]], dtype=np.uint8)
 
-    w = "assumes floating point values as input, got uint8"
+    w = "Data with input dtype uint8 was converted to float64"
 
     clean_warning_registry()
-    assert_warns_message(UserWarning, w, scale, X)
-    assert_warns_message(UserWarning, w, StandardScaler().fit, X)
-    assert_warns_message(UserWarning, w, MinMaxScaler().fit, X)
+    assert_warns_message(DataConversionWarning, w, scale, X)
+    assert_warns_message(DataConversionWarning, w, StandardScaler().fit, X)
+    assert_warns_message(DataConversionWarning, w, MinMaxScaler().fit, X)
 
 
 def test_normalizer_l1():
3 changes: 1 addition & 2 deletions sklearn/utils/__init__.py

@@ -9,7 +9,7 @@
 
 from .murmurhash import murmurhash3_32
 from .validation import (as_float_array,
-                         assert_all_finite, warn_if_not_float,
+                         assert_all_finite,
                          check_random_state, column_or_1d, check_array,
                          check_consistent_length, check_X_y, indexable,
                          check_symmetric, DataConversionWarning)
@@ -19,7 +19,6 @@
 __all__ = ["murmurhash3_32", "as_float_array",
            "assert_all_finite", "check_array",
-           "warn_if_not_float",
            "check_random_state",
            "compute_class_weight", "compute_sample_weight",
            "column_or_1d", "safe_indexing",
3 changes: 2 additions & 1 deletion sklearn/utils/graph.py

@@ -13,6 +13,7 @@
 import numpy as np
 from scipy import sparse
 
+from .validation import check_array
 from .graph_shortest_path import graph_shortest_path
@@ -113,7 +114,7 @@ def graph_laplacian(csgraph, normed=False, return_diag=False):
 
     if normed and (np.issubdtype(csgraph.dtype, np.int)
                    or np.issubdtype(csgraph.dtype, np.uint)):
-        csgraph = csgraph.astype(np.float)
+        csgraph = check_array(csgraph, dtype=np.float64, accept_sparse=True)
 
     if sparse.isspmatrix(csgraph):
         return _laplacian_sparse(csgraph, normed=normed,
4 changes: 2 additions & 2 deletions sklearn/utils/random.py

@@ -8,7 +8,7 @@
 import array
 
 from sklearn.utils import check_random_state
-
+from sklearn.utils.fixes import astype
 from ._random import sample_without_replacement
 
 __all__ = ['sample_without_replacement', 'choice']
@@ -238,7 +238,7 @@ def random_choice_csc(n_samples, classes, class_probability=None,
         if classes[j].dtype.kind != 'i':
             raise ValueError("class dtype %s is not supported" %
                              classes[j].dtype)
-        classes[j] = classes[j].astype(int)
+        classes[j] = astype(classes[j], np.int64, copy=False)
 
         # use uniform distribution if no class_probability is given
         if class_probability is None:
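Pinning np.int64 rather than the builtin int is plausibly deliberate (an inference, not stated in the PR): np.dtype(int) is platform-dependent in NumPy of this era (int32 on Windows), while copy=False avoids a copy when the array is already int64:

import numpy as np

print(np.dtype(int))     # int64 on most Linux/macOS builds; int32 on Windows
                         # for NumPy of this era
classes = np.array([0, 1, 3], dtype=np.int64)
out = classes.astype(np.int64, copy=False)
print(out is classes)    # True: no copy when the dtype already matches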
6 changes: 3 additions & 3 deletions sklearn/utils/testing.py

@@ -212,7 +212,7 @@ def assert_warns_message(warning_class, message, func, *args, **kw):
         raise AssertionError("No warning raised when calling %s"
                              % func.__name__)
 
-    found = [warning.category is warning_class for warning in w]
+    found = [issubclass(warning.category, warning_class) for warning in w]
     if not any(found):
         raise AssertionError("No warning raised for %s with class "
                              "%s"
@@ -235,8 +235,8 @@
 
     if not message_found:
         raise AssertionError("Did not receive the message you expected "
-                             "('%s') for <%s>."
-                             % (message, func.__name__))
+                             "('%s') for <%s>, got: '%s'"
+                             % (message, func.__name__, msg))
 
     return result
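The switch from `is` to issubclass() is what lets existing tests that assert a UserWarning keep passing when the code now emits the more specific DataConversionWarning; for example:

from sklearn.utils.validation import DataConversionWarning

print(DataConversionWarning is UserWarning)            # False: old check misses it
print(issubclass(DataConversionWarning, UserWarning))  # True: new check accepts it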