[MRG+1] Astype fix #4645

Merged (1 commit) on May 19, 2015. Changes from all commits shown below.
3 changes: 0 additions & 3 deletions doc/developers/utilities.rst

@@ -43,9 +43,6 @@ should be used when applicable.
   be sliced or indexed using safe_index. This is used to validate input for
   cross-validation.
 
-- :func:`warn_if_not_float`: Warn if input is not a floating-point value.
-  the input ``X`` is assumed to have ``X.dtype``.
-
 If your code relies on a random number generator, it should never use
 functions like ``numpy.random.random`` or ``numpy.random.normal``. This
 approach can lead to repeatability issues in unit tests. Instead, a
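The context lines above restate the project's random-number convention. As a quick illustration (not part of this PR, and noisy_identity is a hypothetical helper), the recommended pattern routes all randomness through sklearn.utils.check_random_state so results are repeatable under a fixed seed:

import numpy as np
from sklearn.utils import check_random_state

def noisy_identity(n, noise=0.1, random_state=None):
    # check_random_state accepts None, an int seed, or a RandomState instance
    rng = check_random_state(random_state)
    return np.eye(n) + noise * rng.normal(size=(n, n))

print(np.allclose(noisy_identity(3, random_state=0),
                  noisy_identity(3, random_state=0)))  # True: repeatable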
9 changes: 3 additions & 6 deletions sklearn/cluster/k_means_.py

@@ -27,6 +27,7 @@
 from ..utils import as_float_array
 from ..utils import gen_batches
 from ..utils.validation import check_is_fitted
+from ..utils.validation import FLOAT_DTYPES
 from ..utils.random import choice
 from ..externals.joblib import Parallel
 from ..externals.joblib import delayed
@@ -759,18 +760,14 @@ def _check_fit_data(self, X):
         return X
 
     def _check_test_data(self, X):
-        X = check_array(X, accept_sparse='csr')
+        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
+                        warn_on_dtype=True)
         n_samples, n_features = X.shape
         expected_n_features = self.cluster_centers_.shape[1]
         if not n_features == expected_n_features:
             raise ValueError("Incorrect number of features. "
                              "Got %d features, expected %d" % (
                                  n_features, expected_n_features))
-        if X.dtype.kind != 'f':
-            warnings.warn("Got data type %s, converted to float "
-                          "to avoid overflows" % X.dtype,
-                          RuntimeWarning, stacklevel=2)
-            X = X.astype(np.float)
 
         return X
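The user-visible effect of this hunk, sketched below (illustrative, and specific to scikit-learn of this era, since warn_on_dtype was removed in later releases): predict on integer data still converts to float, but the warning now comes from check_array as a DataConversionWarning rather than the hand-rolled RuntimeWarning:

import warnings
import numpy as np
from sklearn.cluster import KMeans

X = (np.random.RandomState(0).normal(size=(40, 2)) * 10).astype(np.uint8)
km = KMeans(n_init=1).fit(X)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    km.predict(X)  # uint8 input is converted to float64 internally
print([w.category.__name__ for w in caught])  # expect ['DataConversionWarning']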
3 changes: 2 additions & 1 deletion sklearn/cluster/tests/test_k_means.py

@@ -17,6 +17,7 @@
 from sklearn.utils.testing import assert_warns
 from sklearn.utils.testing import if_not_mac_os
 
+from sklearn.utils.validation import DataConversionWarning
 from sklearn.utils.extmath import row_norms
 from sklearn.metrics.cluster import v_measure_score
 from sklearn.cluster import KMeans, k_means
@@ -45,7 +46,7 @@ def test_kmeans_dtype():
     X = rnd.normal(size=(40, 2))
     X = (X * 10).astype(np.uint8)
     km = KMeans(n_init=1).fit(X)
-    pred_x = assert_warns(RuntimeWarning, km.predict, X)
+    pred_x = assert_warns(DataConversionWarning, km.predict, X)
     assert_array_equal(km.labels_, pred_x)
3 changes: 2 additions & 1 deletion sklearn/linear_model/stochastic_gradient.py

@@ -22,6 +22,7 @@
 from ..externals import six
 
 from .sgd_fast import plain_sgd, average_sgd
+from ..utils.fixes import astype
 from ..utils.seq_dataset import ArrayDataset, CSRDataset
 from ..utils import compute_class_weight
 from .sgd_fast import Hinge
@@ -867,7 +868,7 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate,
                      n_iter, sample_weight,
                      coef_init, intercept_init):
         X, y = check_X_y(X, y, "csr", copy=False, order='C', dtype=np.float64)
-        y = y.astype(np.float64)
+        y = astype(y, np.float64, copy=False)
 
         n_samples, n_features = X.shape
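The point of astype(y, np.float64, copy=False) is to skip the copy that y.astype(np.float64) would always make, given that check_X_y has already produced a float64 array. sklearn.utils.fixes.astype is a compatibility shim for NumPy versions whose ndarray.astype lacks the copy keyword; a hypothetical sketch of such a shim (the real one lives in sklearn/utils/fixes.py):

import numpy as np

def astype(array, dtype, copy=True):
    # Cast to dtype, but reuse the input array when no conversion is needed.
    if not copy and array.dtype == np.dtype(dtype):
        return array
    return array.astype(dtype)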
9 changes: 3 additions & 6 deletions sklearn/manifold/locally_linear.py

@@ -11,6 +11,7 @@
 from ..utils import check_random_state, check_array
 from ..utils.arpack import eigsh
 from ..utils.validation import check_is_fitted
+from ..utils.validation import FLOAT_DTYPES
 from ..neighbors import NearestNeighbors
 
 
@@ -38,14 +39,10 @@ def barycenter_weights(X, Z, reg=1e-3):
     -----
     See developers note for more information.
     """
-    X = np.asarray(X)
-    Z = np.asarray(Z)
+    X = check_array(X, dtype=FLOAT_DTYPES)
+    Z = check_array(Z, dtype=FLOAT_DTYPES, allow_nd=True)
 
     n_samples, n_neighbors = X.shape[0], Z.shape[1]
-    if X.dtype.kind == 'i':
-        X = X.astype(np.float)
-    if Z.dtype.kind == 'i':
-        Z = Z.astype(np.float)
     B = np.empty((n_samples, n_neighbors), dtype=X.dtype)
     v = np.ones(n_neighbors, dtype=X.dtype)
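When check_array is given a list or tuple of dtypes, it keeps the input dtype if it is already in the list and otherwise converts to the first entry, so the replacement above preserves float32 input instead of the old code's blanket upcast. An illustrative check (FLOAT_DTYPES is assumed here to be roughly (np.float64, np.float32, np.float16)):

import numpy as np
from sklearn.utils import check_array

X_int = np.arange(6).reshape(3, 2)                     # integer input
X_f32 = np.arange(6, dtype=np.float32).reshape(3, 2)   # already float

print(check_array(X_int, dtype=(np.float64, np.float32)).dtype)  # float64
print(check_array(X_f32, dtype=(np.float64, np.float32)).dtype)  # float32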
10 changes: 6 additions & 4 deletions sklearn/manifold/spectral_embedding_.py

@@ -263,7 +263,9 @@ def spectral_embedding(adjacency, n_components=8, eigen_solver=None,
         # problem.
         if not sparse.issparse(laplacian):
             warnings.warn("AMG works better for sparse matrices")
-        laplacian = laplacian.astype(np.float)  # lobpcg needs native floats
+        # lobpcg needs double precision floats
+        laplacian = check_array(laplacian, dtype=np.float64,
+                                accept_sparse=True)
         laplacian = _set_diag(laplacian, 1)
         ml = smoothed_aggregation_solver(check_array(laplacian, 'csr'))
         M = ml.aspreconditioner()
@@ -276,7 +278,9 @@
             raise ValueError
 
     elif eigen_solver == "lobpcg":
-        laplacian = laplacian.astype(np.float)  # lobpcg needs native floats
+        # lobpcg needs double precision floats
+        laplacian = check_array(laplacian, dtype=np.float64,
+                                accept_sparse=True)
         if n_nodes < 5 * n_components + 1:
             # see note above under arpack why lobpcg has problems with small
             # number of nodes
@@ -286,8 +290,6 @@
             lambdas, diffusion_map = eigh(laplacian)

> Member (inline review): and here? that is odd...
> Member (inline review): We can actually remove that check as it's already done before entering the if / else blocks.

             embedding = diffusion_map.T[:n_components] * dd
         else:
-            # lobpcg needs native floats
-            laplacian = laplacian.astype(np.float)
             laplacian = _set_diag(laplacian, 1)
             # We increase the number of eigenvectors requested, as lobpcg
             # doesn't behave well in low dimension
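Note that np.float is just an alias for Python's float (i.e. float64), so the old line already produced double precision on dense input; what check_array adds is uniform handling of sparse matrices plus input validation. A quick illustrative check:

import numpy as np
import scipy.sparse as sp
from sklearn.utils import check_array

A = sp.rand(5, 5, density=0.4, format='csr', dtype=np.float32)
B = check_array(A, dtype=np.float64, accept_sparse=True)
print(type(B).__name__, B.dtype)  # still CSR, now float64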
6 changes: 4 additions & 2 deletions sklearn/naive_bayes.py

@@ -472,7 +472,8 @@ def partial_fit(self, X, y, classes=None, sample_weight=None):
             msg = "X.shape[0]=%d and y.shape[0]=%d are incompatible."
             raise ValueError(msg % (X.shape[0], y.shape[0]))
 
-        # convert to float to support sample weight consistently
+        # label_binarize() returns arrays with dtype=np.int64.
+        # We convert it to np.float64 to support sample_weight consistently
         Y = Y.astype(np.float64)
         if sample_weight is not None:
             Y *= check_array(sample_weight).T
@@ -520,7 +521,8 @@ def fit(self, X, y, sample_weight=None):
         if Y.shape[1] == 1:
             Y = np.concatenate((1 - Y, Y), axis=1)
 
-        # convert to float to support sample weight consistently;
+        # LabelBinarizer().fit_transform() returns arrays with dtype=np.int64.
+        # We convert it to np.float64 to support sample_weight consistently;
         # this means we also don't have to cast X to floating point
         Y = Y.astype(np.float64)
         if sample_weight is not None:
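A sketch of why the cast matters (illustrative, not part of the PR): the binarized label matrix is integer-typed, and in-place multiplication of an integer array by float sample weights either raises or truncates, depending on NumPy's casting rules:

import numpy as np
from sklearn.preprocessing import label_binarize

Y = label_binarize([0, 1, 2, 1], classes=[0, 1, 2])
print(Y.dtype)                    # an integer dtype (np.int64 on most builds)

Y = Y.astype(np.float64)
Y *= np.array([[0.5, 2.0, 1.0, 1.0]]).T  # per-sample weights broadcast cleanly
print(Y[:2])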
32 changes: 15 additions & 17 deletions sklearn/preprocessing/data.py

@@ -15,16 +15,14 @@
 from ..base import BaseEstimator, TransformerMixin
 from ..externals import six
 from ..utils import check_array
-from ..utils import warn_if_not_float
 from ..utils.extmath import row_norms
-from ..utils.fixes import (combinations_with_replacement as combinations_w_r,
-                           bincount)
-from ..utils.fixes import isclose
+from ..utils.fixes import combinations_with_replacement as combinations_w_r
 from ..utils.sparsefuncs_fast import (inplace_csr_row_normalize_l1,
                                       inplace_csr_row_normalize_l2)
 from ..utils.sparsefuncs import (inplace_column_scale, mean_variance_axis,
                                  min_max_axis)
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, FLOAT_DTYPES
 
 
 zip = six.moves.zip
 map = six.moves.map
@@ -115,8 +113,9 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     scaling using the ``Transformer`` API (e.g. as part of a preprocessing
     :class:`sklearn.pipeline.Pipeline`)
     """
-    X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False)
-    warn_if_not_float(X, estimator='The scale function')
+    X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False,
+                    warn_on_dtype=True, estimator='the scale function',
+                    dtype=FLOAT_DTYPES)
     if sparse.issparse(X):
         if with_mean:
             raise ValueError(
@@ -224,8 +223,8 @@ def fit(self, X, y=None):
             The data used to compute the per-feature minimum and maximum
             used for later scaling along the features axis.
         """
-        X = check_array(X, copy=self.copy, ensure_2d=False)
-        warn_if_not_float(X, estimator=self)
+        X = check_array(X, copy=self.copy, ensure_2d=False, warn_on_dtype=True,
+                        estimator=self, dtype=FLOAT_DTYPES)
         feature_range = self.feature_range
         if feature_range[0] >= feature_range[1]:
             raise ValueError("Minimum of desired feature range must be smaller"
@@ -346,9 +345,8 @@ def fit(self, X, y=None):
             used for later scaling along the features axis.
         """
         X = check_array(X, accept_sparse='csr', copy=self.copy,
-                        ensure_2d=False)
-        if warn_if_not_float(X, estimator=self):
-            X = X.astype(np.float)
+                        ensure_2d=False, warn_on_dtype=True,
+                        estimator=self, dtype=FLOAT_DTYPES)
         if sparse.issparse(X):
             if self.with_mean:
                 raise ValueError(
@@ -379,9 +377,9 @@ def transform(self, X, y=None, copy=None):
         check_is_fitted(self, 'std_')
 
         copy = copy if copy is not None else self.copy
-        X = check_array(X, accept_sparse='csr', copy=copy, ensure_2d=False)
-        if warn_if_not_float(X, estimator=self):
-            X = X.astype(np.float)
+        X = check_array(X, accept_sparse='csr', copy=copy,
+                        ensure_2d=False, warn_on_dtype=True,
+                        estimator=self, dtype=FLOAT_DTYPES)
         if sparse.issparse(X):
             if self.with_mean:
                 raise ValueError(
@@ -600,8 +598,8 @@ def normalize(X, norm='l2', axis=1, copy=True):
     else:
         raise ValueError("'%d' is not a supported axis" % axis)
 
-    X = check_array(X, sparse_format, copy=copy)
-    warn_if_not_float(X, 'The normalize function')
+    X = check_array(X, sparse_format, copy=copy, warn_on_dtype=True,
+                    estimator='the normalize function', dtype=FLOAT_DTYPES)
     if axis == 0:
         X = X.T
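The net effect on the preprocessing entry points, sketched below (again era-specific, since warn_on_dtype was later removed): integer input is converted up front and the warning names the estimator, matching the message the updated test asserts:

import warnings
import numpy as np
from sklearn.preprocessing import scale

X = np.array([[1, 2, 0], [0, 0, 0]], dtype=np.uint8)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    scale(X)
print(caught[0].message)
# Data with input dtype uint8 was converted to float64 by the scale function.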
9 changes: 5 additions & 4 deletions sklearn/preprocessing/tests/test_data.py

@@ -29,6 +29,7 @@
 from sklearn.preprocessing.data import MinMaxScaler
 from sklearn.preprocessing.data import add_dummy_feature
 from sklearn.preprocessing.data import PolynomialFeatures
+from sklearn.utils.validation import DataConversionWarning
 
 from sklearn import datasets
@@ -499,12 +500,12 @@ def test_warning_scaling_integers():
     X = np.array([[1, 2, 0],
                   [0, 0, 0]], dtype=np.uint8)
 
-    w = "assumes floating point values as input, got uint8"
+    w = "Data with input dtype uint8 was converted to float64"
 
     clean_warning_registry()
-    assert_warns_message(UserWarning, w, scale, X)
-    assert_warns_message(UserWarning, w, StandardScaler().fit, X)
-    assert_warns_message(UserWarning, w, MinMaxScaler().fit, X)
+    assert_warns_message(DataConversionWarning, w, scale, X)
+    assert_warns_message(DataConversionWarning, w, StandardScaler().fit, X)
+    assert_warns_message(DataConversionWarning, w, MinMaxScaler().fit, X)
 
 
 def test_normalizer_l1():
3 changes: 1 addition & 2 deletions sklearn/utils/__init__.py

@@ -9,7 +9,7 @@
 
 from .murmurhash import murmurhash3_32
 from .validation import (as_float_array,
-                         assert_all_finite, warn_if_not_float,
+                         assert_all_finite,
                          check_random_state, column_or_1d, check_array,
                          check_consistent_length, check_X_y, indexable,
                          check_symmetric, DataConversionWarning)
@@ -19,7 +19,6 @@
 __all__ = ["murmurhash3_32", "as_float_array",
            "assert_all_finite", "check_array",
-           "warn_if_not_float",
            "check_random_state",
            "compute_class_weight", "compute_sample_weight",
            "column_or_1d", "safe_indexing",
3 changes: 2 additions & 1 deletion sklearn/utils/graph.py

@@ -13,6 +13,7 @@
 import numpy as np
 from scipy import sparse
 
+from .validation import check_array
 from .graph_shortest_path import graph_shortest_path
@@ -113,7 +114,7 @@ def graph_laplacian(csgraph, normed=False, return_diag=False):
 
     if normed and (np.issubdtype(csgraph.dtype, np.int)
                    or np.issubdtype(csgraph.dtype, np.uint)):
-        csgraph = csgraph.astype(np.float)
+        csgraph = check_array(csgraph, dtype=np.float64, accept_sparse=True)
 
     if sparse.isspmatrix(csgraph):
         return _laplacian_sparse(csgraph, normed=normed,
4 changes: 2 additions & 2 deletions sklearn/utils/random.py

@@ -8,7 +8,7 @@
 import array
 
 from sklearn.utils import check_random_state
-
+from sklearn.utils.fixes import astype
 from ._random import sample_without_replacement
 
 __all__ = ['sample_without_replacement', 'choice']
@@ -238,7 +238,7 @@ def random_choice_csc(n_samples, classes, class_probability=None,
         if classes[j].dtype.kind != 'i':
             raise ValueError("class dtype %s is not supported" %
                              classes[j].dtype)
-        classes[j] = classes[j].astype(int)
+        classes[j] = astype(classes[j], np.int64, copy=False)
 
         # use uniform distribution if no class_probability is given
         if class_probability is None:
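Pinning np.int64 rather than the builtin int is plausibly deliberate (an inference, not stated in the PR): np.dtype(int) is platform-dependent in NumPy of this era (int32 on Windows), while copy=False avoids a copy when the array is already int64:

import numpy as np

print(np.dtype(int))     # int64 on most Linux/macOS builds; int32 on Windows
                         # for NumPy of this era
classes = np.array([0, 1, 3], dtype=np.int64)
out = classes.astype(np.int64, copy=False)
print(out is classes)    # True: no copy when the dtype already matches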
6 changes: 3 additions & 3 deletions sklearn/utils/testing.py

@@ -212,7 +212,7 @@ def assert_warns_message(warning_class, message, func, *args, **kw):
         raise AssertionError("No warning raised when calling %s"
                              % func.__name__)
 
-    found = [warning.category is warning_class for warning in w]
+    found = [issubclass(warning.category, warning_class) for warning in w]
     if not any(found):
         raise AssertionError("No warning raised for %s with class "
                              "%s"
@@ -235,8 +235,8 @@
 
     if not message_found:
         raise AssertionError("Did not receive the message you expected "
-                             "('%s') for <%s>."
-                             % (message, func.__name__))
+                             "('%s') for <%s>, got: '%s'"
+                             % (message, func.__name__, msg))
 
     return result
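The switch from `is` to issubclass() is what lets existing tests that assert a UserWarning keep passing when the code now emits the more specific DataConversionWarning; for example:

from sklearn.utils.validation import DataConversionWarning

print(DataConversionWarning is UserWarning)            # False: old check misses it
print(issubclass(DataConversionWarning, UserWarning))  # True: new check accepts it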