
ENH: refactored utils/validation._check_sample_weights() and added stronger sample_weights checks for all estimators #14653


Closed · wants to merge 5 commits
8 changes: 4 additions & 4 deletions sklearn/calibration.py
@@ -23,6 +23,7 @@
from .preprocessing import label_binarize, LabelBinarizer
from .utils import check_X_y, check_array, indexable, column_or_1d
from .utils.validation import check_is_fitted, check_consistent_length
from .utils.validation import _check_sample_weight
from .isotonic import IsotonicRegression
from .svm import LinearSVC
from .model_selection import check_cv
@@ -155,6 +156,9 @@ def fit(self, X, y, sample_weight=None):
else:
base_estimator = self.base_estimator

if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)

if self.cv == "prefit":
calibrated_classifier = _CalibratedClassifier(
base_estimator, method=self.method)
@@ -172,12 +176,8 @@ def fit(self, X, y, sample_weight=None):
warnings.warn("%s does not support sample_weight. Samples"
" weights are only used for the calibration"
" itself." % estimator_name)
sample_weight = check_array(sample_weight, ensure_2d=False)
base_estimator_sample_weight = None
else:
if sample_weight is not None:
sample_weight = check_array(sample_weight, ensure_2d=False)
check_consistent_length(y, sample_weight)
base_estimator_sample_weight = sample_weight
for train, test in cv.split(X, y):
this_estimator = clone(base_estimator)
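The two removed calls (check_array plus check_consistent_length) are folded into the single _check_sample_weight call added above. A minimal sketch of what that call does for the list/array case, based on the helper shown further down in this diff (the toy data is made up):

import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.zeros((4, 2))

# replaces check_array(sample_weight, ensure_2d=False): coerce to a 1-D float array
sw = _check_sample_weight([1, 2, 3, 4], X)      # array([1., 2., 3., 4.])

# replaces check_consistent_length(y, sample_weight): a length mismatch now raises
# ValueError: sample_weight.shape == (3,), expected (4,)!
# _check_sample_weight([1, 2, 3], X)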
4 changes: 2 additions & 2 deletions sklearn/ensemble/forest.py
@@ -60,7 +60,7 @@ class calls the ``fit`` method of each sub-estimator on random samples
from .base import BaseEnsemble, _partition_estimators
from ..utils.fixes import parallel_helper, _joblib_parallel_args
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_is_fitted
from ..utils.validation import check_is_fitted, _check_sample_weight


__all__ = ["RandomForestClassifier",
@@ -243,7 +243,7 @@ def fit(self, X, y, sample_weight=None):
X = check_array(X, accept_sparse="csc", dtype=DTYPE)
y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
if sample_weight is not None:
sample_weight = check_array(sample_weight, ensure_2d=False)
sample_weight = _check_sample_weight(sample_weight, X)
if issparse(X):
# Pre-sort indices to avoid that each individual tree of the
# ensemble sorts the indices.
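One subtlety worth noting: forest.fit casts X to float32 (dtype=DTYPE above), while the helper only enforces a float dtype on the weights, keeping float32 weights as float32 and defaulting everything else to float64. An illustrative snippet, assuming the dtype handling shown further down in this diff:

import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.random.rand(3, 2).astype(np.float32)   # forests validate X as float32

_check_sample_weight([1, 2, 3], X).dtype                      # float64 (the default)
_check_sample_weight(np.ones(3, dtype=np.float32), X).dtype   # float32 is preserved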
11 changes: 5 additions & 6 deletions sklearn/linear_model/base.py
@@ -14,7 +14,6 @@
# License: BSD 3 clause

from abc import ABCMeta, abstractmethod
import numbers
import warnings

import numpy as np
@@ -34,7 +33,7 @@
from ..utils.fixes import sparse_lsqr
from ..utils.seq_dataset import ArrayDataset32, CSRDataset32
from ..utils.seq_dataset import ArrayDataset64, CSRDataset64
from ..utils.validation import check_is_fitted
from ..utils.validation import check_is_fitted, _check_sample_weight
from ..preprocessing.data import normalize as f_normalize

# TODO: bayesian_ridge_regression and bayesian_regression_ard
@@ -118,8 +117,8 @@ def _preprocess_data(X, y, fit_intercept, normalize=False, copy=True,
centered. This function also systematically makes y consistent with X.dtype
"""

if isinstance(sample_weight, numbers.Number):
sample_weight = None
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X)

if check_input:
X = check_array(X, copy=copy, accept_sparse=['csr', 'csc'],
@@ -467,8 +466,8 @@ def fit(self, X, y, sample_weight=None):
X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
y_numeric=True, multi_output=True)

if sample_weight is not None and np.atleast_1d(sample_weight).ndim > 1:
raise ValueError("Sample weights must be 1D array or scalar")
if sample_weight is not None:
sample_weight = _check_sample_weight(sample_weight, X, X.dtype)

X, y, X_offset, y_offset, X_scale = self._preprocess_data(
X, y, fit_intercept=self.fit_intercept, normalize=self.normalize,
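For LinearRegression.fit, the old guard only rejected weights with more than one dimension; the helper keeps that check and also verifies the number of weights against n_samples. An illustrative comparison (values are made up):

import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.random.rand(5, 3)

# rejected by both the old ndim check and the helper:
# _check_sample_weight(np.ones((5, 2)), X)   # ValueError: Sample weights must be 1D array or scalar

# passed the old ndim-only check and failed only later downstream; the helper raises up front:
# _check_sample_weight(np.ones(4), X)        # ValueError: sample_weight.shape == (4,), expected (5,)!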
6 changes: 6 additions & 0 deletions sklearn/utils/tests/test_validation.py
@@ -912,6 +912,12 @@ def test_check_sample_weight():
sample_weight = _check_sample_weight(None, X, dtype=X.dtype)
assert sample_weight.dtype == np.float64

# wrongly formated sample_weight
sample_weight = np.array(["1", "pi", "e"])
err_msg = "could not convert string to float: 'pi'"
with pytest.raises(ValueError, match=err_msg):
_check_sample_weight(sample_weight, X)


@pytest.mark.parametrize("toarray", [
np.array, sp.csr_matrix, sp.csc_matrix])
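For context, the helper's accepted inputs look roughly like this (an illustrative session; return values are shown as comments):

import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.ones((3, 2))

_check_sample_weight(None, X)              # array([1., 1., 1.]), default uniform weights
_check_sample_weight(2, X)                 # array([2., 2., 2.]), a scalar is broadcast
_check_sample_weight([0.5, 1.0, 1.5], X)   # array([0.5, 1. , 1.5]), list coerced to float
# _check_sample_weight(np.array(["1", "pi", "e"]), X)   # ValueError, as asserted in the new test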
46 changes: 31 additions & 15 deletions sklearn/utils/validation.py
@@ -1025,28 +1025,44 @@ def _check_sample_weight(sample_weight, X, dtype=None):
"""
n_samples = _num_samples(X)

if dtype is not None and dtype not in [np.float32, np.float64]:
# this check is needed to ensure that we don't change the dtype of
# of sample_weight if it's already np.float32.
# since sample_weight can be a list or an array, we first
# need to verify that it has a dtype attribute before the check.
# if dtype is None or any other type besides np.float32, np.float64
# is given.

if hasattr(sample_weight, "dtype"):
Member: This is useless I think. When this is an array we will return it directly.

maxwell-aladago (Contributor Author), Aug 23, 2019: It could be an array of strings; returning it immediately can lead to problems later.

Edit: or it could have the wrong number of elements or dimensions.

dtype = sample_weight.dtype

if dtype not in [np.float32, np.float64]:
dtype = np.float64

if sample_weight is None or isinstance(sample_weight, numbers.Number):
if sample_weight is None:
sample_weight = np.ones(n_samples, dtype=dtype)
else:
elif isinstance(sample_weight, numbers.Number):
Member: the else statement was fine, wasn't it?

Member: I would even write

if sample_weight is None:
    sample_weight = np.ones(...)
elif isinstance(..., Number):
    ...
else:
    return sample_weight

Basically remove the first check.

Contributor Author: The sample_weight may have the wrong number of dimensions or elements if it's already an array. Thus, the further checks are necessary. We can only ignore the checks below if sample_weight is created within the function (i.e., when sample_weight is None or it's an integer).
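To make the concern concrete (a hypothetical illustration, not code from this PR): if the helper returned any array input unchanged, inputs like the following would only fail later, deep inside an estimator, instead of at the validation boundary.

import numpy as np

# array inputs that would slip through an early return but still need validation:
np.array(["1", "pi", "e"])   # string dtype, cannot be cast to float
np.ones((4, 2))              # 2-D, not a 1-D weight vector
np.ones(3)                   # wrong length for, say, a 4-sample X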

sample_weight = np.full(n_samples, sample_weight,
dtype=dtype)
else:
if dtype is None:
dtype = [np.float64, np.float32]
sample_weight = check_array(
sample_weight, accept_sparse=False,
ensure_2d=False, dtype=dtype, order="C"
)
if sample_weight.ndim != 1:
raise ValueError("Sample weights must be 1D array or scalar")

if sample_weight.shape != (n_samples,):
raise ValueError("sample_weight.shape == {}, expected {}!"
.format(sample_weight.shape, (n_samples,)))
return sample_weight

# at this point, sample_weight is either a list or
# an array. These checks will validate that the dtype
# of the returned sample_weight is either np.float32 or
# np.float64. If sample weight contained elements which
# cannot be passed safely to the above types, the
# following line will raise a ValueError
sample_weight = np.array(sample_weight, dtype=dtype)

# sample_weights must be 1-D arrays
if sample_weight.ndim != 1:
Member: We already return sample_weight if it was an array. Shall we make the check in this case as well?

Contributor Author: We don't return sample_weight if it's an array. We need to check that its dtype is one of np.float32 or np.float64.

raise ValueError("Sample weights must be 1D array or scalar")

# and must have the same number of elements
# as X
if sample_weight.shape[0] != n_samples:
raise ValueError("sample_weight.shape == {}, expected {}!"
.format(sample_weight.shape, (n_samples, )))
return sample_weight
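Putting the added lines and comments together, the helper after this change would read roughly as follows. This is a reconstruction (the rendered diff above does not mark additions and deletions), so indentation and comment wording are approximate; _num_samples is the private helper defined earlier in sklearn/utils/validation.py.

import numbers

import numpy as np


def _check_sample_weight(sample_weight, X, dtype=None):
    n_samples = _num_samples(X)  # defined earlier in sklearn/utils/validation.py

    # keep an existing np.float32 dtype; anything else falls back to np.float64
    if hasattr(sample_weight, "dtype"):
        dtype = sample_weight.dtype
    if dtype not in [np.float32, np.float64]:
        dtype = np.float64

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=dtype)
    elif isinstance(sample_weight, numbers.Number):
        sample_weight = np.full(n_samples, sample_weight, dtype=dtype)

    # lists and arrays end up here; entries that cannot be cast safely
    # to the float dtype make np.array raise a ValueError
    sample_weight = np.array(sample_weight, dtype=dtype)

    if sample_weight.ndim != 1:
        raise ValueError("Sample weights must be 1D array or scalar")
    if sample_weight.shape[0] != n_samples:
        raise ValueError("sample_weight.shape == {}, expected {}!"
                         .format(sample_weight.shape, (n_samples,)))
    return sample_weight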

