Commit fb169cd

rth authored and thomasjpfan committed
MAINT Common sample_weight validation (#14307)
1 parent 13b4bef commit fb169cd
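
This commit routes the sample_weight handling of several estimators through a single helper, _check_sample_weight(sample_weight, X, dtype=None) in sklearn.utils.validation. The helper's own definition (sklearn/utils/validation.py) is not part of this excerpt; the snippet below is only a rough sketch of its behaviour, inferred from the call sites and the new tests in sklearn/utils/tests/test_validation.py further down, and the name _check_sample_weight_sketch is made up here to avoid confusing it with the real function.

# Rough sketch only -- inferred from call sites and tests in this commit,
# not the actual implementation added in sklearn/utils/validation.py.
import numbers

import numpy as np

from sklearn.utils import check_array
from sklearn.utils.validation import _num_samples


def _check_sample_weight_sketch(sample_weight, X, dtype=None):
    """Return a validated 1D float array of per-sample weights for X."""
    n_samples = _num_samples(X)

    # Weights are always floating point; other dtypes fall back to float64.
    if dtype is not None and dtype not in [np.float32, np.float64]:
        dtype = np.float64

    if sample_weight is None or isinstance(sample_weight, numbers.Number):
        if sample_weight is None:
            # No weights given: every sample gets weight 1.
            sample_weight = np.ones(n_samples, dtype=dtype)
        else:
            # A scalar is broadcast to a constant weight for every sample.
            sample_weight = np.full(n_samples, sample_weight, dtype=dtype)
    else:
        if dtype is None:
            dtype = [np.float64, np.float32]
        # Validate as a C-contiguous 1D float array.
        sample_weight = check_array(sample_weight, accept_sparse=False,
                                    ensure_2d=False, dtype=dtype, order="C")
        if sample_weight.ndim != 1:
            raise ValueError("Sample weights must be 1D array or scalar")
        if sample_weight.shape != (n_samples,):
            raise ValueError("sample_weight.shape == {}, expected {}!"
                             .format(sample_weight.shape, (n_samples,)))
    return sample_weight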

File tree

11 files changed: +137 -71 lines changed

sklearn/cluster/k_means_.py

+17 -17

@@ -27,7 +27,7 @@
 from ..utils import check_array
 from ..utils import gen_batches
 from ..utils import check_random_state
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, _check_sample_weight
 from ..utils.validation import FLOAT_DTYPES
 from ..exceptions import ConvergenceWarning
 from . import _k_means
@@ -167,19 +167,19 @@ def _tolerance(X, tol):
     return np.mean(variances) * tol
 
 
-def _check_sample_weight(X, sample_weight):
+def _check_normalize_sample_weight(sample_weight, X):
     """Set sample_weight if None, and check for correct dtype"""
-    n_samples = X.shape[0]
-    if sample_weight is None:
-        return np.ones(n_samples, dtype=X.dtype)
-    else:
-        sample_weight = np.asarray(sample_weight)
-        if n_samples != len(sample_weight):
-            raise ValueError("n_samples=%d should be == len(sample_weight)=%d"
-                             % (n_samples, len(sample_weight)))
+
+    sample_weight_was_none = sample_weight is None
+
+    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
+    if not sample_weight_was_none:
         # normalize the weights to sum up to n_samples
+        # an array of 1 (i.e. samples_weight is None) is already normalized
+        n_samples = len(sample_weight)
         scale = n_samples / sample_weight.sum()
-        return (sample_weight * scale).astype(X.dtype, copy=False)
+        sample_weight *= scale
+    return sample_weight
 
 
 def k_means(X, n_clusters, sample_weight=None, init='k-means++',
@@ -437,7 +437,7 @@ def _kmeans_single_elkan(X, sample_weight, n_clusters, max_iter=300,
     if verbose:
         print('Initialization complete')
 
-    checked_sample_weight = _check_sample_weight(X, sample_weight)
+    checked_sample_weight = _check_normalize_sample_weight(sample_weight, X)
     centers, labels, n_iter = k_means_elkan(X, checked_sample_weight,
                                             n_clusters, centers, tol=tol,
                                             max_iter=max_iter, verbose=verbose)
@@ -522,7 +522,7 @@ def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300,
     """
     random_state = check_random_state(random_state)
 
-    sample_weight = _check_sample_weight(X, sample_weight)
+    sample_weight = _check_normalize_sample_weight(sample_weight, X)
 
     best_labels, best_inertia, best_centers = None, None, None
     # init
@@ -665,7 +665,7 @@ def _labels_inertia(X, sample_weight, x_squared_norms, centers,
         Sum of squared distances of samples to their closest cluster center.
     """
     n_samples = X.shape[0]
-    sample_weight = _check_sample_weight(X, sample_weight)
+    sample_weight = _check_normalize_sample_weight(sample_weight, X)
     # set the default value of centers to -1 to be able to detect any anomaly
     # easily
     labels = np.full(n_samples, -1, np.int32)
@@ -1495,7 +1495,7 @@ def fit(self, X, y=None, sample_weight=None):
             raise ValueError("n_samples=%d should be >= n_clusters=%d"
                              % (n_samples, self.n_clusters))
 
-        sample_weight = _check_sample_weight(X, sample_weight)
+        sample_weight = _check_normalize_sample_weight(sample_weight, X)
 
         n_init = self.n_init
         if hasattr(self.init, '__array__'):
@@ -1644,7 +1644,7 @@ def _labels_inertia_minibatch(self, X, sample_weight):
         """
         if self.verbose:
             print('Computing label assignment and total inertia')
-        sample_weight = _check_sample_weight(X, sample_weight)
+        sample_weight = _check_normalize_sample_weight(sample_weight, X)
        x_squared_norms = row_norms(X, squared=True)
        slices = gen_batches(X.shape[0], self.batch_size)
        results = [_labels_inertia(X[s], sample_weight[s], x_squared_norms[s],
@@ -1679,7 +1679,7 @@ def partial_fit(self, X, y=None, sample_weight=None):
         if n_samples == 0:
             return self
 
-        sample_weight = _check_sample_weight(X, sample_weight)
+        sample_weight = _check_normalize_sample_weight(sample_weight, X)
 
         x_squared_norms = row_norms(X, squared=True)
         self.random_state_ = getattr(self, "random_state_",
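
The renamed wrapper above keeps the k-means-specific normalization step: after the common validation, user-provided weights are rescaled so they sum to n_samples. A small usage sketch, assuming the private sklearn.cluster.k_means_ import path shown in the test diff below:

import numpy as np

from sklearn.cluster.k_means_ import _check_normalize_sample_weight

X = np.ones((3, 2))
sw = _check_normalize_sample_weight(np.array([1.0, 2.0, 3.0]), X)
# scale = n_samples / weights.sum() = 3 / 6 = 0.5
print(sw)        # [0.5 1.  1.5]
print(sw.sum())  # 3.0
# sample_weight=None yields np.ones(n_samples), which is already normalized.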

sklearn/cluster/tests/test_k_means.py

+6 -5

@@ -909,14 +909,15 @@ def test_sample_weight_length():
     # check that an error is raised when passing sample weights
     # with an incompatible shape
     km = KMeans(n_clusters=n_clusters, random_state=42)
-    assert_raises_regex(ValueError, r'len\(sample_weight\)', km.fit, X,
-                        sample_weight=np.ones(2))
+    msg = r'sample_weight.shape == \(2,\), expected \(100,\)'
+    with pytest.raises(ValueError, match=msg):
+        km.fit(X, sample_weight=np.ones(2))
 
 
-def test_check_sample_weight():
-    from sklearn.cluster.k_means_ import _check_sample_weight
+def test_check_normalize_sample_weight():
+    from sklearn.cluster.k_means_ import _check_normalize_sample_weight
     sample_weight = None
-    checked_sample_weight = _check_sample_weight(X, sample_weight)
+    checked_sample_weight = _check_normalize_sample_weight(sample_weight, X)
     assert _num_samples(X) == _num_samples(checked_sample_weight)
     assert_almost_equal(checked_sample_weight.sum(), _num_samples(X))
     assert X.dtype == checked_sample_weight.dtype

sklearn/linear_model/huber.py

+3 -6

@@ -8,8 +8,8 @@
 from ..base import BaseEstimator, RegressorMixin
 from .base import LinearModel
 from ..utils import check_X_y
-from ..utils import check_consistent_length
 from ..utils import axis0_safe_slice
+from ..utils.validation import _check_sample_weight
 from ..utils.extmath import safe_sparse_dot
 from ..utils.optimize import _check_optimize_result
 
@@ -255,11 +255,8 @@ def fit(self, X, y, sample_weight=None):
         X, y = check_X_y(
             X, y, copy=False, accept_sparse=['csr'], y_numeric=True,
             dtype=[np.float64, np.float32])
-        if sample_weight is not None:
-            sample_weight = np.array(sample_weight)
-            check_consistent_length(y, sample_weight)
-        else:
-            sample_weight = np.ones_like(y)
+
+        sample_weight = _check_sample_weight(sample_weight, X)
 
         if self.epsilon < 1.0:
             raise ValueError(

sklearn/linear_model/logistic.py

+4 -9

@@ -30,7 +30,7 @@
 from ..utils.fixes import logsumexp
 from ..utils.optimize import newton_cg, _check_optimize_result
 from ..utils.validation import check_X_y
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, _check_sample_weight
 from ..utils import deprecated
 from ..exceptions import ChangedBehaviorWarning
 from ..utils.multiclass import check_classification_targets
@@ -826,11 +826,8 @@ def _logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
     # If sample weights exist, convert them to array (support for lists)
     # and check length
     # Otherwise set them to 1 for all examples
-    if sample_weight is not None:
-        sample_weight = np.array(sample_weight, dtype=X.dtype, order='C')
-        check_consistent_length(y, sample_weight)
-    else:
-        sample_weight = np.ones(X.shape[0], dtype=X.dtype)
+    sample_weight = _check_sample_weight(sample_weight, X,
+                                         dtype=X.dtype)
 
     # If class_weights is a dict (provided by the user), the weights
     # are assigned to the original labels. If it is "balanced", then
@@ -1133,9 +1130,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
     y_test = y[test]
 
     if sample_weight is not None:
-        sample_weight = check_array(sample_weight, ensure_2d=False)
-        check_consistent_length(y, sample_weight)
-
+        sample_weight = _check_sample_weight(sample_weight, X)
         sample_weight = sample_weight[train]
 
     coefs, Cs, n_iter = _logistic_regression_path(
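
Passing dtype=X.dtype here keeps the validated weights in the same floating-point precision as X, while non-float dtypes fall back to float64, as the new tests further down exercise. A brief illustration, again assuming the private helper from sklearn.utils.validation:

import numpy as np

from sklearn.utils.validation import _check_sample_weight

X32 = np.ones((4, 3), dtype=np.float32)
sw = _check_sample_weight([1, 2, 3, 4], X32, dtype=X32.dtype)
print(sw.dtype)  # float32 -- matches X

X_int = np.ones((4, 3), dtype=int)
sw = _check_sample_weight(None, X_int, dtype=X_int.dtype)
print(sw.dtype)  # float64 -- integer dtypes are not kept for weights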

sklearn/linear_model/ransac.py

+2 -3

@@ -11,7 +11,7 @@
 from ..base import MultiOutputMixin
 from ..utils import check_random_state, check_array, check_consistent_length
 from ..utils.random import sample_without_replacement
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, _check_sample_weight
 from .base import LinearRegression
 from ..utils.validation import has_fit_parameter
 from ..exceptions import ConvergenceWarning
@@ -324,8 +324,7 @@ def fit(self, X, y, sample_weight=None):
             raise ValueError("%s does not support sample_weight. Samples"
                              " weights are only used for the calibration"
                              " itself." % estimator_name)
-        if sample_weight is not None:
-            sample_weight = np.asarray(sample_weight)
+        sample_weight = _check_sample_weight(sample_weight, X)
 
         n_inliers_best = 1
         score_best = -np.inf

sklearn/linear_model/ridge.py

+4 -5

@@ -27,6 +27,7 @@
 from ..utils import check_consistent_length
 from ..utils import compute_sample_weight
 from ..utils import column_or_1d
+from ..utils.validation import _check_sample_weight
 from ..preprocessing import LabelBinarizer
 from ..model_selection import GridSearchCV
 from ..metrics.scorer import check_scoring
@@ -428,8 +429,7 @@ def _ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
                          " %d != %d" % (n_samples, n_samples_))
 
     if has_sw:
-        if np.atleast_1d(sample_weight).ndim > 1:
-            raise ValueError("Sample weights must be 1D array or scalar")
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
         if solver not in ['sag', 'saga']:
             # SAG supports sample_weight directly. For other solvers,
@@ -1424,9 +1424,8 @@ def fit(self, X, y, sample_weight=None):
                 "alphas must be positive. Got {} containing some "
                 "negative or null value instead.".format(self.alphas))
 
-        if sample_weight is not None and not isinstance(sample_weight, float):
-            sample_weight = check_array(sample_weight, ensure_2d=False,
-                                        dtype=X.dtype)
+        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
+
         n_samples, n_features = X.shape
 
         X, y, X_offset, y_offset, X_scale = LinearModel._preprocess_data(

sklearn/linear_model/sag.py

+2 -2

@@ -12,6 +12,7 @@
 from .sag_fast import sag32, sag64
 from ..exceptions import ConvergenceWarning
 from ..utils import check_array
+from ..utils.validation import _check_sample_weight
 from ..utils.extmath import row_norms
 
 
@@ -251,8 +252,7 @@ def sag_solver(X, y, sample_weight=None, loss='log', alpha=1., beta=0.,
     n_classes = int(y.max()) + 1 if loss == 'multinomial' else 1
 
     # initialization
-    if sample_weight is None:
-        sample_weight = np.ones(n_samples, dtype=X.dtype, order='C')
+    sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
 
     if 'coef' in warm_start_mem.keys():
         coef_init = warm_start_mem['coef']

sklearn/linear_model/stochastic_gradient.py

+4 -17

@@ -18,7 +18,7 @@
 from ..utils import check_array, check_random_state, check_X_y
 from ..utils.extmath import safe_sparse_dot
 from ..utils.multiclass import _check_partial_fit_first_call
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, _check_sample_weight
 from ..exceptions import ConvergenceWarning
 from ..model_selection import StratifiedShuffleSplit, ShuffleSplit
 
@@ -169,19 +169,6 @@ def _get_penalty_type(self, penalty):
         except KeyError:
             raise ValueError("Penalty %s is not supported. " % penalty)
 
-    def _validate_sample_weight(self, sample_weight, n_samples):
-        """Set the sample weight array."""
-        if sample_weight is None:
-            # uniform sample weights
-            sample_weight = np.ones(n_samples, dtype=np.float64, order='C')
-        else:
-            # user-provided array
-            sample_weight = np.asarray(sample_weight, dtype=np.float64,
-                                       order="C")
-        if sample_weight.shape[0] != n_samples:
-            raise ValueError("Shapes of X and sample_weight do not match.")
-        return sample_weight
-
     def _allocate_parameter_mem(self, n_classes, n_features, coef_init=None,
                                 intercept_init=None):
         """Allocate mem for parameters; initialize if provided."""
@@ -488,7 +475,7 @@ def _partial_fit(self, X, y, alpha, C,
         # Allocate datastructures from input arguments
         self._expanded_class_weight = compute_class_weight(self.class_weight,
                                                            self.classes_, y)
-        sample_weight = self._validate_sample_weight(sample_weight, n_samples)
+        sample_weight = _check_sample_weight(sample_weight, X)
 
         if getattr(self, "coef_", None) is None or coef_init is not None:
             self._allocate_parameter_mem(n_classes, n_features,
@@ -1095,9 +1082,9 @@ def _partial_fit(self, X, y, alpha, C, loss, learning_rate,
 
         n_samples, n_features = X.shape
 
-        # Allocate datastructures from input arguments
-        sample_weight = self._validate_sample_weight(sample_weight, n_samples)
+        sample_weight = _check_sample_weight(sample_weight, X)
 
+        # Allocate datastructures from input arguments
         if getattr(self, "coef_", None) is None:
             self._allocate_parameter_mem(1, n_features, coef_init,
                                          intercept_init)

sklearn/svm/base.py

+5 -6

@@ -8,11 +8,12 @@
 from ..base import BaseEstimator, ClassifierMixin
 from ..preprocessing import LabelEncoder
 from ..utils.multiclass import _ovr_decision_function
-from ..utils import check_array, check_consistent_length, check_random_state
+from ..utils import check_array, check_random_state
 from ..utils import column_or_1d, check_X_y
 from ..utils import compute_class_weight
 from ..utils.extmath import safe_sparse_dot
 from ..utils.validation import check_is_fitted, _check_large_sparse
+from ..utils.validation import _check_sample_weight
 from ..utils.multiclass import check_classification_targets
 from ..exceptions import ConvergenceWarning
 from ..exceptions import NotFittedError
@@ -906,11 +907,9 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
     # LibLinear wants targets as doubles, even for classification
     y_ind = np.asarray(y_ind, dtype=np.float64).ravel()
     y_ind = np.require(y_ind, requirements="W")
-    if sample_weight is None:
-        sample_weight = np.ones(X.shape[0])
-    else:
-        sample_weight = np.array(sample_weight, dtype=np.float64, order='C')
-        check_consistent_length(sample_weight, X)
+
+    sample_weight = _check_sample_weight(sample_weight, X,
+                                         dtype=np.float64)
 
     solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
     raw_coef_, n_iter_ = liblinear.train_wrap(

sklearn/utils/tests/test_validation.py

+40 -1

@@ -20,6 +20,7 @@
 from sklearn.utils.testing import SkipTest
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_allclose_dense_sparse
+from sklearn.utils.testing import assert_allclose
 from sklearn.utils import as_float_array, check_array, check_symmetric
 from sklearn.utils import check_X_y
 from sklearn.utils import deprecated
@@ -39,7 +40,8 @@
                                    check_memory,
                                    check_non_negative,
                                    _num_samples,
-                                   check_scalar)
+                                   check_scalar,
+                                   _check_sample_weight)
 import sklearn
 
 from sklearn.exceptions import NotFittedError
@@ -853,3 +855,40 @@ def test_check_scalar_invalid(x, target_name, target_type, min_val, max_val,
                               min_val=min_val, max_val=max_val)
     assert str(raised_error.value) == str(err_msg)
     assert type(raised_error.value) == type(err_msg)
+
+
+def test_check_sample_weight():
+    # check array order
+    sample_weight = np.ones(10)[::2]
+    assert not sample_weight.flags["C_CONTIGUOUS"]
+    sample_weight = _check_sample_weight(sample_weight, X=np.ones((5, 1)))
+    assert sample_weight.flags["C_CONTIGUOUS"]
+
+    # check None input
+    sample_weight = _check_sample_weight(None, X=np.ones((5, 2)))
+    assert_allclose(sample_weight, np.ones(5))
+
+    # check numbers input
+    sample_weight = _check_sample_weight(2.0, X=np.ones((5, 2)))
+    assert_allclose(sample_weight, 2 * np.ones(5))
+
+    # check wrong number of dimensions
+    with pytest.raises(ValueError,
+                       match="Sample weights must be 1D array or scalar"):
+        _check_sample_weight(np.ones((2, 4)), X=np.ones((2, 2)))
+
+    # check incorrect n_samples
+    msg = r"sample_weight.shape == \(4,\), expected \(2,\)!"
+    with pytest.raises(ValueError, match=msg):
+        _check_sample_weight(np.ones(4), X=np.ones((2, 2)))
+
+    # float32 dtype is preserved
+    X = np.ones((5, 2))
+    sample_weight = np.ones(5, dtype=np.float32)
+    sample_weight = _check_sample_weight(sample_weight, X)
+    assert sample_weight.dtype == np.float32
+
+    # int dtype will be converted to float64 instead
+    X = np.ones((5, 2), dtype=np.int)
+    sample_weight = _check_sample_weight(None, X, dtype=X.dtype)
+    assert sample_weight.dtype == np.float64
