Merge pull request #4881 from sonnyhu/weighted_least_squares

GaelVaroquaux · GaelVaroquaux · commit 190abde442a8 · 2015-08-30T16:35:00.000+01:00
[MRG + 1] add sample_weight into LinearRegression
diff --git a/sklearn/ensemble/tests/test_weight_boosting.py b/sklearn/ensemble/tests/test_weight_boosting.py
@@ -286,13 +286,13 @@ def test_base_estimator():
 
 
 def test_sample_weight_missing():
-    from sklearn.linear_model import LinearRegression
+    from sklearn.linear_model import LogisticRegression
     from sklearn.cluster import KMeans
 
-    clf = AdaBoostClassifier(LinearRegression(), algorithm="SAMME")
+    clf = AdaBoostClassifier(LogisticRegression(), algorithm="SAMME")
     assert_raises(ValueError, clf.fit, X, y_regr)
 
-    clf = AdaBoostRegressor(LinearRegression())
+    clf = AdaBoostRegressor(LogisticRegression())
     assert_raises(ValueError, clf.fit, X, y_regr)
 
     clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
diff --git a/sklearn/linear_model/base.py b/sklearn/linear_model/base.py
@@ -25,7 +25,7 @@
 from ..externals import six
 from ..externals.joblib import Parallel, delayed
 from ..base import BaseEstimator, ClassifierMixin, RegressorMixin
-from ..utils import as_float_array, check_array, check_X_y, deprecated
+from ..utils import as_float_array, check_array, check_X_y, deprecated, column_or_1d
 from ..utils.extmath import safe_sparse_dot
 from ..utils.sparsefuncs import mean_variance_axis, inplace_column_scale
 from ..utils.fixes import sparse_lsqr
@@ -113,6 +113,18 @@ def center_data(X, y, fit_intercept, normalize=False, copy=True,
     return X, y, X_mean, y_mean, X_std
 
 
+def _rescale_data(X, y, sample_weight):
+    """Scale rows of X and y by sqrt(sample_weight) for weighted fits."""
+    n_rows = X.shape[0]
+    # Multiplying by ones broadcasts a scalar weight to one entry per row.
+    root_weights = np.sqrt(sample_weight * np.ones(n_rows))
+    row_scaler = sparse.dia_matrix((root_weights, 0),
+                                   shape=(n_rows, n_rows))
+    scaled_X = safe_sparse_dot(row_scaler, X)
+    scaled_y = safe_sparse_dot(row_scaler, y)
+    return scaled_X, scaled_y
+
+
 class LinearModel(six.with_metaclass(ABCMeta, BaseEstimator)):
     """Base class for Linear Models"""
 
@@ -352,7 +364,7 @@ def __init__(self, fit_intercept=True, normalize=False, copy_X=True,
         self.copy_X = copy_X
         self.n_jobs = n_jobs
 
-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
         """
         Fit linear model.
 
@@ -364,16 +376,28 @@ def fit(self, X, y):
         y : numpy array of shape [n_samples, n_targets]
             Target values
 
+        sample_weight : float or numpy array of shape [n_samples], optional
+            Individual weights for each sample; a scalar weights all equally
+
         Returns
         -------
         self : returns an instance of self.
         """
+
         n_jobs_ = self.n_jobs
         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                          y_numeric=True, multi_output=True)
+        
+        if ((sample_weight is not None) and np.atleast_1d(sample_weight).ndim > 1):
+            sample_weight = column_or_1d(sample_weight, warn=True)
 
         X, y, X_mean, y_mean, X_std = self._center_data(
-            X, y, self.fit_intercept, self.normalize, self.copy_X)
+            X, y, self.fit_intercept, self.normalize, self.copy_X,
+            sample_weight=sample_weight)
+
+        if sample_weight is not None:
+            # Sample weight can be implemented via a simple rescaling.
+            X, y = _rescale_data(X, y, sample_weight)
 
         if sp.issparse(X):
             if y.ndim < 2:
diff --git a/sklearn/linear_model/ridge.py b/sklearn/linear_model/ridge.py
@@ -17,7 +17,7 @@
 from scipy import sparse
 from scipy.sparse import linalg as sp_linalg
 
-from .base import LinearClassifierMixin, LinearModel
+from .base import LinearClassifierMixin, LinearModel, _rescale_data
 from ..base import RegressorMixin
 from ..utils.extmath import safe_sparse_dot
 from ..utils import check_X_y
@@ -185,17 +185,6 @@ def _solve_svd(X, y, alpha):
     return np.dot(Vt.T, d_UT_y).T
 
 
-def _rescale_data(X, y, sample_weight):
-    """Rescale data so as to support sample_weight"""
-    n_samples = X.shape[0]
-    sample_weight = sample_weight * np.ones(n_samples)
-    sample_weight = np.sqrt(sample_weight)
-    sw_matrix = sparse.dia_matrix((sample_weight, 0),
-                                  shape=(n_samples, n_samples))
-    X = safe_sparse_dot(sw_matrix, X)
-    y = safe_sparse_dot(sw_matrix, y)
-    return X, y
-
 
 def ridge_regression(X, y, alpha, sample_weight=None, solver='auto',
                      max_iter=None, tol=1e-3, verbose=0):
diff --git a/sklearn/linear_model/tests/test_base.py b/sklearn/linear_model/tests/test_base.py
@@ -10,8 +10,10 @@
 from sklearn.utils.testing import assert_equal
 
 from sklearn.linear_model.base import LinearRegression
-from sklearn.linear_model.base import center_data, sparse_center_data
+from sklearn.linear_model.base import center_data, sparse_center_data, _rescale_data
 from sklearn.utils import check_random_state
+from sklearn.utils.testing import assert_raise_message
+from sklearn.utils.testing import assert_greater
 from sklearn.datasets.samples_generator import make_sparse_uncorrelated
 from sklearn.datasets.samples_generator import make_regression
 
@@ -40,6 +42,55 @@ def test_linear_regression():
     assert_array_almost_equal(clf.predict(X), [0])
 
 
+def test_linear_regression_sample_weights():
+    rng = np.random.RandomState(0)
+
+    for n_samples, n_features in ((6, 5), (5, 10)):
+        y = rng.randn(n_samples)
+        X = rng.randn(n_samples, n_features)
+        sample_weight = 1.0 + rng.rand(n_samples)
+
+        clf = LinearRegression()
+        clf.fit(X, y, sample_weight)
+
+        assert_equal(clf.coef_.shape, (X.shape[1], ))
+        assert_greater(clf.score(X, y), 0.9)
+        assert_array_almost_equal(clf.predict(X), y)
+
+        # Without an intercept, a weighted square loss is equivalent to
+        # fitting rows of X and y rescaled by sqrt(sample_weight).
+        scaled_y = y * np.sqrt(sample_weight)
+        scaled_X = X * np.sqrt(sample_weight)[:, np.newaxis]
+        clf = LinearRegression(fit_intercept=False)
+        coefs1 = clf.fit(X, y, sample_weight).coef_
+        coefs2 = clf.fit(scaled_X, scaled_y).coef_
+
+        assert_array_almost_equal(coefs1, coefs2)
+
+
+def test_raises_value_error_if_sample_weights_greater_than_1d():
+    # Sample weights must be either scalar or 1D
+    rng = np.random.RandomState(42)
+
+    for n_samples, n_features in ((2, 3), (3, 2)):
+        X = rng.randn(n_samples, n_features)
+        y = rng.randn(n_samples)
+        sample_weights_OK = rng.randn(n_samples) ** 2 + 1
+        sample_weights_OK_1 = 1.
+        sample_weights_OK_2 = 2.
+        sample_weights_not_OK = sample_weights_OK[np.newaxis, :]
+
+        clf = LinearRegression()
+        # make sure the "OK" sample weights actually work
+        clf.fit(X, y, sample_weights_OK)
+        clf.fit(X, y, sample_weights_OK_1)
+        clf.fit(X, y, sample_weights_OK_2)
+
+        # a (1, n_samples) row vector is not a column and must be rejected
+        assert_raise_message(ValueError, "bad input shape",
+                             clf.fit, X, y, sample_weights_not_OK)
+
+
 def test_fit_intercept():
     # Test assertions on betas shape.
     X2 = np.array([[0.38349978, 0.61650022],
@@ -255,3 +306,19 @@ def test_csr_sparse_center_data():
     csr = sparse.csr_matrix(X)
     csr_, y, _, _, _ = sparse_center_data(csr, y, True)
     assert_equal(csr_.getformat(), 'csr')
+
+
+def test_rescale_data():
+    # _rescale_data must scale each row of X and y by sqrt(sample_weight)
+    n_samples, n_features = 200, 2
+
+    rng = np.random.RandomState(0)
+    sample_weight = 1.0 + rng.rand(n_samples)
+    X = rng.rand(n_samples, n_features)
+    y = rng.rand(n_samples)
+    sqrt_weight = np.sqrt(sample_weight)
+    expected_X, expected_y = X * sqrt_weight[:, np.newaxis], y * sqrt_weight
+    actual_X, actual_y = _rescale_data(X, y, sample_weight)
+    assert_array_almost_equal(actual_X, expected_X)
+    assert_array_almost_equal(actual_y, expected_y)
+