
[MRG+1] Patch liblinear for sample_weights in LogisticRegression (and CV) #5274

Merged
merged 3 commits into from Oct 23, 2015
1 change: 1 addition & 0 deletions doc/whats_new.rst
@@ -250,6 +250,7 @@ Enhancements

- Added ``sample_weight`` support to :class:`linear_model.LogisticRegression` for
the ``lbfgs``, ``newton-cg``, and ``sag`` solvers. By `Valentin Stolbunov`_.
Support added to the ``liblinear`` solver. By `Manoj Kumar`_.

- Added optional parameter ``presort`` to :class:`ensemble.GradientBoostingRegressor`
and :class:`ensemble.GradientBoostingClassifier`, keeping default behavior
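A hedged usage sketch of the behavior this changelog entry describes (illustration only, not part of the diff): with the patch, the ``liblinear`` solver accepts per-sample weights directly in ``fit``.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=50, n_features=5, random_state=0)
sample_weight = np.where(y == 1, 2.0, 1.0)  # up-weight the positive class

clf = LogisticRegression(solver='liblinear')
clf.fit(X, y, sample_weight=sample_weight)  # raised ValueError before this patch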
8 changes: 0 additions & 8 deletions sklearn/ensemble/bagging.py
@@ -17,7 +17,6 @@
from ..externals.six.moves import zip
from ..metrics import r2_score, accuracy_score
from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
from ..linear_model import LogisticRegression
from ..utils import check_random_state, check_X_y, check_array, column_or_1d
from ..utils.random import sample_without_replacement
from ..utils.validation import has_fit_parameter, check_is_fitted
@@ -54,13 +53,6 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
bootstrap_features = ensemble.bootstrap_features
support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
"sample_weight")
# Logistic regression does not support sample weights with liblinear
# TODO: Remove this check when liblinear is patched to support
# sample weights
if (isinstance(ensemble.base_estimator_, LogisticRegression) and
(ensemble.base_estimator_.solver == 'liblinear')):
support_sample_weight = False

if not support_sample_weight and sample_weight is not None:
raise ValueError("The base estimator doesn't support sample weight")

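With the special case gone, bagging relies on ``has_fit_parameter`` alone, so weighted bootstraps reach liblinear-backed base estimators. A minimal sketch of that behavior (an illustration, not part of the diff):

from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, random_state=0)
# support_sample_weight is now True for this base estimator, so each
# bootstrap can be expressed as per-sample counts passed via sample_weight.
bagging = BaggingClassifier(LogisticRegression(solver='liblinear'),
                            n_estimators=5, random_state=0)
bagging.fit(X, y)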
3 changes: 0 additions & 3 deletions sklearn/ensemble/tests/test_weight_boosting.py
@@ -290,9 +290,6 @@ def test_sample_weight_missing():
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

clf = AdaBoostClassifier(LogisticRegression(), algorithm="SAMME")
assert_raises(ValueError, clf.fit, X, y_regr)

clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
assert_raises(ValueError, clf.fit, X, y_regr)

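The deleted assertion covered the old failure mode: SAMME re-weights samples at every boosting round, and the default ``liblinear`` solver used to reject those weights. A hedged sketch of the usage that now succeeds (not part of the diff):

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, random_state=0)
clf = AdaBoostClassifier(LogisticRegression(), algorithm="SAMME")
clf.fit(X, y)  # previously raised because liblinear refused sample_weight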
20 changes: 9 additions & 11 deletions sklearn/linear_model/logistic.py
@@ -399,7 +399,7 @@ def hessp(v):
return grad, hessp


def _check_solver_option(solver, multi_class, penalty, dual, sample_weight):
def _check_solver_option(solver, multi_class, penalty, dual):
if solver not in ['liblinear', 'newton-cg', 'lbfgs', 'sag']:
raise ValueError("Logistic Regression supports only liblinear,"
" newton-cg, lbfgs and sag solvers, got %s" % solver)
@@ -420,10 +420,6 @@ def _check_solver_option(solver, multi_class, penalty, dual, sample_weight):
raise ValueError("Solver %s supports only "
"dual=False, got dual=%s" % (solver, dual))

if solver == 'liblinear' and sample_weight is not None:
raise ValueError("Solver %s does not support "
"sample weights." % solver)


def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
max_iter=100, tol=1e-4, verbose=0,
@@ -567,7 +563,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
if isinstance(Cs, numbers.Integral):
Cs = np.logspace(-4, 4, Cs)

_check_solver_option(solver, multi_class, penalty, dual, sample_weight)
_check_solver_option(solver, multi_class, penalty, dual)

# Preprocessing.
if check_input or copy:
@@ -712,7 +708,8 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
elif solver == 'liblinear':
coef_, intercept_, n_iter_i, = _fit_liblinear(
X, target, C, fit_intercept, intercept_scaling, class_weight,
penalty, dual, verbose, max_iter, tol, random_state)
penalty, dual, verbose, max_iter, tol, random_state,
sample_weight=sample_weight)
if fit_intercept:
w0 = np.concatenate([coef_.ravel(), intercept_])
else:
@@ -864,7 +861,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
n_iter : array, shape(n_cs,)
Actual number of iteration for each Cs.
"""
_check_solver_option(solver, multi_class, penalty, dual, sample_weight)
_check_solver_option(solver, multi_class, penalty, dual)

X_train = X[train]
X_test = X[test]
@@ -1134,13 +1131,14 @@ def fit(self, X, y, sample_weight=None):
n_samples, n_features = X.shape

_check_solver_option(self.solver, self.multi_class, self.penalty,
self.dual, sample_weight)
self.dual)

if self.solver == 'liblinear':
self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
X, y, self.C, self.fit_intercept, self.intercept_scaling,
self.class_weight, self.penalty, self.dual, self.verbose,
self.max_iter, self.tol, self.random_state)
self.max_iter, self.tol, self.random_state,
sample_weight=sample_weight)
self.n_iter_ = np.array([n_iter_])
return self

@@ -1482,7 +1480,7 @@ def fit(self, X, y, sample_weight=None):
Returns self.
"""
_check_solver_option(self.solver, self.multi_class, self.penalty,
self.dual, sample_weight)
self.dual)

if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:
raise ValueError("Maximum number of iteration must be positive;"
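Taken together, these changes route ``sample_weight`` into ``_fit_liblinear`` instead of rejecting it in ``_check_solver_option``. A hedged sketch of the resulting public behavior for both estimators (illustration only, not part of the diff):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

X, y = make_classification(n_samples=60, n_features=5, random_state=0)
rng = np.random.RandomState(0)
sample_weight = rng.uniform(0.5, 2.0, size=y.shape[0])

# Both the plain and the cross-validated estimator now accept weights
# with solver='liblinear'.
LogisticRegression(solver='liblinear').fit(X, y, sample_weight=sample_weight)
LogisticRegressionCV(solver='liblinear').fit(X, y, sample_weight=sample_weight)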
65 changes: 46 additions & 19 deletions sklearn/linear_model/tests/test_logistic.py
@@ -574,20 +574,20 @@ def test_logistic_regressioncv_class_weights():
def test_logistic_regression_sample_weights():
X, y = make_classification(n_samples=20, n_features=5, n_informative=3,
n_classes=2, random_state=0)
sample_weight = np.ones(y.shape[0])
sample_weight[y == 1] = 2

for LR in [LogisticRegression, LogisticRegressionCV]:
# Test that liblinear fails when sample weights are provided
clf_lib = LR(solver='liblinear')
assert_raises(ValueError, clf_lib.fit, X, y,
sample_weight=np.ones(y.shape[0]))

# Test that passing sample_weight as ones is the same as
# not passing them at all (default None)
clf_sw_none = LR(solver='lbfgs', fit_intercept=False)
clf_sw_none.fit(X, y)
clf_sw_ones = LR(solver='lbfgs', fit_intercept=False)
clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
assert_array_almost_equal(clf_sw_none.coef_, clf_sw_ones.coef_, decimal=4)
for solver in ['lbfgs', 'liblinear']:
clf_sw_none = LR(solver=solver, fit_intercept=False)
clf_sw_none.fit(X, y)
clf_sw_ones = LR(solver=solver, fit_intercept=False)
clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
assert_array_almost_equal(
clf_sw_none.coef_, clf_sw_ones.coef_, decimal=4)

# Test that sample weights work the same with the lbfgs,
# newton-cg, and 'sag' solvers
@@ -598,20 +598,47 @@ def test_logistic_regression_sample_weights():
clf_sw_sag = LR(solver='sag', fit_intercept=False,
max_iter=2000, tol=1e-7)
clf_sw_sag.fit(X, y, sample_weight=y + 1)
assert_array_almost_equal(clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
assert_array_almost_equal(clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)
clf_sw_liblinear = LR(solver='liblinear', fit_intercept=False,
max_iter=50, tol=1e-7)
clf_sw_liblinear.fit(X, y, sample_weight=y + 1)
assert_array_almost_equal(
clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
assert_array_almost_equal(
clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)
assert_array_almost_equal(
clf_sw_lbfgs.coef_, clf_sw_liblinear.coef_, decimal=4)

# Test that passing class_weight as [1,2] is the same as
# passing class weight = [1,1] but adjusting sample weights
# to be 2 for all instances of class 1
clf_cw_12 = LR(solver='lbfgs', fit_intercept=False,
class_weight={0: 1, 1: 2})
clf_cw_12.fit(X, y)
sample_weight = np.ones(y.shape[0])
sample_weight[y == 1] = 2
clf_sw_12 = LR(solver='lbfgs', fit_intercept=False)
clf_sw_12.fit(X, y, sample_weight=sample_weight)
assert_array_almost_equal(clf_cw_12.coef_, clf_sw_12.coef_, decimal=4)
for solver in ['lbfgs', 'liblinear']:
clf_cw_12 = LR(solver=solver, fit_intercept=False,
class_weight={0: 1, 1: 2})
clf_cw_12.fit(X, y)
clf_sw_12 = LR(solver=solver, fit_intercept=False)
clf_sw_12.fit(X, y, sample_weight=sample_weight)
assert_array_almost_equal(
clf_cw_12.coef_, clf_sw_12.coef_, decimal=4)

# Test the above for l1 penalty and l2 penalty with dual=True,
# since the patched liblinear code is different.
clf_cw = LogisticRegression(
solver="liblinear", fit_intercept=False, class_weight={0: 1, 1: 2},
penalty="l1")
clf_cw.fit(X, y)
clf_sw = LogisticRegression(
solver="liblinear", fit_intercept=False, penalty="l1")
clf_sw.fit(X, y, sample_weight)
assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)

clf_cw = LogisticRegression(
solver="liblinear", fit_intercept=False, class_weight={0: 1, 1: 2},
penalty="l2", dual=True)
clf_cw.fit(X, y)
clf_sw = LogisticRegression(
solver="liblinear", fit_intercept=False, penalty="l2", dual=True)
clf_sw.fit(X, y, sample_weight)
assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)


def _compute_class_weight_dictionary(y):
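Every new assertion rests on the same property: giving class k a ``class_weight`` of w must produce the same fit as multiplying the ``sample_weight`` of each class-k sample by w. A minimal standalone sketch of that check (variable names are local to this example):

import numpy as np
from numpy.testing import assert_allclose
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=20, n_features=5, n_informative=3,
                           n_classes=2, random_state=0)
sw = np.ones(y.shape[0])
sw[y == 1] = 2

clf_cw = LogisticRegression(solver='liblinear', fit_intercept=False,
                            class_weight={0: 1, 1: 2}).fit(X, y)
clf_sw = LogisticRegression(solver='liblinear',
                            fit_intercept=False).fit(X, y, sample_weight=sw)
assert_allclose(clf_cw.coef_, clf_sw.coef_, atol=1e-4)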
16 changes: 13 additions & 3 deletions sklearn/svm/base.py
@@ -10,7 +10,8 @@
from ..base import BaseEstimator, ClassifierMixin
from ..preprocessing import LabelEncoder
from ..multiclass import _ovr_decision_function
from ..utils import check_array, check_random_state, column_or_1d, check_X_y
from ..utils import check_array, check_consistent_length, check_random_state
from ..utils import column_or_1d, check_X_y
from ..utils import compute_class_weight, deprecated
from ..utils.extmath import safe_sparse_dot
from ..utils.validation import check_is_fitted
@@ -765,7 +766,8 @@ def _get_liblinear_solver_type(multi_class, penalty, loss, dual):
def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
penalty, dual, verbose, max_iter, tol,
random_state=None, multi_class='ovr',
loss='logistic_regression', epsilon=0.1):
loss='logistic_regression', epsilon=0.1,
sample_weight=None):
"""Used by Logistic Regression (and CV) and LinearSVC.

Preprocessing is done in this function before supplying it to liblinear.
@@ -840,6 +842,8 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
that the value of this parameter depends on the scale of the target
variable y. If unsure, set epsilon=0.

sample_weight : array-like, optional
Weights assigned to each sample.

Returns
-------
@@ -886,11 +890,17 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,

# LibLinear wants targets as doubles, even for classification
y_ind = np.asarray(y_ind, dtype=np.float64).ravel()
if sample_weight is None:
sample_weight = np.ones(X.shape[0])
else:
sample_weight = np.array(sample_weight, dtype=np.float64, order='C')
check_consistent_length(sample_weight, X)

solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
raw_coef_, n_iter_ = liblinear.train_wrap(
X, y_ind, sp.isspmatrix(X), solver_type, tol, bias, C,
class_weight_, max_iter, rnd.randint(np.iinfo('i').max),
epsilon)
epsilon, sample_weight)
# Regarding rnd.randint(..) in the above signature:
# seed for srand in range [0..INT_MAX); due to limitations in Numpy
# on 32-bit platforms, we can't get to the UINT_MAX limit that
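The defaulting logic added to ``_fit_liblinear`` is small but easy to get wrong when porting; this standalone sketch restates it with a hypothetical helper name (not a function that exists in the library):

import numpy as np

def _as_liblinear_sample_weight(sample_weight, n_samples):
    """Return a C-contiguous float64 weight vector, defaulting to all ones."""
    if sample_weight is None:
        return np.ones(n_samples)
    sample_weight = np.array(sample_weight, dtype=np.float64, order='C')
    if sample_weight.shape[0] != n_samples:
        raise ValueError("sample_weight and X have inconsistent lengths")
    return sample_weight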