
[MRG+1] Patch liblinear for sample_weights in LogisticRegression (and CV) #5274

Merged
merged 3 commits into from Oct 23, 2015
1 change: 1 addition & 0 deletions doc/whats_new.rst
@@ -250,6 +250,7 @@ Enhancements

- Added ``sample_weight`` support to :class:`linear_model.LogisticRegression` for
the ``lbfgs``, ``newton-cg``, and ``sag`` solvers. By `Valentin Stolbunov`_.
Support added to the ``liblinear`` solver. By `Manoj Kumar`_.

- Added optional parameter ``presort`` to :class:`ensemble.GradientBoostingRegressor`
and :class:`ensemble.GradientBoostingClassifier`, keeping default behavior
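A hedged usage sketch of the behavior this changelog entry describes (illustration only, not part of the diff): with the patch, the ``liblinear`` solver accepts per-sample weights directly in ``fit``.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=50, n_features=5, random_state=0)
sample_weight = np.where(y == 1, 2.0, 1.0)  # up-weight the positive class

clf = LogisticRegression(solver='liblinear')
clf.fit(X, y, sample_weight=sample_weight)  # raised ValueError before this patch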
8 changes: 0 additions & 8 deletions sklearn/ensemble/bagging.py
@@ -17,7 +17,6 @@
from ..externals.six.moves import zip
from ..metrics import r2_score, accuracy_score
from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
from ..linear_model import LogisticRegression
from ..utils import check_random_state, check_X_y, check_array, column_or_1d
from ..utils.random import sample_without_replacement
from ..utils.validation import has_fit_parameter, check_is_fitted
@@ -54,13 +53,6 @@ def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
bootstrap_features = ensemble.bootstrap_features
support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
"sample_weight")
# Logistic regression does not support sample weights with liblinear
# TODO: Remove this check when liblinear is patched to support
# sample weights
if (isinstance(ensemble.base_estimator_, LogisticRegression) and
(ensemble.base_estimator_.solver == 'liblinear')):
support_sample_weight = False

if not support_sample_weight and sample_weight is not None:
raise ValueError("The base estimator doesn't support sample weight")

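With the special case gone, bagging relies on ``has_fit_parameter`` alone, so weighted bootstraps reach liblinear-backed base estimators. A minimal sketch of that behavior (an illustration, not part of the diff):

from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, random_state=0)
# support_sample_weight is now True for this base estimator, so each
# bootstrap can be expressed as per-sample counts passed via sample_weight.
bagging = BaggingClassifier(LogisticRegression(solver='liblinear'),
                            n_estimators=5, random_state=0)
bagging.fit(X, y)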
3 changes: 0 additions & 3 deletions sklearn/ensemble/tests/test_weight_boosting.py
@@ -290,9 +290,6 @@ def test_sample_weight_missing():
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans

clf = AdaBoostClassifier(LogisticRegression(), algorithm="SAMME")
assert_raises(ValueError, clf.fit, X, y_regr)

clf = AdaBoostClassifier(KMeans(), algorithm="SAMME")
assert_raises(ValueError, clf.fit, X, y_regr)

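The deleted assertion covered the old failure mode: SAMME re-weights samples at every boosting round, and the default ``liblinear`` solver used to reject those weights. A hedged sketch of the usage that now succeeds (not part of the diff):

from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, random_state=0)
clf = AdaBoostClassifier(LogisticRegression(), algorithm="SAMME")
clf.fit(X, y)  # previously raised because liblinear refused sample_weight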
20 changes: 9 additions & 11 deletions sklearn/linear_model/logistic.py
@@ -399,7 +399,7 @@ def hessp(v):
return grad, hessp


def _check_solver_option(solver, multi_class, penalty, dual, sample_weight):
def _check_solver_option(solver, multi_class, penalty, dual):
if solver not in ['liblinear', 'newton-cg', 'lbfgs', 'sag']:
raise ValueError("Logistic Regression supports only liblinear,"
" newton-cg, lbfgs and sag solvers, got %s" % solver)
@@ -420,10 +420,6 @@ def _check_solver_option(solver, multi_class, penalty, dual, sample_weight):
raise ValueError("Solver %s supports only "
"dual=False, got dual=%s" % (solver, dual))

if solver == 'liblinear' and sample_weight is not None:
raise ValueError("Solver %s does not support "
"sample weights." % solver)


def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
max_iter=100, tol=1e-4, verbose=0,
@@ -567,7 +563,7 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
if isinstance(Cs, numbers.Integral):
Cs = np.logspace(-4, 4, Cs)

_check_solver_option(solver, multi_class, penalty, dual, sample_weight)
_check_solver_option(solver, multi_class, penalty, dual)

# Preprocessing.
if check_input or copy:
@@ -712,7 +708,8 @@ def logistic_regression_path(X, y, pos_class=None, Cs=10, fit_intercept=True,
elif solver == 'liblinear':
coef_, intercept_, n_iter_i, = _fit_liblinear(
X, target, C, fit_intercept, intercept_scaling, class_weight,
penalty, dual, verbose, max_iter, tol, random_state)
penalty, dual, verbose, max_iter, tol, random_state,
sample_weight=sample_weight)
if fit_intercept:
w0 = np.concatenate([coef_.ravel(), intercept_])
else:
@@ -864,7 +861,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
n_iter : array, shape(n_cs,)
Actual number of iteration for each Cs.
"""
_check_solver_option(solver, multi_class, penalty, dual, sample_weight)
_check_solver_option(solver, multi_class, penalty, dual)

X_train = X[train]
X_test = X[test]
@@ -1134,13 +1131,14 @@ def fit(self, X, y, sample_weight=None):
n_samples, n_features = X.shape

_check_solver_option(self.solver, self.multi_class, self.penalty,
self.dual, sample_weight)
self.dual)

if self.solver == 'liblinear':
self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
X, y, self.C, self.fit_intercept, self.intercept_scaling,
self.class_weight, self.penalty, self.dual, self.verbose,
self.max_iter, self.tol, self.random_state)
self.max_iter, self.tol, self.random_state,
sample_weight=sample_weight)
self.n_iter_ = np.array([n_iter_])
return self

@@ -1482,7 +1480,7 @@ def fit(self, X, y, sample_weight=None):
Returns self.
"""
_check_solver_option(self.solver, self.multi_class, self.penalty,
self.dual, sample_weight)
self.dual)

if not isinstance(self.max_iter, numbers.Number) or self.max_iter < 0:
raise ValueError("Maximum number of iteration must be positive;"
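Taken together, these changes route ``sample_weight`` into ``_fit_liblinear`` instead of rejecting it in ``_check_solver_option``. A hedged sketch of the resulting public behavior for both estimators (illustration only, not part of the diff):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

X, y = make_classification(n_samples=60, n_features=5, random_state=0)
rng = np.random.RandomState(0)
sample_weight = rng.uniform(0.5, 2.0, size=y.shape[0])

# Both the plain and the cross-validated estimator now accept weights
# with solver='liblinear'.
LogisticRegression(solver='liblinear').fit(X, y, sample_weight=sample_weight)
LogisticRegressionCV(solver='liblinear').fit(X, y, sample_weight=sample_weight)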
65 changes: 46 additions & 19 deletions sklearn/linear_model/tests/test_logistic.py
@@ -574,20 +574,20 @@ def test_logistic_regressioncv_class_weights():
def test_logistic_regression_sample_weights():
X, y = make_classification(n_samples=20, n_features=5, n_informative=3,
n_classes=2, random_state=0)
sample_weight = np.ones(y.shape[0])
sample_weight[y == 1] = 2

for LR in [LogisticRegression, LogisticRegressionCV]:
# Test that liblinear fails when sample weights are provided
clf_lib = LR(solver='liblinear')
assert_raises(ValueError, clf_lib.fit, X, y,
sample_weight=np.ones(y.shape[0]))

# Test that passing sample_weight as ones is the same as
# not passing them at all (default None)
clf_sw_none = LR(solver='lbfgs', fit_intercept=False)
clf_sw_none.fit(X, y)
clf_sw_ones = LR(solver='lbfgs', fit_intercept=False)
clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
assert_array_almost_equal(clf_sw_none.coef_, clf_sw_ones.coef_, decimal=4)
for solver in ['lbfgs', 'liblinear']:
clf_sw_none = LR(solver=solver, fit_intercept=False)
clf_sw_none.fit(X, y)
clf_sw_ones = LR(solver=solver, fit_intercept=False)
clf_sw_ones.fit(X, y, sample_weight=np.ones(y.shape[0]))
assert_array_almost_equal(
clf_sw_none.coef_, clf_sw_ones.coef_, decimal=4)

# Test that sample weights work the same with the lbfgs,
# newton-cg, and 'sag' solvers
@@ -598,20 +598,47 @@ def test_logistic_regression_sample_weights():
clf_sw_sag = LR(solver='sag', fit_intercept=False,
max_iter=2000, tol=1e-7)
clf_sw_sag.fit(X, y, sample_weight=y + 1)
assert_array_almost_equal(clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
assert_array_almost_equal(clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)
clf_sw_liblinear = LR(solver='liblinear', fit_intercept=False,
max_iter=50, tol=1e-7)
clf_sw_liblinear.fit(X, y, sample_weight=y + 1)
assert_array_almost_equal(
clf_sw_lbfgs.coef_, clf_sw_n.coef_, decimal=4)
assert_array_almost_equal(
clf_sw_lbfgs.coef_, clf_sw_sag.coef_, decimal=4)
assert_array_almost_equal(
clf_sw_lbfgs.coef_, clf_sw_liblinear.coef_, decimal=4)

# Test that passing class_weight as [1,2] is the same as
# passing class weight = [1,1] but adjusting sample weights
# to be 2 for all instances of class 1
clf_cw_12 = LR(solver='lbfgs', fit_intercept=False,
class_weight={0: 1, 1: 2})
clf_cw_12.fit(X, y)
sample_weight = np.ones(y.shape[0])
sample_weight[y == 1] = 2
clf_sw_12 = LR(solver='lbfgs', fit_intercept=False)
clf_sw_12.fit(X, y, sample_weight=sample_weight)
assert_array_almost_equal(clf_cw_12.coef_, clf_sw_12.coef_, decimal=4)
for solver in ['lbfgs', 'liblinear']:
clf_cw_12 = LR(solver=solver, fit_intercept=False,
class_weight={0: 1, 1: 2})
clf_cw_12.fit(X, y)
clf_sw_12 = LR(solver=solver, fit_intercept=False)
clf_sw_12.fit(X, y, sample_weight=sample_weight)
assert_array_almost_equal(
clf_cw_12.coef_, clf_sw_12.coef_, decimal=4)

# Test the above for l1 penalty and l2 penalty with dual=True,
# since the patched liblinear code is different.
clf_cw = LogisticRegression(
solver="liblinear", fit_intercept=False, class_weight={0: 1, 1: 2},
penalty="l1")
clf_cw.fit(X, y)
clf_sw = LogisticRegression(
solver="liblinear", fit_intercept=False, penalty="l1")
clf_sw.fit(X, y, sample_weight)
assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)

clf_cw = LogisticRegression(
solver="liblinear", fit_intercept=False, class_weight={0: 1, 1: 2},
penalty="l2", dual=True)
clf_cw.fit(X, y)
clf_sw = LogisticRegression(
solver="liblinear", fit_intercept=False, penalty="l2", dual=True)
clf_sw.fit(X, y, sample_weight)
assert_array_almost_equal(clf_cw.coef_, clf_sw.coef_, decimal=4)


def _compute_class_weight_dictionary(y):
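Every new assertion rests on the same property: giving class k a ``class_weight`` of w must produce the same fit as multiplying the ``sample_weight`` of each class-k sample by w. A minimal standalone sketch of that check (variable names are local to this example):

import numpy as np
from numpy.testing import assert_allclose
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=20, n_features=5, n_informative=3,
                           n_classes=2, random_state=0)
sw = np.ones(y.shape[0])
sw[y == 1] = 2

clf_cw = LogisticRegression(solver='liblinear', fit_intercept=False,
                            class_weight={0: 1, 1: 2}).fit(X, y)
clf_sw = LogisticRegression(solver='liblinear',
                            fit_intercept=False).fit(X, y, sample_weight=sw)
assert_allclose(clf_cw.coef_, clf_sw.coef_, atol=1e-4)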
16 changes: 13 additions & 3 deletions sklearn/svm/base.py
@@ -10,7 +10,8 @@
from ..base import BaseEstimator, ClassifierMixin
from ..preprocessing import LabelEncoder
from ..multiclass import _ovr_decision_function
from ..utils import check_array, check_random_state, column_or_1d, check_X_y
from ..utils import check_array, check_consistent_length, check_random_state
from ..utils import column_or_1d, check_X_y
from ..utils import compute_class_weight, deprecated
from ..utils.extmath import safe_sparse_dot
from ..utils.validation import check_is_fitted
@@ -765,7 +766,8 @@ def _get_liblinear_solver_type(multi_class, penalty, loss, dual):
def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
penalty, dual, verbose, max_iter, tol,
random_state=None, multi_class='ovr',
loss='logistic_regression', epsilon=0.1):
loss='logistic_regression', epsilon=0.1,
sample_weight=None):
"""Used by Logistic Regression (and CV) and LinearSVC.

Preprocessing is done in this function before supplying it to liblinear.
@@ -840,6 +842,8 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,
that the value of this parameter depends on the scale of the target
variable y. If unsure, set epsilon=0.

sample_weight : array-like, optional
Weights assigned to each sample.

Returns
-------
@@ -886,11 +890,17 @@ def _fit_liblinear(X, y, C, fit_intercept, intercept_scaling, class_weight,

# LibLinear wants targets as doubles, even for classification
y_ind = np.asarray(y_ind, dtype=np.float64).ravel()
if sample_weight is None:
sample_weight = np.ones(X.shape[0])
else:
sample_weight = np.array(sample_weight, dtype=np.float64, order='C')
check_consistent_length(sample_weight, X)

solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
raw_coef_, n_iter_ = liblinear.train_wrap(
X, y_ind, sp.isspmatrix(X), solver_type, tol, bias, C,
class_weight_, max_iter, rnd.randint(np.iinfo('i').max),
epsilon)
epsilon, sample_weight)
# Regarding rnd.randint(..) in the above signature:
# seed for srand in range [0..INT_MAX); due to limitations in Numpy
# on 32-bit platforms, we can't get to the UINT_MAX limit that
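The defaulting logic added to ``_fit_liblinear`` is small but easy to get wrong when porting; this standalone sketch restates it with a hypothetical helper name (not a function that exists in the library):

import numpy as np

def _as_liblinear_sample_weight(sample_weight, n_samples):
    """Return a C-contiguous float64 weight vector, defaulting to all ones."""
    if sample_weight is None:
        return np.ones(n_samples)
    sample_weight = np.array(sample_weight, dtype=np.float64, order='C')
    if sample_weight.shape[0] != n_samples:
        raise ValueError("sample_weight and X have inconsistent lengths")
    return sample_weight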