Liblinear Sample Weights #2784

Closed · wants to merge 6 commits
8 changes: 4 additions & 4 deletions doc/modules/svm.rst
@@ -249,10 +249,10 @@ that sets the parameter ``C`` of class ``class_label`` to ``C * value``.
:scale: 75


-:class:`SVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR` and
-:class:`OneClassSVM` implement also weights for individual samples in method
-``fit`` through keyword ``sample_weight``. Similar to ``class_weight``, these
-set the parameter ``C`` for the i-th example to ``C * sample_weight[i]``.
+:class:`SVC`, :class:`LinearSVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR`
+and :class:`OneClassSVM` also implement weights for individual samples in
+method ``fit`` through keyword ``sample_weight``. Similar to ``class_weight``,
+these set the parameter ``C`` for the i-th example to ``C * sample_weight[i]``.


.. figure:: ../auto_examples/svm/images/plot_weighted_samples_001.png
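
For reviewers, a quick sketch of the documented behavior (assuming this
branch, where ``LinearSVC.fit`` accepts ``sample_weight``; the data and
weights below are made up for illustration):

    import numpy as np
    from sklearn.svm import LinearSVC

    X = np.array([[0.0], [0.5], [2.0], [2.5]])
    y = np.array([0, 0, 1, 1])

    # Up-weight the last sample: its effective penalty becomes C * 5.
    w = np.array([1.0, 1.0, 1.0, 5.0])

    clf = LinearSVC(C=1.0)
    clf.fit(X, y, sample_weight=w)
    print(clf.coef_, clf.intercept_)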
4 changes: 4 additions & 0 deletions doc/whats_new.rst
@@ -349,6 +349,10 @@ Bug fixes
when fit with ``sample_weight != None`` and/or with ``bootstrap=True``.
By `Gilles Louppe`_.

+- Added support for sample weights to :class:`LinearSVC <svm.LinearSVC>`. By
+  Gabriel Schwartz.


API changes summary
-------------------

2 changes: 2 additions & 0 deletions sklearn/linear_model/logistic.py
@@ -520,6 +520,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
log_reg = LogisticRegression(fit_intercept=fit_intercept)
log_reg._enc = LabelEncoder()
log_reg._enc.fit_transform([-1, 1])
+    log_reg.classes_ = log_reg._enc.classes_

X_train = X[train]
X_test = X[test]
@@ -920,6 +921,7 @@ def fit(self, X, y):

self._enc = LabelEncoder()
self._enc.fit(y)
+        self.classes_ = self._enc.classes_

labels = self.classes_
n_classes = len(labels)
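
Note: with ``classes_`` now assigned eagerly from the encoder in both places
above, the ``classes_`` property on ``BaseLibLinear`` becomes redundant and is
removed at the end of this diff. A standalone sketch of what the attribute
holds:

    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    enc = LabelEncoder()
    y_ind = enc.fit_transform(np.array([-1, 1, 1, -1]))
    print(enc.classes_)  # array([-1,  1]); this is what classes_ now stores
    print(y_ind)         # array([0, 1, 1, 0])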
72 changes: 49 additions & 23 deletions sklearn/svm/base.py
@@ -18,6 +18,29 @@
LIBSVM_IMPL = ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr']


+def _validate_targets_with_weight(clf, y, sample_weight):
+    y_ = column_or_1d(y, warn=True)
+    cls, y = np.unique(y_, return_inverse=True)
+
+    if sample_weight is not None:
+        sw = column_or_1d(sample_weight, warn=True)
+        cls = np.unique(y_[sw > 0])
+
+    if len(cls) < 2:
+        raise ValueError(
+            "The number of classes has to be greater than one; got %d"
+            % len(cls))
+
+    # This must be called here so that the class weight list doesn't contain
+    # weights for classes eliminated because they had no samples with > 0
+    # weight.
+    clf.class_weight_ = compute_class_weight(clf.class_weight, cls, y_)
+    clf.classes_ = cls
+
+    # LibLinear and LibSVM want targets as doubles, even for classification.
+    return np.asarray(y, dtype=np.float64, order='C')
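
A standalone sketch of the guard above: samples with zero weight should not
contribute a class, so a problem where one class is entirely zero-weighted has
to be rejected (toy data for illustration):

    import numpy as np

    y = np.array([0, 0, 1, 1])
    sw = np.array([1.0, 1.0, 0.0, 0.0])

    surviving = np.unique(y[sw > 0])  # mirrors cls = np.unique(y_[sw > 0])
    print(len(surviving))  # 1 -> _validate_targets_with_weight raises ValueError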


def _one_vs_one_coef(dual_coef, n_support, support_vectors):
"""Generate primal coefficients from dual coefficients
for the one-vs-one multi class LibSVM in the case
@@ -135,7 +158,7 @@ def fit(self, X, y, sample_weight=None):
self._sparse = sparse and not callable(self.kernel)

X = check_array(X, accept_sparse='csr', dtype=np.float64, order='C')
-        y = self._validate_targets(y)
+        y = self._validate_targets(y, sample_weight)

sample_weight = np.asarray([]
if sample_weight is None
@@ -185,7 +208,7 @@ def fit(self, X, y, sample_weight=None):
self.intercept_ *= -1
return self

-    def _validate_targets(self, y):
+    def _validate_targets(self, y, sample_weight=None):
"""Validation of y and class_weight.

Default implementation for SVR and one-class; overridden in BaseSVC.
@@ -437,18 +460,8 @@ def coef_(self):
class BaseSVC(BaseLibSVM, ClassifierMixin):
"""ABC for LibSVM-based classifiers."""

-    def _validate_targets(self, y):
-        y_ = column_or_1d(y, warn=True)
-        cls, y = np.unique(y_, return_inverse=True)
-        self.class_weight_ = compute_class_weight(self.class_weight, cls, y_)
-        if len(cls) < 2:
-            raise ValueError(
-                "The number of classes has to be greater than one; got %d"
-                % len(cls))
-
-        self.classes_ = cls
-
-        return np.asarray(y, dtype=np.float64, order='C')
+    def _validate_targets(self, y, sample_weight=None):
+        return _validate_targets_with_weight(self, y, sample_weight)

def predict(self, X):
"""Perform classification on samples in X.
@@ -656,7 +669,7 @@ def _get_solver_type(self):
self.loss, self.dual))
return self._solver_type_dict[solver_type]

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
"""Fit the model according to the given training data.

Parameters
@@ -668,6 +681,10 @@ def fit(self, X, y):
y : array-like, shape = [n_samples]
Target vector relative to X

+        sample_weight : array-like, shape = [n_samples]
+            Per-sample weights. Rescale C per sample. Higher weights force the
+            classifier to put more emphasis on these points.

Returns
-------
self : object
@@ -686,23 +703,32 @@ def fit(self, X, y):
raise ValueError("newton-cg and lbfgs solvers support only "
"the primal form.")

+        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
+        y = self._validate_targets(y, sample_weight)

self._enc = LabelEncoder()
y_ind = self._enc.fit_transform(y)

if len(self.classes_) < 2:
raise ValueError("The number of classes has to be greater than"
" one.")

-        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")

-        # Used in the liblinear solver.
-        self.class_weight_ = compute_class_weight(self.class_weight,
-                                                  self.classes_, y)
+        sample_weight = np.asarray([]
+                                   if sample_weight is None
+                                   else sample_weight, dtype=np.float64)

if X.shape[0] != y_ind.shape[0]:
raise ValueError("X and y have incompatible shapes.\n"
"X has %s samples, but y has %s." %
(X.shape[0], y_ind.shape[0]))

+        if sample_weight.shape[0] > 0 and sample_weight.shape[0] != X.shape[0]:
+            raise ValueError("sample_weight and X have incompatible shapes: "
+                             "%r vs %r\n"
+                             "Note: Sparse matrices cannot be indexed w/"
+                             "boolean masks (use `indices=True` in CV)."
+                             % (sample_weight.shape, X.shape))

if self.solver not in ['liblinear', 'newton-cg', 'lbfgs']:
raise ValueError("Logistic Regression supports only liblinear,"
" newton-cg and lbfgs solvers.")
@@ -721,6 +747,7 @@ def fit(self, X, y):
self._get_solver_type(),
self.tol, self._get_bias(),
self.C, self.class_weight_,
+                              sample_weight,
rnd.randint(np.iinfo('i').max))
# Regarding rnd.randint(..) in the above signature:
# seed for srand in range [0..INT_MAX); due to limitations in Numpy
@@ -773,9 +800,8 @@ def fit(self, X, y):

return self
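
One more behavioral note: when ``sample_weight`` is ``None``, the code above
hands an empty float64 array down to ``train_wrap``, so the unweighted path is
expected to behave as if every weight were 1. A sketch of that assumption
(this equivalence check is mine, not part of the diff):

    import numpy as np
    from sklearn.svm import LinearSVC

    rng = np.random.RandomState(0)
    X = rng.randn(20, 3)
    y = (X[:, 0] > 0).astype(int)

    a = LinearSVC(random_state=0).fit(X, y)
    b = LinearSVC(random_state=0).fit(X, y, sample_weight=np.ones(20))
    assert np.allclose(a.coef_, b.coef_)  # uniform weights match unweighted fit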

-    @property
-    def classes_(self):
-        return self._enc.classes_
+    def _validate_targets(self, y, sample_weight=None):
+        return _validate_targets_with_weight(self, y, sample_weight)

def _get_bias(self):
if self.fit_intercept: