Liblinear Sample Weights #2784

Closed · wants to merge 6 commits
8 changes: 4 additions & 4 deletions doc/modules/svm.rst
@@ -249,10 +249,10 @@ that sets the parameter ``C`` of class ``class_label`` to ``C * value``.
:scale: 75


-:class:`SVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR` and
-:class:`OneClassSVM` implement also weights for individual samples in method
-``fit`` through keyword ``sample_weight``. Similar to ``class_weight``, these
-set the parameter ``C`` for the i-th example to ``C * sample_weight[i]``.
+:class:`SVC`, :class:`LinearSVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR`
+and :class:`OneClassSVM` also implement weights for individual samples in
+method ``fit`` through keyword ``sample_weight``. Similar to ``class_weight``,
+these set the parameter ``C`` for the i-th example to ``C * sample_weight[i]``.


.. figure:: ../auto_examples/svm/images/plot_weighted_samples_001.png
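
For reviewers, a quick sketch of the documented behavior (assuming this
branch, where ``LinearSVC.fit`` accepts ``sample_weight``; the data and
weights below are made up for illustration):

    import numpy as np
    from sklearn.svm import LinearSVC

    X = np.array([[0.0], [0.5], [2.0], [2.5]])
    y = np.array([0, 0, 1, 1])

    # Up-weight the last sample: its effective penalty becomes C * 5.
    w = np.array([1.0, 1.0, 1.0, 5.0])

    clf = LinearSVC(C=1.0)
    clf.fit(X, y, sample_weight=w)
    print(clf.coef_, clf.intercept_)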
4 changes: 4 additions & 0 deletions doc/whats_new.rst
@@ -349,6 +349,10 @@ Bug fixes
when fit with ``sample_weight != None`` and/or with ``bootstrap=True``.
By `Gilles Louppe`_.

+- Added support for sample weights to :class:`LinearSVC <svm.LinearSVC>`. By
+  Gabriel Schwartz.


API changes summary
-------------------

2 changes: 2 additions & 0 deletions sklearn/linear_model/logistic.py
@@ -520,6 +520,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10,
log_reg = LogisticRegression(fit_intercept=fit_intercept)
log_reg._enc = LabelEncoder()
log_reg._enc.fit_transform([-1, 1])
+    log_reg.classes_ = log_reg._enc.classes_

X_train = X[train]
X_test = X[test]
@@ -920,6 +921,7 @@ def fit(self, X, y):

self._enc = LabelEncoder()
self._enc.fit(y)
+        self.classes_ = self._enc.classes_

labels = self.classes_
n_classes = len(labels)
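
Note: with ``classes_`` now assigned eagerly from the encoder in both places
above, the ``classes_`` property on ``BaseLibLinear`` becomes redundant and is
removed at the end of this diff. A standalone sketch of what the attribute
holds:

    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    enc = LabelEncoder()
    y_ind = enc.fit_transform(np.array([-1, 1, 1, -1]))
    print(enc.classes_)  # array([-1,  1]); this is what classes_ now stores
    print(y_ind)         # array([0, 1, 1, 0])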
72 changes: 49 additions & 23 deletions sklearn/svm/base.py
@@ -18,6 +18,29 @@
LIBSVM_IMPL = ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr']


+def _validate_targets_with_weight(clf, y, sample_weight):
+    y_ = column_or_1d(y, warn=True)
+    cls, y = np.unique(y_, return_inverse=True)
+
+    if sample_weight is not None:
+        sw = column_or_1d(sample_weight, warn=True)
+        cls = np.unique(y_[sw > 0])
+
+    if len(cls) < 2:
+        raise ValueError(
+            "The number of classes has to be greater than one; got %d"
+            % len(cls))
+
+    # This must be called here so that the class weight list doesn't contain
+    # weights for classes eliminated because they had no samples with > 0
+    # weight.
+    clf.class_weight_ = compute_class_weight(clf.class_weight, cls, y_)
+    clf.classes_ = cls
+
+    # LibLinear and LibSVM want targets as doubles, even for classification.
+    return np.asarray(y, dtype=np.float64, order='C')
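
A standalone sketch of the guard above: samples with zero weight should not
contribute a class, so a problem where one class is entirely zero-weighted has
to be rejected (toy data for illustration):

    import numpy as np

    y = np.array([0, 0, 1, 1])
    sw = np.array([1.0, 1.0, 0.0, 0.0])

    surviving = np.unique(y[sw > 0])  # mirrors cls = np.unique(y_[sw > 0])
    print(len(surviving))  # 1 -> _validate_targets_with_weight raises ValueError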


def _one_vs_one_coef(dual_coef, n_support, support_vectors):
"""Generate primal coefficients from dual coefficients
for the one-vs-one multi class LibSVM in the case
@@ -135,7 +158,7 @@ def fit(self, X, y, sample_weight=None):
self._sparse = sparse and not callable(self.kernel)

X = check_array(X, accept_sparse='csr', dtype=np.float64, order='C')
-        y = self._validate_targets(y)
+        y = self._validate_targets(y, sample_weight)

sample_weight = np.asarray([]
if sample_weight is None
@@ -185,7 +208,7 @@ def fit(self, X, y, sample_weight=None):
self.intercept_ *= -1
return self

-    def _validate_targets(self, y):
+    def _validate_targets(self, y, sample_weight=None):
"""Validation of y and class_weight.

Default implementation for SVR and one-class; overridden in BaseSVC.
@@ -437,18 +460,8 @@ def coef_(self):
class BaseSVC(BaseLibSVM, ClassifierMixin):
"""ABC for LibSVM-based classifiers."""

-    def _validate_targets(self, y):
-        y_ = column_or_1d(y, warn=True)
-        cls, y = np.unique(y_, return_inverse=True)
-        self.class_weight_ = compute_class_weight(self.class_weight, cls, y_)
-        if len(cls) < 2:
-            raise ValueError(
-                "The number of classes has to be greater than one; got %d"
-                % len(cls))
-
-        self.classes_ = cls
-
-        return np.asarray(y, dtype=np.float64, order='C')
+    def _validate_targets(self, y, sample_weight=None):
+        return _validate_targets_with_weight(self, y, sample_weight)

def predict(self, X):
"""Perform classification on samples in X.
@@ -656,7 +669,7 @@ def _get_solver_type(self):
self.loss, self.dual))
return self._solver_type_dict[solver_type]

-    def fit(self, X, y):
+    def fit(self, X, y, sample_weight=None):
"""Fit the model according to the given training data.

Parameters
@@ -668,6 +681,10 @@ def fit(self, X, y):
y : array-like, shape = [n_samples]
Target vector relative to X

+        sample_weight : array-like, shape = [n_samples]
+            Per-sample weights. Rescale C per sample. Higher weights force the
+            classifier to put more emphasis on these points.

Returns
-------
self : object
@@ -686,23 +703,32 @@ def fit(self, X, y):
raise ValueError("newton-cg and lbfgs solvers support only "
"the primal form.")

+        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")
+        y = self._validate_targets(y, sample_weight)

self._enc = LabelEncoder()
y_ind = self._enc.fit_transform(y)

if len(self.classes_) < 2:
raise ValueError("The number of classes has to be greater than"
" one.")

-        X = check_array(X, accept_sparse='csr', dtype=np.float64, order="C")

-        # Used in the liblinear solver.
-        self.class_weight_ = compute_class_weight(self.class_weight,
-                                                  self.classes_, y)
+        sample_weight = np.asarray([]
+                                   if sample_weight is None
+                                   else sample_weight, dtype=np.float64)

if X.shape[0] != y_ind.shape[0]:
raise ValueError("X and y have incompatible shapes.\n"
"X has %s samples, but y has %s." %
(X.shape[0], y_ind.shape[0]))

+        if sample_weight.shape[0] > 0 and sample_weight.shape[0] != X.shape[0]:
+            raise ValueError("sample_weight and X have incompatible shapes: "
+                             "%r vs %r\n"
+                             "Note: Sparse matrices cannot be indexed w/"
+                             "boolean masks (use `indices=True` in CV)."
+                             % (sample_weight.shape, X.shape))

if self.solver not in ['liblinear', 'newton-cg', 'lbfgs']:
raise ValueError("Logistic Regression supports only liblinear,"
" newton-cg and lbfgs solvers.")
@@ -721,6 +747,7 @@ def fit(self, X, y):
self._get_solver_type(),
self.tol, self._get_bias(),
self.C, self.class_weight_,
+                              sample_weight,
rnd.randint(np.iinfo('i').max))
# Regarding rnd.randint(..) in the above signature:
# seed for srand in range [0..INT_MAX); due to limitations in Numpy
@@ -773,9 +800,8 @@ def fit(self, X, y):

return self
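
One more behavioral note: when ``sample_weight`` is ``None``, the code above
hands an empty float64 array down to ``train_wrap``, so the unweighted path is
expected to behave as if every weight were 1. A sketch of that assumption
(this equivalence check is mine, not part of the diff):

    import numpy as np
    from sklearn.svm import LinearSVC

    rng = np.random.RandomState(0)
    X = rng.randn(20, 3)
    y = (X[:, 0] > 0).astype(int)

    a = LinearSVC(random_state=0).fit(X, y)
    b = LinearSVC(random_state=0).fit(X, y, sample_weight=np.ones(20))
    assert np.allclose(a.coef_, b.coef_)  # uniform weights match unweighted fit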

-    @property
-    def classes_(self):
-        return self._enc.classes_
+    def _validate_targets(self, y, sample_weight=None):
+        return _validate_targets_with_weight(self, y, sample_weight)

def _get_bias(self):
if self.fit_intercept: