[WIP] Add svdd implementation #3201

Closed
wants to merge 8 commits into from
1 change: 1 addition & 0 deletions doc/modules/classes.rst
@@ -1167,6 +1167,7 @@ Estimators
svm.SVR
svm.NuSVR
svm.OneClassSVM
svm.SVDD

.. autosummary::
:toctree: generated/
32 changes: 32 additions & 0 deletions doc/modules/svm.rst
@@ -617,6 +617,38 @@ bound of the fraction of support vectors.
It can be shown that the `\nu`-SVC formulation is a reparametrization
of the `C`-SVC and therefore mathematically equivalent.

SVDD
----

Member (review comment):

    Less maths and more intuition, please: why is this useful? Why is it
    more (or differently) useful than a standard SVM? What's the intuition
    behind it?

    We also need a figure, generated from the example (don't check the
    generated figure into git; use the fact that it is automatically
    generated by 'make html').

Given vectors :math:`x_1, \ldots, x_l`, :class:`SVDD` builds the smallest
sphere around them by solving the problem:

.. math::

\min_{R, a, \xi} R^2 + C\sum_{i = 1}^{l}\xi_i

\textrm {subject to } & \|x_i - a\|^2 \leq R^2 + \xi_i\\
& \xi_i \geq 0, \quad i = 1, \ldots, l

This problem is not convex in :math:`R`, but with the substitution
:math:`\bar{R} = R^2` it can be reformulated as a convex one:

.. math::

\min_{\bar{R}, a, \xi} \bar{R} + C\sum_{i = 1}^{l}\xi_i

\textrm {subject to } & \|x_i - a\|^2 \leq \bar{R} + \xi_i\\
& \xi_i \geq 0, \quad i = 1, \ldots, l\\
& \bar{R} \geq 0
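
In dual form (derived in the reference below) this becomes a quadratic
program over multipliers :math:`\alpha_i`, which is where kernels
:math:`K(x_i, x_j)` enter:

.. math::

\max_{\alpha} & \sum_{i = 1}^l \alpha_i K(x_i, x_i) - \sum_{i, j = 1}^l \alpha_i \alpha_j K(x_i, x_j)\\

\textrm {subject to } & 0 \leq \alpha_i \leq C\\
& \sum_{i = 1}^l \alpha_i = 1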

.. note::

:math:`\frac{1}{C}` is approximately the number of outliers in the
training set.
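
This suggests a simple heuristic for setting ``C`` from an expected
outlier fraction. A minimal sketch (the ``gamma`` value here is an
arbitrary illustration, not a recommendation)::

    >>> from sklearn.svm import SVDD
    >>> n_samples, outlier_fraction = 200, 0.05
    >>> clf = SVDD(kernel='rbf', gamma=0.1,
    ...            C=1. / (outlier_fraction * n_samples))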

.. topic:: References:

* `"Support Vector Data Description"
<http://mediamatica.ewi.tudelft.nl/sites/default/files/ML_SVDD_04.pdf>`_
D. Tax, R. Duin, Machine Learning, 54, 45–66, 2004

Implementation details
======================
21 changes: 12 additions & 9 deletions examples/applications/plot_outlier_detection_housing.py
@@ -19,7 +19,7 @@
able to focus on the main mode of the data distribution, it sticks to the
assumption that the data should be Gaussian distributed, yielding some biased
estimation of the data structure, but yet accurate to some extent.
The One-Class SVM algorithm
The One-Class SVM algorithm and Support Vector Data Description

First example
-------------
@@ -39,7 +39,7 @@
distribution: the location seems to be well estimated, although the covariance
is hard to estimate due to the banana-shaped distribution. Anyway, we can
get rid of some outlying observations.
The One-Class SVM is able to capture the real data structure, but the
The One-Class SVM and SVDD are able to capture the real data structure, but the
difficulty is to adjust its kernel bandwidth parameter so as to obtain
a good compromise between the shape of the data scatter matrix and the
risk of over-fitting the data.
@@ -52,7 +52,7 @@

import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
from sklearn.svm import OneClassSVM, SVDD
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.datasets import load_boston
@@ -67,8 +67,9 @@
contamination=0.261),
"Robust Covariance (Minimum Covariance Determinant)":
EllipticEnvelope(contamination=0.261),
"OCSVM": OneClassSVM(nu=0.261, gamma=0.05)}
colors = ['m', 'g', 'b']
"OCSVM": OneClassSVM(nu=0.261, gamma=0.05),
"SVDD": SVDD(kernel='rbf', gamma = 0.03, C=0.01)}
colors = ['m', 'g', 'b', 'y']
legend1 = {}
legend2 = {}

@@ -105,8 +106,9 @@
plt.ylim((yy1.min(), yy1.max()))
plt.legend((legend1_values_list[0].collections[0],
legend1_values_list[1].collections[0],
legend1_values_list[2].collections[0]),
(legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
legend1_values_list[2].collections[0],
legend1_values_list[3].collections[0]),
(legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2], legend1_keys_list[3]),
loc="upper center",
prop=matplotlib.font_manager.FontProperties(size=12))
plt.ylabel("accessibility to radial highways")
@@ -122,8 +124,9 @@
plt.ylim((yy2.min(), yy2.max()))
plt.legend((legend2_values_list[0].collections[0],
legend2_values_list[1].collections[0],
legend2_values_list[2].collections[0]),
(legend2_values_list[0], legend2_values_list[1], legend2_values_list[2]),
legend2_values_list[2].collections[0],
legend2_values_list[3].collections[0]),
(legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2], legend2_keys_list[3]),
loc="upper center",
prop=matplotlib.font_manager.FontProperties(size=12))
plt.ylabel("% lower status of the population")
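For orientation, each detector in this example is fit on a pair of housing
features and its frontier is the zero level set of ``decision_function``.
A condensed, illustrative sketch of that loop (it assumes ``classifiers``,
``colors``, the meshgrid ``xx1``/``yy1`` and the feature matrix ``X1``
defined as in the example):

    import numpy as np
    import matplotlib.pyplot as plt

    legend1 = {}
    for i, (clf_name, clf) in enumerate(classifiers.items()):
        clf.fit(X1)
        # evaluate the decision function on the grid and draw its zero contour
        Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
        Z1 = Z1.reshape(xx1.shape)
        legend1[clf_name] = plt.contour(xx1, yy1, Z1, levels=[0],
                                        linewidths=2, colors=colors[i])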
3 changes: 2 additions & 1 deletion sklearn/svm/__init__.py
@@ -10,7 +10,7 @@
# of their respective owners.
# License: BSD 3 clause (C) INRIA 2010

from .classes import SVC, NuSVC, SVR, NuSVR, OneClassSVM, LinearSVC
from .classes import SVC, NuSVC, SVR, NuSVR, OneClassSVM, LinearSVC, SVDD
from .bounds import l1_min_c
from . import libsvm, liblinear, libsvm_sparse

@@ -20,6 +20,7 @@
'OneClassSVM',
'SVC',
'SVR',
'SVDD',
'l1_min_c',
'liblinear',
'libsvm',
7 changes: 5 additions & 2 deletions sklearn/svm/base.py
@@ -15,7 +15,7 @@
from ..externals import six


LIBSVM_IMPL = ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr']
LIBSVM_IMPL = ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr', 'svdd']


def _one_vs_one_coef(dual_coef, n_support, support_vectors):
@@ -143,11 +143,14 @@ def fit(self, X, y, sample_weight=None):
solver_type = LIBSVM_IMPL.index(self._impl)

# input validation
if solver_type != 2 and X.shape[0] != y.shape[0]:
if (solver_type not in [2, 5]) and X.shape[0] != y.shape[0]:
raise ValueError("X and y have incompatible shapes.\n" +
"X has %s samples, but y has %s." %
(X.shape[0], y.shape[0]))

if (self.kernel == "precomputed" and solver_type == 5):
raise TypeError("SVDD does not support precomputed kernels")

if self.kernel == "precomputed" and X.shape[0] != X.shape[1]:
raise ValueError("X.shape[0] should be equal to X.shape[1]")

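The magic numbers 2 and 5 in this check are the positions of the two
unsupervised solvers in ``LIBSVM_IMPL``; a quick sanity sketch:

    # 'one_class' and 'svdd' take no y, so the X/y shape check is skipped
    LIBSVM_IMPL = ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr',
                   'nu_svr', 'svdd']
    assert LIBSVM_IMPL.index('one_class') == 2
    assert LIBSVM_IMPL.index('svdd') == 5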
133 changes: 133 additions & 0 deletions sklearn/svm/classes.py
@@ -709,6 +709,11 @@ class OneClassSVM(BaseLibSVM):
`intercept_` : array, shape = [n_classes-1]
Constants in decision function.

See also
--------
SVDD
Builds the smallest sphere around the data set.

"""
def __init__(self, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, tol=1e-3,
nu=0.5, shrinking=True, cache_size=200, verbose=False,
@@ -746,3 +751,131 @@ def fit(self, X, sample_weight=None, **params):
super(OneClassSVM, self).fit(X, [], sample_weight=sample_weight,
**params)
return self


class SVDD(BaseLibSVM):
"""Support vectors data description.

Builds data envelope.

The implementation is based on libsvm.

Parameters
----------
C : float, optional (default=1.0)
Penalty parameter C of the error term. Should be in the interval
[1/l, 1], where l is the number of training samples.

kernel : string, optional (default='linear')
Specifies the kernel type to be used in the algorithm.
It must be one of 'linear', 'poly', 'rbf' or 'sigmoid'.
If none is given, 'linear' will be used. Precomputed and callable
kernels are not supported.

degree : int, optional (default=3)
Degree of the polynomial kernel function ('poly').
Ignored by all other kernels.

gamma : float, optional (default=0.0)
Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
If gamma is 0.0 then 1/n_features will be used instead.

coef0 : float, optional (default=0.0)
Independent term in kernel function.
It is only significant in 'poly' and 'sigmoid'.

tol : float, optional
Tolerance for stopping criterion.

shrinking : boolean, optional
Whether to use the shrinking heuristic.

cache_size : float, optional
Specify the size of the kernel cache (in MB)

verbose : bool, default: False
Enable verbose output. Note that this setting takes advantage of a
per-process runtime setting in libsvm that, if enabled, may not work
properly in a multithreaded context.

max_iter : int, optional (default=-1)
Hard limit on iterations within solver, or -1 for no limit.

random_state : int seed, RandomState instance, or None (default)
The seed of the pseudo random number generator to use when
shuffling the data for probability estimation.

Attributes
----------
`support_` : array-like, shape = [n_SV]
Index of support vectors.

`support_vectors_` : array-like, shape = [n_SV, n_features]
Support vectors.

`dual_coef_` : array, shape = [1, n_SV]
Coefficient of the support vector in the decision function.

`coef_` : array, shape = [1, n_features]
Weights assigned to the features (coefficients in the primal
problem). This is only available in the case of a linear kernel.

`coef_` is a read-only property derived from `dual_coef_` and
`support_vectors_`.

`intercept_` : array, shape = [1]
Constants in decision function.


Examples
--------
>>> from sklearn.svm import SVDD
>>> import numpy as np
>>> train_x = np.array([[1, 1], [1, -1], [-1, 1], [-1, -1]])
>>> clf = SVDD(kernel='linear')
>>> clf.fit(train_x)
SVDD(C=1, cache_size=200, coef0=0.0, degree=3, gamma=0.0, kernel='linear',
max_iter=-1, random_state=None, shrinking=True, tol=0.001,
verbose=False)
>>> test_x = np.array([[0, 0], [4, 4]])
>>> clf.predict(test_x)
array([ 1., -1.])

See also
--------
OneClassSVM
Estimate the support of a high-dimensional distribution.


"""
def __init__(self, kernel='linear', degree=3, gamma=0.0,
coef0=0.0, tol=1e-3, C=1, shrinking=True, cache_size=200,
verbose=False, max_iter=-1, random_state=None):
super(SVDD, self).__init__(
'svdd', kernel, degree, gamma, coef0, tol, C, 0.0, 0.0,
shrinking, False, cache_size, None, verbose, max_iter,
random_state)

def fit(self, X, sample_weight=None, **params):
"""
Builds an envelope around the set of samples in X.

Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Set of samples, where n_samples is the number of samples and
n_features is the number of features.

sample_weight : array-like, shape (n_samples,), optional
Per-sample weights, forwarded to the underlying libsvm solver.

Returns
-------
self : object
Returns self.

Notes
-----
If X is not a C-ordered contiguous array it is copied.

"""

super(SVDD, self).fit(X, [], sample_weight=sample_weight,
**params)
return self
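As a quick smoke test of the new estimator next to OneClassSVM (an
illustrative sketch with arbitrary data and parameters, assuming this
branch is installed):

    import numpy as np
    from sklearn.svm import OneClassSVM, SVDD

    rng = np.random.RandomState(0)
    X = rng.randn(100, 2)  # inlier cloud around the origin

    svdd = SVDD(kernel='rbf', gamma=0.1, C=0.1).fit(X)
    ocsvm = OneClassSVM(kernel='rbf', gamma=0.1, nu=0.1).fit(X)

    # both should label a far-away point -1 (outlier) and a central one +1
    print(svdd.predict([[10., 10.]]), svdd.predict([[0., 0.]]))
    print(ocsvm.predict([[10., 10.]]), ocsvm.predict([[0., 0.]]))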
