From eb2d93aab2929239e586e2fb341a3a69fdc2345e Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Sat, 19 Nov 2016 14:44:48 +0300 Subject: [PATCH 01/41] ENH: nu-SVDD with sample weights, based on Chang, Lee, Lin (2013) --- doc/modules/classes.rst | 3 +- doc/modules/outlier_detection.rst | 46 +++++- doc/modules/svm.rst | 200 +++++++++++++++++++++++++- examples/svm/plot_oneclass.py | 6 +- examples/svm/plot_oneclass_vs_svdd.py | 102 +++++++++++++ sklearn/svm/__init__.py | 4 +- sklearn/svm/_base.py | 7 +- sklearn/svm/_classes.py | 151 ++++++++++++++++++- sklearn/svm/_libsvm.pyx | 12 ++ sklearn/svm/src/libsvm/svm.cpp | 147 +++++++++++++++++-- sklearn/svm/src/libsvm/svm.h | 2 +- sklearn/svm/tests/test_sparse.py | 18 +++ sklearn/svm/tests/test_svm.py | 89 ++++++++++++ 13 files changed, 762 insertions(+), 25 deletions(-) create mode 100644 examples/svm/plot_oneclass_vs_svdd.py diff --git a/doc/modules/classes.rst b/doc/modules/classes.rst index c6838556d50ad..97cc866780347 100644 --- a/doc/modules/classes.rst +++ b/doc/modules/classes.rst @@ -1529,9 +1529,10 @@ Estimators svm.LinearSVR svm.NuSVC svm.NuSVR - svm.OneClassSVM svm.SVC svm.SVR + svm.OneClassSVM + svm.SVDD .. autosummary:: :toctree: generated/ diff --git a/doc/modules/outlier_detection.rst b/doc/modules/outlier_detection.rst index 75a191a767aa5..4abb126cb6f22 100644 --- a/doc/modules/outlier_detection.rst +++ b/doc/modules/outlier_detection.rst @@ -157,8 +157,8 @@ coming from the same population than the initial observations. Otherwise, if they lay outside the frontier, we can say that they are abnormal with a given confidence in our assessment. -The One-Class SVM has been introduced by Schölkopf et al. for that purpose -and implemented in the :ref:`svm` module in the +The :ref:`svm_one_class_svm` has been introduced by Schölkopf et al. +for that purpose and implemented in the :ref:`svm` module in the :class:`svm.OneClassSVM` object. It requires the choice of a kernel and a scalar parameter to define a frontier. The RBF kernel is usually chosen although there exists no exact formula or algorithm to @@ -167,12 +167,29 @@ implementation. The `nu` parameter, also known as the margin of the One-Class SVM, corresponds to the probability of finding a new, but regular, observation outside the frontier. +The Support Vector Data Description (:ref:`svm_svdd`) is an alternative +model for estimating the support of a data distribution. It was proposed +by Tax and Duin, and later reformulated by Chang et al. The reparametrized +SVDD model, which has better parameter interpretability, is implemented +in the :class:`svm.SVDD` object in the :ref:`svm` module. The interface +as well as the interpretation of the parameters is similar to the +:ref:`svm_one_class_svm` model. + .. topic:: References: * `Estimating the support of a high-dimensional distribution `_ Schölkopf, Bernhard, et al. Neural computation 13.7 (2001): 1443-1471. + * `Support vector data description + `_ + Tax, and Duin. Machine learning, 54(1) (2004), pp.45-66. + + * `A revisit to support vector data description (SVDD). + `_ Chang, Lee, + and Lin. Technical Report (2013), Dept. of Computer Science, + National Taiwan University. + .. topic:: Examples: * See :ref:`sphx_glr_auto_examples_svm_plot_oneclass.py` for visualizing the @@ -415,3 +432,28 @@ Novelty detection with Local Outlier Factor is illustrated below. :target: ../auto_examples/neighbors/plot_lof_novelty_detection.html :align: center :scale: 75% + +.. 
_outlier_detection_ocsvm_vs_svdd:
+
+One-Class SVM versus SVDD-L1
+----------------------------
+
+The :ref:`svm_one_class_svm` and :ref:`svm_svdd` models, though apparently
+different, both attempt to construct a hypersurface enveloping the densest
+regions of the training sample. In the case of a stationary kernel
+:math:`K(x,y)=K(x-y)`, such as RBF (see :ref:`svm_kernels`), for
+:math:`\nu\in (0,1)` the decision functions are identical:
+
+.. figure:: ../auto_examples/svm/images/sphx_glr_plot_oneclass_vs_svdd_001.png
+   :target: ../auto_examples/svm/plot_oneclass_vs_svdd.html
+   :align: center
+   :scale: 75%
+
+But for a non-stationary kernel :math:`K(x,y)`, such as polynomial, the decision
+functions may be dramatically different:
+
+.. figure:: ../auto_examples/svm/images/sphx_glr_plot_oneclass_vs_svdd_002.png
+   :target: ../auto_examples/svm/plot_oneclass_vs_svdd.html
+   :align: center
+   :scale: 75%
+
diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst
index 75609adf38c9c..b2aa26d11bd3e 100644
--- a/doc/modules/svm.rst
+++ b/doc/modules/svm.rst
@@ -271,7 +271,7 @@ with and without weight correction.

 :class:`SVC`, :class:`NuSVC`, :class:`SVR`, :class:`NuSVR`, :class:`LinearSVC`,
-:class:`LinearSVR` and :class:`OneClassSVM` implement also weights for
+:class:`LinearSVR`, :class:`OneClassSVM` and :class:`SVDD` also implement weights for
 individual samples in the `fit` method through the ``sample_weight`` parameter.
 Similar to ``class_weight``, this sets the parameter ``C`` for the i-th
 example to ``C * sample_weight[i]``, which will encourage the classifier to
@@ -339,6 +339,27 @@ Density estimation, novelty detection
 The class :class:`OneClassSVM` implements a One-Class SVM which is used in
 outlier detection.

+The :ref:`svm_one_class_svm` and :ref:`svm_svdd` models can be used for novelty
+detection: given a set of samples, the model detects a soft boundary of that
+set so as to classify new points as belonging to that set or not. The
+classes that implement these models are :class:`OneClassSVM` and
+:class:`SVDD` respectively.
+
+Since novelty detection is a type of unsupervised learning, the ``fit`` method
+requires only an array X as input, as there are no class labels.
+
+See section :ref:`outlier_detection` for more details on this usage.
+
+.. figure:: ../auto_examples/svm/images/sphx_glr_plot_oneclass_001.png
+   :target: ../auto_examples/svm/plot_oneclass.html
+   :align: center
+   :scale: 75
+
+
+.. topic:: Examples:
+
+   * :ref:`sphx_glr_auto_examples_svm_plot_oneclass.py`
+   * :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py`
+
-See :ref:`outlier_detection` for the description and usage of OneClassSVM.

 Complexity
@@ -422,8 +444,9 @@ Tips on Practical Use
     using a large stopping tolerance), the code without using shrinking may
     be much faster*

-  * Parameter ``nu`` in :class:`NuSVC`/:class:`OneClassSVM`/:class:`NuSVR`
-    approximates the fraction of training errors and support vectors.
+  * Parameter ``nu`` in :class:`NuSVC`, :class:`OneClassSVM`, :class:`NuSVR`,
+    and :class:`SVDD` approximates the fraction of training errors and support
+    vectors.

   * In :class:`SVC`, if the data is unbalanced (e.g. many positive and few
     negative), set ``class_weight='balanced'`` and/or try
@@ -760,6 +783,177 @@ where we make use of the epsilon-insensitive loss, i.e. errors of less
 than :math:`\varepsilon` are ignored. This is the form that is directly
 optimized by :class:`LinearSVR`.

+.. _svm_one_class_svm:
+
+One-Class SVM
+-------------
+
+This model, proposed by Schölkopf et al.
(2001), estimates the support
+of a high-dimensional distribution by constructing a supporting hyperplane
+in the feature space corresponding to the kernel, which effectively
+separates the data set from the origin with maximum margin.
+
+For the training sample :math:`(x_i)_{i=1}^{n}` with weights :math:`(w_i)_{i=1}^{n}`,
+:math:`\sum_{i=1}^{n} w_i>0`, the One-Class SVM solves the following primal problem:
+
+.. math::
+
+    \min_{\rho,\xi,w} \frac12 w^Tw - \rho + \frac{1}{\nu W} \sum_{i=1}^{n} w_i \xi_i \,, \\
+
+    \textrm{subject to } & w^T\phi(x_i) \geq \rho - \xi_i \,, \\
+    & \xi_i \geq 0\,,\, i=1, \ldots, n \,,
+
+where :math:`\phi(\cdot)` is the feature map associated with the
+kernel :math:`K(\cdot,\cdot)`, and :math:`W = \sum_{i=1}^{n} w_i`.
+
+The dual problem is
+
+.. math::
+
+    \min_\alpha \frac12 \alpha^T Q\alpha \,, \\
+
+    \textrm{subject to } & 0\leq \alpha_i \leq w_i\,,\, i=1, \ldots, n \,,\\
+    & e^T\alpha = \nu W \,,
+
+where :math:`e\in \mathbb{R}^{n\times 1}` is the vector of ones and
+:math:`Q_{ij} = K(x_i, x_j)` is the kernel Gram matrix.
+
+The optimal decision function is given by:
+
+.. math:: x\mapsto \operatorname{sgn}(\sum_{i=1}^{n} \alpha_i K(x_i, x) - \rho) \,,
+
+where :math:`+1` indicates an inlier and :math:`-1` an outlier.
+
+The parameter :math:`\nu\in(0,1]` determines the fraction of outliers
+in the training dataset. More technically, :math:`\nu` is:
+
+    * an upper bound on the fraction of the training points lying outside
+      the estimated region;
+
+    * a lower bound on the fraction of support vectors.
+
+.. topic:: References:
+
+    * `Estimating the support of a high-dimensional distribution
+      `_ Schölkopf,
+      Bernhard, et al. Neural computation 13.7 (2001): 1443-1471.
+      doi:10.1162/089976601750264965
+
+
+.. _svm_svdd:
+
+SVDD
+----
+
+Support Vector Data Description (SVDD), proposed by Tax and Duin (2004),
+aims at finding a spherically shaped boundary around a data set. Specifically,
+it computes a minimum volume hypersphere containing most of the data, with
+the number of outliers controlled by a parameter of the model.
+
+The original formulation suffered from non-convexity issues related to the
+optimality of the attained solution for certain values of the regularization
+parameter :math:`C`. Chang, Lee, and Lin (2013) suggested a reformulation of
+the SVDD model which has a well-defined and provably unique global solution
+for any :math:`C>0`.
+
+The implementation in the class :class:`SVDD` is based on a modified version
+of the 2013 SVDD formulation. The following changes were made to problem (7)
+in Chang et al. (2013):
+
+    * **sample weights**: instead of a uniform penalty :math:`C>0`, sample
+      observations are allowed to have different costs :math:`(C_i)_{i=1}^{n}`,
+      :math:`\sum_{i=1}^{n} C_i > 0`;
+
+    * :math:`\nu`-**parametrization**: the penalties are determined by
+      :math:`C_i = \frac{w_i}{\nu \sum_{i=1}^{n} w_i}`, where :math:`\nu\in(0, 1]`
+      and :math:`(w_i)_{i=1}^{n}` are non-negative sample weights.
+
+A straightforward extension of theorems 2-4 of Chang et al. (2013) to the case
+of non-uniform penalties yields :math:`\sum_{i=1}^{n} C_i > 1`, or equivalently
+:math:`\nu < 1`, as the condition that distinguishes the case :math:`R>0`
+(theorem 4, case 1) from :math:`R=0` (theorem 4, case 2).
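+
+For instance, with uniform weights :math:`w_i = 1` the penalties are
+:math:`C_i = \frac{1}{\nu n}`, so that
+
+.. math:: \sum_{i=1}^{n} C_i = \frac{1}{\nu} > 1
+    \quad\Longleftrightarrow\quad \nu < 1 \,.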
+
+The main benefit of the :math:`\nu`-parametrization is a clearer interpretation
+and a unified interface to the :ref:`svm_one_class_svm` model: :math:`\nu` is an
+upper bound on the fraction of the training points lying outside the estimated
+region, and a lower bound on the fraction of support vectors. Under the original
+:math:`C`-parametrization the value :math:`\frac{1}{n C}` served as these bounds.
+
+Note that in a typical run of the SVDD model the weights are set to :math:`w_i = 1`,
+which is equivalent to the original 2013 SVDD formulation for :math:`C = \frac{1}{\nu n}`.
+
+The primal problem of this modified version of SVDD for the training sample
+:math:`(x_i)_{i=1}^{n}` with weights :math:`(w_i)_{i=1}^{n}`,
+:math:`\sum_{i=1}^{n} w_i>0`, is:
+
+.. math::
+
+    \min_{R,\xi,a} R + \frac{1}{\nu W} \sum_{i=1}^{n} w_i \xi_i\,,\\
+
+    \textrm{subject to } & \|\phi(x_i) - a\|^2 \leq R + \xi_i\,,\\
+    & \xi_i \geq 0\,,\, i=1, \ldots, n\,,\\
+    & R \geq 0\,,
+
+where :math:`\phi(\cdot)` is the feature map associated with the kernel
+:math:`K(\cdot,\cdot)`, and :math:`W = \sum_{i=1}^{n} w_i`.
+
+When :math:`\nu \geq 1`, the optimal :math:`R=0` and the primal problem
+reduces to an unconstrained convex optimization problem independent of
+:math:`\nu`:
+
+.. math:: \min_a \sum_{i=1}^{n} w_i \|\phi(x_i) - a\|^2\,.
+
+Note that in this case every training observation is an outlier.
+
+In the case when :math:`\nu < 1` the constraint :math:`R\geq 0` is redundant,
+strong duality holds, and the dual problem has the form:
+
+.. math::
+
+    \min_\alpha \frac12 \alpha^T Q\alpha - \frac{\nu W}{2} \sum_{i=1}^{n} \alpha_i Q_{ii}\,,\\
+
+    \textrm{subject to } & 0 \leq \alpha_i \leq w_i\,,\, i=1, \ldots, n\,,\\
+    & e^T \alpha = \nu W\,,
+
+where :math:`e\in \mathbb{R}^{n\times 1}` is the vector of ones and
+:math:`Q_{ij} = K(x_i, x_j)` is the kernel Gram matrix.
+
+The decision function of the SVDD is given by:
+
+.. math:: x\mapsto \operatorname{sgn}(R - \|\phi(x) - a\|^2) \,,
+
+where :math:`+1` indicates an inlier and :math:`-1` an outlier. The
+distances in the feature space and :math:`R` are computed implicitly through
+the coefficients and the optimal value of the objective of the corresponding
+dual problem.
+
+It is worth noting that for a stationary kernel :math:`K(x,y)=K(x-y)`
+the SVDD and One-Class SVM models are provably equivalent. Indeed, the values
+:math:`Q_{ii} = K(x_i, x_i)` in the last term of the dual of the SVDD are all
+equal to :math:`K(0)`, so by the constraint :math:`e^T \alpha = \nu W` that
+whole term is constant on the feasible set. Therefore the objective functions
+of the dual problems of the One-Class SVM and the SVDD are equivalent up to a
+constant. This, however, **does not imply** that one model generalizes the
+other: their solutions just happen to coincide for a particular family of
+kernels (see :ref:`outlier_detection_ocsvm_vs_svdd`).
+
+.. topic:: References:
+
+    * `Support vector data description
+      `_
+      Tax and Duin. Machine learning, 54(1) (2004), pp.45-66.
+
+    * `A revisit to support vector data description (SVDD).
+      `_ Chang, Lee,
+      and Lin. Technical Report (2013), Dept. of Computer Science,
+      National Taiwan University.
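+
+As a quick illustration, a minimal usage sketch of this estimator (the
+exact fraction of predicted outliers depends on the data, but is roughly
+bounded above by ``nu``)::
+
+    >>> import numpy as np
+    >>> from sklearn.svm import SVDD
+    >>> rng = np.random.RandomState(0)
+    >>> X = np.r_[2 + 0.3 * rng.randn(100, 2), -2 + 0.3 * rng.randn(100, 2)]
+    >>> detector = SVDD(kernel="rbf", nu=0.1, gamma=0.1).fit(X)
+    >>> y_pred = detector.predict(X)  # +1 for inliers, -1 for outliers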
+
+
+.. _svm_implementation_details:

 Implementation details

diff --git a/examples/svm/plot_oneclass.py b/examples/svm/plot_oneclass.py
index 082cbcd6de2be..7c30370324846 100644
--- a/examples/svm/plot_oneclass.py
+++ b/examples/svm/plot_oneclass.py
@@ -1,11 +1,11 @@
 """
 ==========================================
-One-class SVM with non-linear kernel (RBF)
+One-Class SVM with non-linear kernel (RBF)
 ==========================================

-An example using a one-class SVM for novelty detection.
+An example using a One-Class SVM for novelty detection.

-:ref:`One-class SVM ` is an unsupervised
+:ref:`One-Class SVM ` is an unsupervised
 algorithm that learns a decision function for novelty detection:
 classifying new data as similar or different to the training set.

diff --git a/examples/svm/plot_oneclass_vs_svdd.py b/examples/svm/plot_oneclass_vs_svdd.py
new file mode 100644
index 0000000000000..a2d20df63a72a
--- /dev/null
+++ b/examples/svm/plot_oneclass_vs_svdd.py
@@ -0,0 +1,102 @@
+"""
+=========================
+One-Class SVM versus SVDD
+=========================
+
+An example comparing the One-Class SVM and SVDD models for novelty
+detection.
+
+:ref:`Support Vector Data Description (SVDD) `
+and :ref:`One-Class SVM ` are unsupervised
+algorithms that learn a decision function for novelty detection, i.e.
+the problem of classifying new data as similar or different to the
+training set.
+
+It can be shown that the One-Class SVM and SVDD models yield identical
+results in the case of a stationary kernel, like RBF, but produce different
+decision functions for non-stationary kernels, e.g. polynomial, as this
+example demonstrates.
+
+Note that it is incorrect to say that the SVDD generalizes the One-Class
+SVM: these are different models, which just happen to coincide for a
+particular family of kernels.
+"""
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.font_manager
+from sklearn import svm
+
+print(__doc__)
+
+random_state = np.random.RandomState(42)
+
+xx, yy = np.meshgrid(np.linspace(-7, 7, 501), np.linspace(-7, 7, 501))
+# Generate train data
+X = 0.3 * random_state.randn(100, 2)
+X_train = np.r_[X + 2, X - 2]
+# Generate some regular novel observations
+X = 0.3 * random_state.randn(20, 2)
+X_test = np.r_[X + 2, X - 2]
+# Generate some abnormal novel observations
+X_outliers = random_state.uniform(low=-4, high=4, size=(20, 2))
+
+# Define the models
+nu = .1
+kernels = [("RBF", dict(kernel="rbf", gamma=0.1)),
+           ("Poly", dict(kernel="poly", degree=2, coef0=1.0)),
+           ]
+
+for kernel_name, kernel in kernels:
+
+    # Use a low tolerance to ensure better precision of the SVM
+    # optimization procedure.
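+    # (tol defaults to 1e-3 for both models; with a stationary kernel the
+    # two decision functions agree only up to the solver's tolerance, so a
+    # tighter tolerance makes the fitted frontiers visually coincide.)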
+    classifiers = [("OCSVM", svm.OneClassSVM(nu=nu, tol=1e-8, **kernel)),
+                   ("SVDD", svm.SVDD(nu=nu, tol=1e-8, **kernel)),
+                   ]
+
+    fig = plt.figure(figsize=(12, 5))
+    fig.suptitle("One-Class SVM versus SVDD "
+                 "(error train, error novel regular, error novel abnormal)")
+
+    for i, (model_name, clf) in enumerate(classifiers):
+        clf.fit(X_train)
+
+        y_pred_train = clf.predict(X_train)
+        y_pred_test = clf.predict(X_test)
+        y_pred_outliers = clf.predict(X_outliers)
+        n_error_train = y_pred_train[y_pred_train == -1].size
+        n_error_test = y_pred_test[y_pred_test == -1].size
+        n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size
+
+        ax = fig.add_subplot(1, 2, i + 1)
+
+        # plot the level sets of the decision function
+        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
+        Z = Z.reshape(xx.shape)
+
+        ax.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7),
+                    cmap=plt.cm.PuBu, zorder=-99)
+        ax.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred',
+                    zorder=-98)
+        a = ax.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred',
+                       zorder=-97)
+
+        s = 40
+        b1 = ax.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s)
+        b2 = ax.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s)
+        c = ax.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s)
+        ax.axis('tight')
+        ax.set_xlim((-7, 7))
+        ax.set_ylim((-7, 7))
+
+        ax.set_title("%s %s (%d/200, %d/40, %d/40)"
+                     % (model_name, kernel_name, n_error_train,
+                        n_error_test, n_error_outliers))
+
+        ax.legend([a.collections[0], b1, b2, c],
+                  ["learned frontier", "training observations",
+                   "new regular observations", "new abnormal observations"],
+                  loc="lower right",
+                  prop=matplotlib.font_manager.FontProperties(size=10))
+
+plt.show()
diff --git a/sklearn/svm/__init__.py b/sklearn/svm/__init__.py
index f5b4123230f93..34976e71e797a 100644
--- a/sklearn/svm/__init__.py
+++ b/sklearn/svm/__init__.py
@@ -10,7 +10,8 @@
 # of their respective owners.
 # License: BSD 3 clause (C) INRIA 2010

-from ._classes import SVC, NuSVC, SVR, NuSVR, OneClassSVM, LinearSVC, LinearSVR
+from ._classes import SVC, NuSVC, SVR, NuSVR, OneClassSVM, LinearSVC, \
+    LinearSVR, SVDD
 from ._bounds import l1_min_c

 __all__ = [
@@ -19,6 +20,7 @@
     "NuSVC",
     "NuSVR",
     "OneClassSVM",
+    "SVDD",
     "SVC",
     "SVR",
     "l1_min_c",
diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py
index 3fb213f5ea20d..c18589d9a14bc 100644
--- a/sklearn/svm/_base.py
+++ b/sklearn/svm/_base.py
@@ -27,7 +27,8 @@
 from ..exceptions import NotFittedError


-LIBSVM_IMPL = ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr"]
+LIBSVM_IMPL = ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr",
+               "svdd_l1"]


 def _one_vs_one_coef(dual_coef, n_support, support_vectors):
@@ -205,9 +206,9 @@ def fit(self, X, y, sample_weight=None):
         )
         solver_type = LIBSVM_IMPL.index(self._impl)

-        # input validation
+        # input validation: novelty detection models do not use 'y'
         n_samples = _num_samples(X)
-        if solver_type != 2 and n_samples != y.shape[0]:
+        if solver_type not in (2, 5) and n_samples != y.shape[0]:
             raise ValueError(
                 "X and y have incompatible shapes.\n"
                 + "X has %s samples, but y has %s." % (n_samples, y.shape[0])
diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index d1e59e7799b69..918e7f3f8a116 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -1523,9 +1523,12 @@ def _more_tags(self):


 class OneClassSVM(OutlierMixin, BaseLibSVM):
-    """Unsupervised Outlier Detection.
+    """One-Class SVM for Unsupervised Outlier Detection.
- Estimate the support of a high-dimensional distribution. + Estimate the support of a high-dimensional distribution by finding the + maximum margin soft boundary hyperplane separating a data set from the + origin. At most the fraction ``nu`` (``0 < nu <= 1``) of the data + are permitted to be outliers. The implementation is based on libsvm. @@ -1817,3 +1820,147 @@ def _more_tags(self): ), } } + + +class SVDD(BaseLibSVM): + """Support Vector Data Description (SVDD) for Unsupervised Outlier Detection. + + Estimate the support of a high-dimensional distribution by finding the + tightest soft boundary hypersphere around a data set, which permits at + most the fraction ``nu`` (``0 < nu <= 1``) of the data as outliers. + + The implementation is based on libsvm. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + kernel : string, optional (default='rbf') + Specifies the kernel type to be used in the algorithm. + It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' + or a callable. + If none is given, 'rbf' will be used. If a callable is given it is + used to precompute the kernel matrix. + + nu : float, optional + An upper bound on the fraction of training errors and a lower bound + of the fraction of support vectors. Should be in the interval (0, 1]. + By default 0.5 will be taken. + + degree : int, optional (default=3) + Degree of the polynomial kernel function ('poly'). + Ignored by all other kernels. + + gamma : float, optional (default='auto') + Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. + If gamma is 'auto' then 1/n_features will be used instead. + + coef0 : float, optional (default=0.0) + Independent term in kernel function. + It is only significant in 'poly' and 'sigmoid'. + + tol : float, optional + Tolerance for stopping criterion. + + shrinking : boolean, optional + Whether to use the shrinking heuristic. + + cache_size : float, optional + Specify the size of the kernel cache (in MB). + + verbose : bool, default: False + Enable verbose output. Note that this setting takes advantage of a + per-process runtime setting in libsvm that, if enabled, may not work + properly in a multithreaded context. + + max_iter : int, optional (default=-1) + Hard limit on iterations within solver, or -1 for no limit. + + random_state : int, RandomState instance or None, optional (default=None) + The seed of the pseudo random number generator to use when shuffling + the data. If int, random_state is the seed used by the random number + generator; If RandomState instance, random_state is the random number + generator; If None, the random number generator is the RandomState + instance used by `np.random`. + + Attributes + ---------- + support_ : array-like, shape = [n_SV] + Indices of support vectors. + + support_vectors_ : array-like, shape = [nSV, n_features] + Support vectors. + + dual_coef_ : array, shape = [n_classes-1, n_SV] + Coefficients of the support vectors in the decision function. + + coef_ : array, shape = [n_classes-1, n_features] + Weights assigned to the features (coefficients in the primal + problem). This is only available in the case of a linear kernel. + + `coef_` is readonly property derived from `dual_coef_` and + `support_vectors_` + + intercept_ : array, shape = [n_classes-1] + Constants in decision function. + + References + ---------- + .. [1] Tax, D.M. and Duin, R.P., 2004. "Support vector data + description." Machine learning, 54(1), pp.45-66. + doi:10.1023/B:MACH.0000008084.60811.49 + + .. [2] Chang, W.C., Lee, C.P. and Lin, C.J., 2013. 
"A revisit + to support vector data description (SVDD)." Technical + Report, Department of Computer Science, National Taiwan + University. + """ + def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0, + tol=1e-3, nu=0.5, shrinking=True, cache_size=200, + verbose=False, max_iter=-1, random_state=None): + super(SVDD, self).__init__( + 'svdd_l1', kernel, degree, gamma, coef0, tol, 0., nu, 0., + shrinking, False, cache_size, None, verbose, max_iter, + random_state) + + def fit(self, X, y=None, sample_weight=None, **params): + """Detects the soft minimum volume hypersphere around the sample X. + + Parameters + ---------- + X : {array-like, sparse matrix}, shape (n_samples, n_features) + Set of samples, where n_samples is the number of samples and + n_features is the number of features. + + sample_weight : array-like, shape (n_samples,) + Per-sample weights. Higher weights force the novelty detector + to put more emphasis on these points. + + Returns + ------- + self : object + Returns self. + + Notes + ----- + If X is not a C-ordered contiguous array it is copied. + + """ + super(SVDD, self).fit(X, np.ones(_num_samples(X)), + sample_weight=sample_weight, **params) + return self + + def decision_function(self, X): + """Distance of the samples X to the separating hyperplane. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + + Returns + ------- + X : array-like, shape (n_samples,) + Returns the decision function of the samples. + """ + dec = self._decision_function(X) + return dec diff --git a/sklearn/svm/_libsvm.pyx b/sklearn/svm/_libsvm.pyx index 89b36ddb3a813..4ca510c9dcdf3 100644 --- a/sklearn/svm/_libsvm.pyx +++ b/sklearn/svm/_libsvm.pyx @@ -74,9 +74,15 @@ def fit( Y : array, dtype=float64 of shape (n_samples,) target vector +<<<<<<< HEAD svm_type : {0, 1, 2, 3, 4}, default=0 Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR respectively. +======= + svm_type : {0, 1, 2, 3, 4, 5}, optional + Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR, NuSVR, or + SVDD-L1 respectively. 0 by default. +>>>>>>> ENH: nu-SVDD with sample weights, based on Chang, Lee, Lin (2013) kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" Kernel to use in the model: linear, polynomial, RBF, sigmoid @@ -608,8 +614,14 @@ def cross_validation( Y : array, dtype=float of shape (n_samples,) target vector +<<<<<<< HEAD n_fold : int32 Number of folds for cross validation. +======= + svm_type : {0, 1, 2, 3, 4, 5} + Type of SVM: C SVC, nu SVC, one class, epsilon SVR, nu SVR, + or SVDD-L1. 

diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp
index de07fecdba2ac..8becae88ece14 100644
--- a/sklearn/svm/src/libsvm/svm.cpp
+++ b/sklearn/svm/src/libsvm/svm.cpp
@@ -1838,6 +1838,118 @@ static void solve_nu_svr(
 	delete[] y;
 }

+static void solve_svdd_l1(
+    const PREFIX(problem) *prob, const svm_parameter *param,
+    double *alpha, Solver::SolutionInfo* si)
+{
+    int l = prob->l;
+    int i, j;
+
+    double r_square;
+
+    ONE_CLASS_Q Q = ONE_CLASS_Q(*prob, *param);
+
+    if(param->nu < 1) {
+        // case \nu < 1: the dual problem is
+        //   min 0.5(\alpha^T Q \alpha) + (-0.5 \nu W diag Q)^T \alpha
+        //     e^T \alpha = \nu W
+        //     0 <= alpha_i <= W_i
+        //   W = sum W_i
+
+        schar *ones = new schar[l];
+        double *QD = new double[l];
+        double *linear_term = new double[l];
+        double *C = new double[l];
+
+        double nu_W = 0;
+        for(i=0;i<l;i++)
+        {
+            C[i] = prob->W[i];
+            nu_W += C[i] * param->nu;
+        }
+
+        for(i=0;i<l;i++)
+        {
+            QD[i] = Q.get_QD()[i];
+            linear_term[i] = -0.5 * nu_W * QD[i];
+            ones[i] = 1;
+        }
+
+        // initialize alpha at a feasible point of the dual
+        double sum_alpha = nu_W;
+        i = 0;
+        while(sum_alpha > 0)
+        {
+            alpha[i] = min(C[i], sum_alpha);
+            sum_alpha -= alpha[i];
+            ++i;
+        }
+        for(;i<l;i++)
+            alpha[i] = 0;
+
+        Solver s;
+        s.Solve(l, Q, linear_term, ones, alpha, C, param->eps,
+                si, param->shrinking, param->max_iter);
+
+        // Compute R: the solver returns
+        //   obj = 0.5 \alpha^T Q \alpha - 0.5 \nu W sum_i K_{ii}*\alpha_i
+        //   rho = 0.5 \nu W (\alpha^T Q \alpha / (\nu W)^2 - R)
+        r_square = 2*(si->obj - nu_W * si->rho);
+        for(i=0;i<l;i++)
+            r_square += alpha[i] * QD[i] * nu_W;
+        r_square /= nu_W * nu_W;
+
+        delete[] C;
+        delete[] linear_term;
+        delete[] QD;
+        delete[] ones;
+    }
+    else {
+        // case \nu >= 1: then R = 0, and the SVDD-L1 problem is reduced to
+        // a quadratic problem with a unique solution independent of \nu.
+        // The centre of the sphere is the average of feature maps with weights W_i.
+
+        info("*\nSVDD-L1 solution independent of nu\n");
+
+        double sum_W = 0;
+        for(i=0;i<l;i++)
+        {
+            alpha[i] = prob->W[i];
+            si->upper_bound[i] = prob->W[i];
+            sum_W += prob->W[i];
+        }
+
+        // Simulate the run of the Solver by computing the objective
+        // and the intercept:
+        //   obj = 0.5 \alpha^T Q \alpha - 0.5 W sum_i K_{ii}*\alpha_i
+        //   rho = 0.5 \alpha^T Q \alpha / W
+        // note that \sum_i \alpha_i = W.
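+        // With alpha_i = W_i the centre of the sphere is the weighted
+        // mean of the feature maps, a = (1/W) sum_i W_i \phi(x_i), so
+        // \alpha^T Q \alpha = W^2 ||a||^2 and hence rho = 0.5 W ||a||^2.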
+        double rho = 0;
+        double obj = 0;
+        double sum;
+        for(i=0;i<l;i++)
+        {
+            const Qfloat *Q_i = Q.get_Q(i, l);
+            sum = 0;
+            for(j=0;j<l;j++)
+                sum += alpha[j] * Q_i[j];
+            rho += 0.5 * alpha[i] * sum;
+            obj -= 0.5 * sum_W * alpha[i] * Q.get_QD()[i];
+        }
+        si->obj = rho + obj;
+        si->rho = rho / sum_W;
+
+        si->solve_timed_out = false;
+
+        r_square = 0.0;
+    }
+
+    info("R^2 = %f\n",r_square);
+}
+
 //
 // decision_function
 //
@@ -1876,6 +1988,10 @@ static decision_function svm_train_one(
 			si.upper_bound = Malloc(double,2*prob->l);
 			solve_nu_svr(prob,param,alpha,&si,blas_functions);
 			break;
+		case SVDD_L1:
+			si.upper_bound = Malloc(double,prob->l);
+			solve_svdd_l1(prob,param,alpha,&si);
+			break;
 	}

 	*status |= si.solve_timed_out;
@@ -2377,9 +2493,10 @@ PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *p

 	if(param->svm_type == ONE_CLASS ||
 	   param->svm_type == EPSILON_SVR ||
-	   param->svm_type == NU_SVR)
+	   param->svm_type == NU_SVR ||
+	   param->svm_type == SVDD_L1)
 	{
-		// regression or one-class-svm
+		// regression or novelty detection
 		model->nr_class = 2;
 		model->label = NULL;
 		model->nSV = NULL;
@@ -2820,11 +2937,19 @@ double PREFIX(predict_values)(const PREFIX(model) *model, const PREFIX(node) *x,
 	int i;
 	if(model->param.svm_type == ONE_CLASS ||
 	   model->param.svm_type == EPSILON_SVR ||
-	   model->param.svm_type == NU_SVR)
+	   model->param.svm_type == NU_SVR ||
+	   model->param.svm_type == SVDD_L1)
 	{
 		double *sv_coef = model->sv_coef[0];
 		double sum = 0;
+
+		if(model->param.svm_type == SVDD_L1)
+		{
+			double K_xx = NAMESPACE::Kernel::k_function(x,x,model->param) / 2;
+			for(int i=0;i<model->l;i++)
+				sum -= sv_coef[i] * K_xx;
+		}
+
 		for(i=0;i<model->l;i++)
 #ifdef _DENSE_REP
 			sum += sv_coef[i] * NAMESPACE::Kernel::k_function(x,model->SV+i,model->param,blas_functions);
@@ -2834,7 +2959,8 @@ double PREFIX(predict_values)(const PREFIX(model) *model, const PREFIX(node) *x,
 		sum -= model->rho[0];
 		*dec_values = sum;

-		if(model->param.svm_type == ONE_CLASS)
+		if(model->param.svm_type == ONE_CLASS ||
+		   model->param.svm_type == SVDD_L1)
 			return (sum>0)?1:-1;
 		else
 			return sum;
@@ -2906,7 +3032,8 @@ double PREFIX(predict)(const PREFIX(model) *model, const PREFIX(node) *x, BlasFu
 	double *dec_values;
 	if(model->param.svm_type == ONE_CLASS ||
 	   model->param.svm_type == EPSILON_SVR ||
-	   model->param.svm_type == NU_SVR)
+	   model->param.svm_type == NU_SVR ||
+	   model->param.svm_type == SVDD_L1)
 		dec_values = Malloc(double, 1);
 	else
 		dec_values = Malloc(double, nr_class*(nr_class-1)/2);
@@ -3024,7 +3151,8 @@ const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_param
 	    svm_type != NU_SVC &&
 	    svm_type != ONE_CLASS &&
 	    svm_type != EPSILON_SVR &&
-	    svm_type != NU_SVR)
+	    svm_type != NU_SVR &&
+	    svm_type != SVDD_L1)
 		return "unknown svm type";

 	// kernel_type, degree
@@ -3059,7 +3187,8 @@ const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_param

 	if(svm_type == NU_SVC ||
 	   svm_type == ONE_CLASS ||
-	   svm_type == NU_SVR)
+	   svm_type == NU_SVR ||
+	   svm_type == SVDD_L1)
 		if(param->nu <= 0 || param->nu > 1)
 			return "nu <= 0 or nu > 1";

@@ -3076,7 +3205,7 @@ const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_param
 		return "probability != 0 and probability != 1";

 	if(param->probability == 1 &&
-	    svm_type == ONE_CLASS)
+	    (svm_type == ONE_CLASS || svm_type == SVDD_L1))
 		return "one-class SVM probability output not supported yet";


diff --git a/sklearn/svm/src/libsvm/svm.h b/sklearn/svm/src/libsvm/svm.h
index 518872c67bc5c..b4113d0ef24d2 100644
--- a/sklearn/svm/src/libsvm/svm.h
+++ b/sklearn/svm/src/libsvm/svm.h
@@ -40,7 +40,7 @@ struct svm_csr_problem
 };


-enum { C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR };	/* svm_type */
+enum { C_SVC, NU_SVC, ONE_CLASS, EPSILON_SVR, NU_SVR, SVDD_L1 };	/*
svm_type */ enum { LINEAR, POLY, RBF, SIGMOID, PRECOMPUTED }; /* kernel_type */ struct svm_parameter diff --git a/sklearn/svm/tests/test_sparse.py b/sklearn/svm/tests/test_sparse.py index 3bb6d0f268d07..5ffaf8f0af08c 100644 --- a/sklearn/svm/tests/test_sparse.py +++ b/sklearn/svm/tests/test_sparse.py @@ -335,6 +335,24 @@ def test_sparse_oneclasssvm(datasets_index, kernel): check_svm_model_equal(clf, sp_clf, *dataset) +def test_sparse_svdd(): + """Check that sparse SVDD gives the same result as dense SVDD + """ + # many class dataset: + X_blobs, _ = make_blobs(n_samples=100, centers=10, random_state=0) + X_blobs = sparse.csr_matrix(X_blobs) + + datasets = [[X_sp, None, T], [X2_sp, None, T2], + [X_blobs[:80], None, X_blobs[80:]], + [iris.data, None, iris.data]] + kernels = ["linear", "poly", "rbf", "sigmoid"] + for dataset in datasets: + for kernel in kernels: + clf = svm.SVDD(kernel=kernel, random_state=0) + sp_clf = svm.SVDD(kernel=kernel, random_state=0) + check_svm_model_equal(clf, sp_clf, *dataset) + + def test_sparse_realdata(): # Test on a subset from the 20newsgroups dataset. # This catches some bugs if input is not correctly converted into diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 9cc684d93ea71..b3b864826c546 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -362,6 +362,94 @@ def test_oneclass_fit_params_is_deprecated(): clf.fit(X, **params) +def test_svdd(): + # Test the output of libsvm for the SVDD problem with default parameters + clf = svm.SVDD() + clf.fit(X) + pred = clf.predict(T) + + assert_array_almost_equal(pred, [-1, -1, -1]) + assert_array_almost_equal(clf.intercept_, [0.491], decimal=3) + assert_array_almost_equal(clf.dual_coef_, + [[0.632, 0.233, 0.633, 0.234, 0.632, 0.633]], + decimal=3) + assert_false(hasattr(clf, "coef_")) + + +def test_svdd_decision_function(): + # For the RBF (stationary) kernel the SVDD and the OneClass SVM + # are identical. Therefore here the test is run on a non-stationary + # kernel. 
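+    # For the polynomial kernel K(x, x) = (gamma * <x, x> + coef0)**degree
+    # varies with x, so the -K(x, x)/2 correction term in the SVDD decision
+    # function is not a constant offset of the One-Class SVM score.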
+ + # Test SVDD decision function + rnd = check_random_state(2) + + # Generate train data + X = 0.3 * rnd.randn(100, 2) + X_train = np.r_[X + 2, X - 2] + + # Generate some regular novel observations + X = 0.3 * rnd.randn(20, 2) + X_test = np.r_[X + 2, X - 2] + + # Generate some abnormal novel observations + X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2)) + + # fit the model + clf = svm.SVDD(nu=0.1, kernel="poly", degree=2, coef0=1.0).fit(X_train) + + # predict and validate things + y_pred_test = clf.predict(X_test) + assert_greater(np.mean(y_pred_test == 1), .9) + + y_pred_outliers = clf.predict(X_outliers) + assert_greater(np.mean(y_pred_outliers == -1), .8) + + dec_func_test = clf.decision_function(X_test) + assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1) + + dec_func_outliers = clf.decision_function(X_outliers) + assert_array_equal((dec_func_outliers > 0).ravel(), y_pred_outliers == 1) + + +def test_oneclass_and_svdd(): + # Generate a sample: two symmetrically placed clusters + rnd = check_random_state(2) + + X = 0.3 * rnd.randn(100, 2) + X_train = np.r_[X + 2, X - 2] + + # Test the output of libsvm for the SVDD and the One-Class SVM + nu = 0.15 + + svdd = svm.SVDD(nu=nu, kernel="rbf") + svdd.fit(X_train) + + ocsvm = svm.OneClassSVM(nu=nu, kernel="rbf") + ocsvm.fit(X_train) + + # The intercept of the SVDD differs from that of the One-Class SVM: + # `rho_svdd = (aTQa * (nu * l)^(-2) - R) * (nu * l) / 2` , + # and + # `rho_oc = (C0 + aTQa * (nu * l)^(-2) - R) * (nu * l) / 2` , + # since `R = C0 - 2 rho_oc / (nu l) + aTQa * (nu l)^(-2)`, + # where `C0 = K(x,x) = K(x-x)` for a stationary K. + # >>> The intercept_ value is negative rho! + # For the RBF kernel: K(x,y) = exp(-theta * |x-y|^2), the C0 is 1. + C0 = 1.0 + svdd_intercept = (2 * ocsvm.intercept_ + C0 * (nu * X_train.shape[0])) / 2 + assert_array_almost_equal(svdd.intercept_, svdd_intercept, decimal=3) + + # Evaluate the decision function on a uniformly spaced 2-d mesh + xx, yy = np.meshgrid(np.linspace(-5, 5, num=101), + np.linspace(-5, 5, num=101)) + mesh = np.c_[xx.ravel(), yy.ravel()] + + svdd_df = svdd.decision_function(mesh) + ocsvm_df = ocsvm.decision_function(mesh) + assert_array_almost_equal(svdd_df, ocsvm_df) + + def test_tweak_params(): # Make sure some tweaking of parameters works. # We change clf.dual_coef_ at run time and expect .predict() to change @@ -969,6 +1057,7 @@ def test_immutable_coef_property(): svm.SVR(kernel="linear").fit(iris.data, iris.target), svm.NuSVR(kernel="linear").fit(iris.data, iris.target), svm.OneClassSVM(kernel="linear").fit(iris.data), + svm.SVDD(kernel='linear').fit(iris.data), ] for clf in svms: with pytest.raises(AttributeError): From 6d373a62eb8bda0ac129e9b5d97979585f195f3b Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 27 Dec 2016 11:28:02 +0300 Subject: [PATCH 02/41] a Whatsnew entry and a minor comment fix in base.py --- doc/whats_new.rst | 5782 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 5782 insertions(+) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 3354a6b13f32b..a2e79cb930838 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -28,3 +28,5785 @@ on libraries.io to be notified when new versions are released. Version 0.14 Version 0.13 Older Versions + +Version 0.20 (under development) +================================ + +Changed models +-------------- + +The following estimators and functions, when fit with the same data and +parameters, may produce different models from the previous version. 
This often +occurs due to changes in the modelling logic (bug fixes or enhancements), or in +random sampling procedures. + +- :class:`decomposition.IncrementalPCA` in Python 2 (bug fix) + +Details are listed in the changelog below. + +(While we are trying to better inform users by providing this information, we +cannot assure that this list is complete.) + +Changelog +--------- + +New features +............ + +Classifiers and regressors + +- :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor` now support early stopping + via ``n_iter_no_change``, ``validation_fraction`` and ``tol``. :issue:`7071` + by `Raghav RV`_ + +- Added :class:`naive_bayes.ComplementNB`, which implements the Complement + Naive Bayes classifier described in Rennie et al. (2003). + By :user:`Michael A. Alcorn `. + +Enhancements +............ + +Model evaluation and meta-estimators + +- A scorer based on :func:`metrics.brier_score_loss` is also available. + :issue:`9521` by :user:`Hanmin Qin `. + +Bug fixes +......... + +Decomposition, manifold learning and clustering + +- Fix for uninformative error in :class:`decomposition.incremental_pca`: + now an error is raised if the number of components is larger than the + chosen batch size. The ``n_components=None`` case was adapted accordingly. + :issue:`6452`. By :user:`Wally Gauze `. + +- Fixed a bug where the ``partial_fit`` method of + :class:`decomposition.IncrementalPCA` used integer division instead of float + division on Python 2 versions. :issue:`9492` by + :user:`James Bourbeau `. + +Version 0.19 +============ + +**Release Candidate (0.19b2) July 17, 2017** + +Highlights +---------- + +We are excited to release a number of great new features including +:class:`neighbors.LocalOutlierFactor` for anomaly detection, +:class:`preprocessing.QuantileTransformer` for robust feature transformation, +and the :class:`multioutput.ClassifierChain` meta-estimator to simply account +for dependencies between classes in multilabel problems. We have some new +algorithms in existing estimators, such as multiplicative update in +:class:`decomposition.NMF` and multinomial +:class:`linear_model.LogisticRegression` with L1 loss (use ``solver='saga'``). + +Cross validation is now able to return the results from multiple metric +evaluations. The new :func:`model_selection.cross_validate` can return many +scores on the test data as well as training set performance and timings, and we +have extended the ``scoring`` and ``refit`` parameters for grid/randomized +search :ref:`to handle multiple metrics `. + +You can also learn faster. For instance, the :ref:`new option to cache +transformations ` in :class:`pipeline.Pipeline` makes grid +search over pipelines including slow transformations much more efficient. And +you can predict faster: if you're sure you know what you're doing, you can turn +off validating that the input is finite using :func:`config_context`. + +We've made some important fixes too. We've fixed a longstanding implementation +error in :func:`metrics.average_precision_score`, so please be cautious with +prior results reported from that function. A number of errors in the +:class:`manifold.TSNE` implementation have been fixed, particularly in the +default Barnes-Hut approximation. :class:`semi_supervised.LabelSpreading` and +:class:`semi_supervised.LabelPropagation` have had substantial fixes. +LabelPropagation was previously broken. LabelSpreading should now correctly +respect its alpha parameter. 
+ +Changed models +-------------- + +The following estimators and functions, when fit with the same data and +parameters, may produce different models from the previous version. This often +occurs due to changes in the modelling logic (bug fixes or enhancements), or in +random sampling procedures. + +- :class:`cluster.KMeans` with sparse X and initial centroids given (bug fix) +- :class:`cross_decomposition.PLSRegression` + with ``scale=True`` (bug fix) +- :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor` where ``min_impurity_split`` is used (bug fix) +- gradient boosting ``loss='quantile'`` (bug fix) +- :class:`ensemble.IsolationForest` (bug fix) +- :class:`feature_selection.SelectFdr` (bug fix) +- :class:`linear_model.RANSACRegressor` (bug fix) +- :class:`linear_model.LassoLars` (bug fix) +- :class:`linear_model.LassoLarsIC` (bug fix) +- :class:`manifold.TSNE` (bug fix) +- :class:`neighbors.NearestCentroid` (bug fix) +- :class:`semi_supervised.LabelSpreading` (bug fix) +- :class:`semi_supervised.LabelPropagation` (bug fix) +- tree based models where ``min_weight_fraction_leaf`` is used (enhancement) + +Details are listed in the changelog below. + +(While we are trying to better inform users by providing this information, we +cannot assure that this list is complete.) + +Changelog +--------- + +New features +............ + +Classifiers and regressors + +- Added :class:`multioutput.ClassifierChain` for multi-label + classification. By `Adam Kleczewski `_. + +- Added solver ``'saga'`` that implements the improved version of Stochastic + Average Gradient, in :class:`linear_model.LogisticRegression` and + :class:`linear_model.Ridge`. It allows the use of L1 penalty with + multinomial logistic loss, and behaves marginally better than 'sag' + during the first epochs of ridge and logistic regression. + :issue:`8446` by `Arthur Mensch`_. + +Other estimators + +- Added the :class:`neighbors.LocalOutlierFactor` class for anomaly + detection based on nearest neighbors. + :issue:`5279` by `Nicolas Goix`_ and `Alexandre Gramfort`_. + +- Added :class:`preprocessing.QuantileTransformer` class and + :func:`preprocessing.quantile_transform` function for features + normalization based on quantiles. + :issue:`8363` by :user:`Denis Engemann `, + :user:`Guillaume Lemaitre `, `Olivier Grisel`_, `Raghav RV`_, + :user:`Thierry Guillemot `, and `Gael Varoquaux`_. + +- The new solver ``'mu'`` implements a Multiplicate Update in + :class:`decomposition.NMF`, allowing the optimization of all + beta-divergences, including the Frobenius norm, the generalized + Kullback-Leibler divergence and the Itakura-Saito divergence. + :issue:`5295` by `Tom Dupre la Tour`_. + +Model selection and evaluation + +- :class:`model_selection.GridSearchCV` and + :class:`model_selection.RandomizedSearchCV` now support simultaneous + evaluation of multiple metrics. Refer to the + :ref:`multimetric_grid_search` section of the user guide for more + information. :issue:`7388` by `Raghav RV`_ + +- Added the :func:`model_selection.cross_validate` which allows evaluation + of multiple metrics. This function returns a dict with more useful + information from cross-validation such as the train scores, fit times and + score times. + Refer to :ref:`multimetric_cross_validation` section of the userguide + for more information. 
:issue:`7388` by `Raghav RV`_

+- Added :func:`metrics.mean_squared_log_error`, which computes
+  the mean square error of the logarithmic transformation of targets,
+  particularly useful for targets with an exponential trend.
+  :issue:`7655` by :user:`Karan Desai `.
+
+- Added :func:`metrics.dcg_score` and :func:`metrics.ndcg_score`, which
+  compute Discounted cumulative gain (DCG) and Normalized discounted
+  cumulative gain (NDCG).
+  :issue:`7739` by :user:`David Gasquez `.
+
+- Added the :class:`model_selection.RepeatedKFold` and
+  :class:`model_selection.RepeatedStratifiedKFold`.
+  :issue:`8120` by `Neeraj Gangwar`_.
+
+- Added the :class:`svm.SVDD` class for novelty detection based on a
+  soft minimal volume hypersphere around the sample data.
+  By `Ivan Nazarov`_.
+
+Miscellaneous
+
+- Validation that input data contains no NaN or inf can now be suppressed
+  using :func:`config_context`, at your own risk. This will save on runtime,
+  and may be particularly useful for prediction time. :issue:`7548` by
+  `Joel Nothman`_.
+
+- Added a test to ensure parameter listing in docstrings matches the
+  function/class signature. :issue:`9206` by `Alexandre Gramfort`_ and
+  `Raghav RV`_.
+
+Enhancements
+............
+
+Trees and ensembles
+
+- The ``min_weight_fraction_leaf`` constraint in tree construction is now
+  more efficient, taking a fast path to declare a node a leaf if its weight
+  is less than 2 * the minimum. Note that the constructed tree will be
+  different from previous versions where ``min_weight_fraction_leaf`` is
+  used. :issue:`7441` by :user:`Nelson Liu `.
+
+- :class:`ensemble.GradientBoostingClassifier` and :class:`ensemble.GradientBoostingRegressor`
+  now support sparse input for prediction.
+  :issue:`6101` by :user:`Ibraim Ganiev `.
+
+- :class:`ensemble.VotingClassifier` now allows changing estimators by using
+  :meth:`ensemble.VotingClassifier.set_params`. An estimator can also be
+  removed by setting it to ``None``.
+  :issue:`7674` by :user:`Yichuan Liu `.
+
+- :func:`tree.export_graphviz` now shows configurable number of decimal
+  places. :issue:`8698` by :user:`Guillaume Lemaitre `.
+
+- Added ``flatten_transform`` parameter to :class:`ensemble.VotingClassifier`
+  to change output shape of `transform` method to 2 dimensional.
+ :issue:`7794` by :user:`Ibraim Ganiev ` and + :user:`Herilalaina Rakotoarison `. + +Linear, kernelized and related models + +- :class:`linear_model.SGDClassifier`, :class:`linear_model.SGDRegressor`, + :class:`linear_model.PassiveAggressiveClassifier`, + :class:`linear_model.PassiveAggressiveRegressor` and + :class:`linear_model.Perceptron` now expose ``max_iter`` and + ``tol`` parameters, to handle convergence more precisely. + ``n_iter`` parameter is deprecated, and the fitted estimator exposes + a ``n_iter_`` attribute, with actual number of iterations before + convergence. :issue:`5036` by `Tom Dupre la Tour`_. + +- Added ``average`` parameter to perform weight averaging in + :class:`linear_model.PassiveAggressiveClassifier`. :issue:`4939` + by :user:`Andrea Esuli `. + +- :class:`linear_model.RANSACRegressor` no longer throws an error + when calling ``fit`` if no inliers are found in its first iteration. + Furthermore, causes of skipped iterations are tracked in newly added + attributes, ``n_skips_*``. + :issue:`7914` by :user:`Michael Horrell `. + +- In :class:`gaussian_process.GaussianProcessRegressor`, method ``predict`` + is a lot faster with ``return_std=True``. :issue:`8591` by + :user:`Hadrien Bertrand `. + +- Added ``return_std`` to ``predict`` method of + :class:`linear_model.ARDRegression` and + :class:`linear_model.BayesianRidge`. + :issue:`7838` by :user:`Sergey Feldman `. + +- Memory usage enhancements: Prevent cast from float32 to float64 in: + :class:`linear_model.MultiTaskElasticNet`; + :class:`linear_model.LogisticRegression` when using newton-cg solver; and + :class:`linear_model.Ridge` when using svd, sparse_cg, cholesky or lsqr + solvers. :issue:`8835`, :issue:`8061` by :user:`Joan Massich ` and :user:`Nicolas + Cordier ` and :user:`Thierry Guillemot `. + +Other predictors + +- Custom metrics for the :mod:`neighbors` binary trees now have + fewer constraints: they must take two 1d-arrays and return a float. + :issue:`6288` by `Jake Vanderplas`_. + +- ``algorithm='auto`` in :mod:`neighbors` estimators now chooses the most + appropriate algorithm for all input types and metrics. :issue:`9145` by + :user:`Herilalaina Rakotoarison ` and :user:`Reddy Chinthala + `. + +Decomposition, manifold learning and clustering + +- :class:`cluster.MiniBatchKMeans` and :class:`cluster.KMeans` + now use significantly less memory when assigning data points to their + nearest cluster center. :issue:`7721` by :user:`Jon Crall `. + +- :class:`decomposition.PCA`, :class:`decomposition.IncrementalPCA` and + :class:`decomposition.TruncatedSVD` now expose the singular values + from the underlying SVD. They are stored in the attribute + ``singular_values_``, like in :class:`decomposition.IncrementalPCA`. + :issue:`7685` by :user:`Tommy Löfstedt ` + +- :class:`decomposition.NMF` now faster when ``beta_loss=0``. + :issue:`9277` by :user:`hongkahjun`. + +- Memory improvements for method ``barnes_hut`` in :class:`manifold.TSNE` + :issue:`7089` by :user:`Thomas Moreau ` and `Olivier Grisel`_. + +- Optimization schedule improvements for Barnes-Hut :class:`manifold.TSNE` + so the results are closer to the one from the reference implementation + `lvdmaaten/bhtsne `_ by :user:`Thomas + Moreau ` and `Olivier Grisel`_. + +- Memory usage enhancements: Prevent cast from float32 to float64 in + :class:`decomposition.PCA` and + :func:`decomposition.randomized_svd_low_rank`. + :issue:`9067` by `Raghav RV`_. 
+ +Preprocessing and feature selection + +- Added ``norm_order`` parameter to :class:`feature_selection.SelectFromModel` + to enable selection of the norm order when ``coef_`` is more than 1D. + :issue:`6181` by :user:`Antoine Wendlinger `. + +- Added ability to use sparse matrices in :func:`feature_selection.f_regression` + with ``center=True``. :issue:`8065` by :user:`Daniel LeJeune `. + +- Small performance improvement to n-gram creation in + :mod:`feature_extraction.text` by binding methods for loops and + special-casing unigrams. :issue:`7567` by :user:`Jaye Doepke ` + +- Relax assumption on the data for the + :class:`kernel_approximation.SkewedChi2Sampler`. Since the Skewed-Chi2 + kernel is defined on the open interval :math:`(-skewedness; +\infty)^d`, + the transform function should not check whether ``X < 0`` but whether ``X < + -self.skewedness``. :issue:`7573` by :user:`Romain Brault `. + +- Made default kernel parameters kernel-dependent in + :class:`kernel_approximation.Nystroem`. + :issue:`5229` by :user:`Saurabh Bansod ` and `Andreas Müller`_. + +Model evaluation and meta-estimators + +- :class:`pipeline.Pipeline` is now able to cache transformers + within a pipeline by using the ``memory`` constructor parameter. + :issue:`7990` by :user:`Guillaume Lemaitre `. + +- :class:`pipeline.Pipeline` steps can now be accessed as attributes of its + ``named_steps`` attribute. :issue:`8586` by :user:`Herilalaina + Rakotoarison `. + +- Added ``sample_weight`` parameter to :meth:`pipeline.Pipeline.score`. + :issue:`7723` by :user:`Mikhail Korobov `. + +- Added ability to set ``n_jobs`` parameter to :func:`pipeline.make_union`. + A ``TypeError`` will be raised for any other kwargs. :issue:`8028` + by :user:`Alexander Booth `. + +- :class:`model_selection.GridSearchCV`, + :class:`model_selection.RandomizedSearchCV` and + :func:`model_selection.cross_val_score` now allow estimators with callable + kernels which were previously prohibited. + :issue:`8005` by `Andreas Müller`_ . + +- :func:`model_selection.cross_val_predict` now returns output of the + correct shape for all values of the argument ``method``. + :issue:`7863` by :user:`Aman Dalmia `. + +- Added ``shuffle`` and ``random_state`` parameters to shuffle training + data before taking prefixes of it based on training sizes in + :func:`model_selection.learning_curve`. + :issue:`7506` by :user:`Narine Kokhlikyan `. + +- :class:`model_selection.StratifiedShuffleSplit` now works with multioutput + multiclass (or multilabel) data. :issue:`9044` by `Vlad Niculae`_. + +- Speed improvements to :class:`model_selection.StratifiedShuffleSplit`. + :issue:`5991` by :user:`Arthur Mensch ` and `Joel Nothman`_. + +- Add ``shuffle`` parameter to :func:`model_selection.train_test_split`. + :issue:`8845` by :user:`themrmax ` + +- :class:`multioutput.MultiOutputRegressor` and :class:`multioutput.MultiOutputClassifier` + now support online learning using ``partial_fit``. + :issue: `8053` by :user:`Peng Yu `. + +- Add ``max_train_size`` parameter to :class:`model_selection.TimeSeriesSplit` + :issue:`8282` by :user:`Aman Dalmia `. + +- More clustering metrics are now available through :func:`metrics.get_scorer` + and ``scoring`` parameters. :issue:`8117` by `Raghav RV`_. + +- A scorer based on :func:`metrics.explained_variance_score` is also available. + :issue:`9259` by :user:`Hanmin Qin `. + +Metrics + +- :func:`metrics.matthews_corrcoef` now support multiclass classification. + :issue:`8094` by :user:`Jon Crall `. 
+ +- Add ``sample_weight`` parameter to :func:`metrics.cohen_kappa_score`. + :issue:`8335` by :user:`Victor Poughon `. + +Miscellaneous + +- :func:`utils.check_estimator` now attempts to ensure that methods + transform, predict, etc. do not set attributes on the estimator. + :issue:`7533` by :user:`Ekaterina Krivich `. + +- Added type checking to the ``accept_sparse`` parameter in + :mod:`utils.validation` methods. This parameter now accepts only boolean, + string, or list/tuple of strings. ``accept_sparse=None`` is deprecated and + should be replaced by ``accept_sparse=False``. + :issue:`7880` by :user:`Josh Karnofsky `. + +- Make it possible to load a chunk of an svmlight formatted file by + passing a range of bytes to :func:`datasets.load_svmlight_file`. + :issue:`935` by :user:`Olivier Grisel `. + +- :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor` + now accept non-finite features. :issue:`8931` by :user:`Attractadore`. + +Bug fixes +......... + +Trees and ensembles + +- Fixed a memory leak in trees when using trees with ``criterion='mae'``. + :issue:`8002` by `Raghav RV`_. + +- Fixed a bug where :class:`ensemble.IsolationForest` uses an + an incorrect formula for the average path length + :issue:`8549` by `Peter Wang `_. + +- Fixed a bug where :class:`ensemble.AdaBoostClassifier` throws + ``ZeroDivisionError`` while fitting data with single class labels. + :issue:`7501` by :user:`Dominik Krzeminski `. + +- Fixed a bug in :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor` where a float being compared + to ``0.0`` using ``==`` caused a divide by zero error. :issue:`7970` by + :user:`He Chen `. + +- Fix a bug where :class:`ensemble.GradientBoostingClassifier` and + :class:`ensemble.GradientBoostingRegressor` ignored the + ``min_impurity_split`` parameter. + :issue:`8006` by :user:`Sebastian Pölsterl `. + +- Fixed ``oob_score`` in :class:`ensemble.BaggingClassifier`. + :issue:`8936` by :user:`Michael Lewis ` + +- Fixed excessive memory usage in prediction for random forests estimators. + :issue:`8672` by :user:`Mike Benfield `. + +- Fixed a bug where ``sample_weight`` as a list broke random forests in Python 2 + :issue:`8068` by :user:`xor`. + +- Fixed a bug where :class:`ensemble.IsolationForest` fails when + ``max_features`` is less than 1. + :issue:`5732` by :user:`Ishank Gulati `. + +- Fix a bug where gradient boosting with ``loss='quantile'`` computed + negative errors for negative values of ``ytrue - ypred`` leading to wrong + values when calling ``__call__``. + :issue:`8087` by :user:`Alexis Mignon ` + +- Fix a bug where :class:`ensemble.VotingClassifier` raises an error + when a numpy array is passed in for weights. :issue:`7983` by + :user:`Vincent Pham `. + +- Fixed a bug where :func:`tree.export_graphviz` raised an error + when the length of features_names does not match n_features in the decision + tree. :issue:`8512` by :user:`Li Li `. + +Linear, kernelized and related models + +- Fixed a bug where :func:`linear_model.RANSACRegressor.fit` may run until + ``max_iter`` if it finds a large inlier group early. :issue:`8251` by + :user:`aivision2020`. + +- Fixed a bug where :class:`naive_bayes.MultinomialNB` and + :class:`naive_bayes.BernoulliNB` failed when ``alpha=0``. :issue:`5814` by + :user:`Yichuan Liu ` and :user:`Herilalaina Rakotoarison + `. + +- Fixed a bug where :class:`linear_model.LassoLars` does not give + the same result as the LassoLars implementation available + in R (lars library). 
+  :issue:`7849` by :user:`Jair Montoya Martinez `.
+
+- Fixed a bug in :class:`linear_model.RandomizedLasso`,
+  :class:`linear_model.Lars`, :class:`linear_model.LassoLars`,
+  :class:`linear_model.LarsCV` and :class:`linear_model.LassoLarsCV`,
+  where the parameter ``precompute`` was not used consistently across
+  classes, and some values proposed in the docstring could raise errors.
+  :issue:`5359` by `Tom Dupre la Tour`_.
+
+- Fixed inconsistent results between :class:`linear_model.RidgeCV` and
+  :class:`linear_model.Ridge` when using ``normalize=True``. :issue:`9302`
+  by `Alexandre Gramfort`_.
+
+- Fixed a bug where :func:`linear_model.LassoLars.fit` sometimes
+  left ``coef_`` as a list, rather than an ndarray.
+  :issue:`8160` by :user:`CJ Carey `.
+
+- Fixed :func:`linear_model.BayesianRidge.fit` to return the ridge
+  parameters ``alpha_`` and ``lambda_`` consistent with the calculated
+  coefficients ``coef_`` and ``intercept_``.
+  :issue:`8224` by :user:`Peter Gedeck `.
+
+- Fixed a bug in :class:`svm.OneClassSVM` where it returned floats instead of
+  integer classes. :issue:`8676` by :user:`Vathsala Achar `.
+
+- Fixed AIC/BIC criterion computation in :class:`linear_model.LassoLarsIC`.
+  :issue:`9022` by `Alexandre Gramfort`_ and :user:`Mehmet Basbug `.
+
+- Fixed a memory leak in our LibLinear implementation. :issue:`9024` by
+  :user:`Sergei Lebedev `.
+
+- Fixed a bug where stratified CV splitters did not work with
+  :class:`linear_model.LassoCV`. :issue:`8973` by
+  :user:`Paulo Haddad `.
+
+- Fixed a bug in :class:`gaussian_process.GaussianProcessRegressor`
+  where predicting the standard deviation or covariance without a prior
+  fit failed with an uninformative error by default.
+  :issue:`6573` by :user:`Quazi Marufur Rahman ` and
+  `Manoj Kumar`_.
+
+Other predictors
+
+- Fixed :class:`semi_supervised.BaseLabelPropagation` to correctly implement
+  ``LabelPropagation`` and ``LabelSpreading`` as done in the referenced
+  papers. :issue:`9239`
+  by :user:`Andre Ambrosio Boechat `, :user:`Utkarsh Upadhyay
+  `, and `Joel Nothman`_.
+
+Decomposition, manifold learning and clustering
+
+- Fixed the implementation of :class:`manifold.TSNE`:
+
+  - The ``early_exaggeration`` parameter had no effect and is now used for
+    the first 250 optimization iterations.
+
+  - Fixed the ``AssertionError: Tree consistency failed`` exception
+    reported in :issue:`8992`.
+
+  - Improved the learning schedule to match the one from the reference
+    implementation `lvdmaaten/bhtsne `_.
+
+  by :user:`Thomas Moreau ` and `Olivier Grisel`_.
+
+- Fixed a bug in :class:`decomposition.LatentDirichletAllocation`
+  where the ``perplexity`` method was returning incorrect results because
+  the ``transform`` method returns normalized document topic distributions
+  as of version 0.18. :issue:`7954` by :user:`Gary Foreman `.
+
+- Fixed output shape and bugs with ``n_jobs > 1`` in
+  :class:`decomposition.SparseCoder` transform and
+  :func:`decomposition.sparse_encode`
+  for one-dimensional data and one component.
+  This also impacts the output shape of :class:`decomposition.DictionaryLearning`.
+  :issue:`8086` by `Andreas Müller`_.
+
+- Fixed the implementation of ``explained_variance_``
+  in :class:`decomposition.PCA`,
+  :class:`decomposition.RandomizedPCA` and
+  :class:`decomposition.IncrementalPCA`.
+  :issue:`9105` by `Hanmin Qin `_.
+
+- Fixed the implementation of ``noise_variance_`` in
+  :class:`decomposition.PCA`. :issue:`9108` by `Hanmin Qin `_.
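+
+  A minimal sketch of inspecting the corrected attribute (the value is
+  data-dependent, so it is not shown here)::
+
+      >>> import numpy as np
+      >>> from sklearn.decomposition import PCA
+      >>> X = np.random.RandomState(0).randn(100, 5)
+      >>> pca = PCA(n_components=2).fit(X)
+      >>> noise = pca.noise_variance_  # average variance of discarded components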
+
+- Fixed a bug where :class:`cluster.DBSCAN` gave an incorrect
+  result when the input was a precomputed sparse matrix with initial
+  rows all zero. :issue:`8306` by :user:`Akshay Gupta `.
+
+- Fixed a bug regarding fitting :class:`cluster.KMeans` with a sparse
+  array ``X`` and initial centroids, where ``X``'s means were unnecessarily
+  being subtracted from the centroids. :issue:`7872` by :user:`Josh Karnofsky `.
+
+- Fixes to the input validation in :class:`covariance.EllipticEnvelope`.
+  :issue:`8086` by `Andreas Müller`_.
+
+- Fixed a bug in :class:`covariance.MinCovDet` where inputting data
+  that produced a singular covariance matrix would cause the helper method
+  ``_c_step`` to throw an exception.
+  :issue:`3367` by :user:`Jeremy Steward `.
+
+- Fixed a bug in :class:`manifold.TSNE` affecting convergence of the
+  gradient descent. :issue:`8768` by :user:`David DeTomaso `.
+
+- Fixed a bug in :class:`manifold.TSNE` where it stored the incorrect
+  ``kl_divergence_``. :issue:`6507` by :user:`Sebastian Saeger `.
+
+- Fixed improper scaling in :class:`cross_decomposition.PLSRegression`
+  with ``scale=True``. :issue:`7819` by :user:`jayzed82 `.
+
+- The ``fit`` methods of :class:`cluster.bicluster.SpectralCoclustering` and
+  :class:`cluster.bicluster.SpectralBiclustering` now conform to the API by
+  accepting ``y`` and returning the object. :issue:`6126`,
+  :issue:`7814` by :user:`Laurent Direr ` and :user:`Maniteja
+  Nandana `.
+
+- Fixed a bug where :mod:`mixture` ``sample`` methods did not return as many
+  samples as requested. :issue:`7702` by :user:`Levi John Wolf `.
+
+- Fixed the shrinkage implementation in :class:`neighbors.NearestCentroid`.
+  :issue:`9219` by `Hanmin Qin `_.
+
+Preprocessing and feature selection
+
+- For sparse matrices, :func:`preprocessing.normalize` with
+  ``return_norm=True`` now raises a ``NotImplementedError`` for the ``'l1'``
+  and ``'l2'`` norms; for the ``'max'`` norm the returned norms are now the
+  same as for dense matrices. :issue:`7771` by `Ang Lu `_.
+
+- Fixed a bug where :class:`feature_selection.SelectFdr` did not
+  exactly implement the Benjamini-Hochberg procedure. It formerly may have
+  selected fewer features than it should.
+  :issue:`7490` by :user:`Peng Meng `.
+
+- Fixed a bug where :class:`linear_model.RandomizedLasso` and
+  :class:`linear_model.RandomizedLogisticRegression` broke for
+  sparse input. :issue:`8259` by :user:`Aman Dalmia `.
+
+- Fixed a bug where :class:`feature_extraction.FeatureHasher`
+  mandatorily applied a sparse random projection to the hashed features,
+  preventing the use of
+  :class:`feature_extraction.text.HashingVectorizer` in a
+  pipeline with :class:`feature_extraction.text.TfidfTransformer`.
+  :issue:`7565` by :user:`Roman Yurchak `.
+
+- Fixed a bug where :func:`feature_selection.mutual_info_regression` did not
+  correctly use ``n_neighbors``. :issue:`8181` by :user:`Guillaume Lemaitre
+  `.
+
+Model evaluation and meta-estimators
+
+- Fixed a bug where :func:`model_selection.BaseSearchCV.inverse_transform`
+  returned ``self.best_estimator_.transform()`` instead of
+  ``self.best_estimator_.inverse_transform()``.
+  :issue:`8344` by :user:`Akshay Gupta ` and :user:`Rasmus Eriksson `.
+
+- Added ``classes_`` attribute to :class:`model_selection.GridSearchCV`,
+  :class:`model_selection.RandomizedSearchCV`, :class:`grid_search.GridSearchCV`,
+  and :class:`grid_search.RandomizedSearchCV` that matches the ``classes_``
+  attribute of ``best_estimator_``.
+  :issue:`7661` and :issue:`8295`
+  by :user:`Alyssa Batula `, :user:`Dylan Werner-Meier `,
+  and :user:`Stephen Hoover `.
+
+- Fixed a bug where :func:`model_selection.validation_curve`
+  reused the same estimator for each parameter value.
+  :issue:`7365` by :user:`Aleksandr Sandrovskii `.
+
+- :func:`model_selection.permutation_test_score` now works with Pandas
+  types. :issue:`5697` by :user:`Stijn Tonk `.
+
+- Several fixes to input validation in
+  :class:`multiclass.OutputCodeClassifier`.
+  :issue:`8086` by `Andreas Müller`_.
+
+- :class:`multiclass.OneVsOneClassifier`'s ``partial_fit`` now ensures all
+  classes are provided up-front. :issue:`6250` by
+  :user:`Asish Panda `.
+
+- Fixed :func:`multioutput.MultiOutputClassifier.predict_proba` to return a
+  list of 2d arrays, rather than a 3d array. In the case where different
+  target columns had different numbers of classes, a ``ValueError`` would be
+  raised on trying to stack matrices with different dimensions.
+  :issue:`8093` by :user:`Peter Bull `.
+
+- Cross validation now works with Pandas datatypes that have a
+  read-only index. :issue:`9507` by `Loic Esteve`_.
+
+Metrics
+
+- :func:`metrics.average_precision_score` no longer linearly
+  interpolates between operating points, and instead weighs precisions
+  by the change in recall since the last operating point, as per the
+  `Wikipedia entry `_.
+  (`#7356 `_). By
+  :user:`Nick Dingwall ` and `Gael Varoquaux`_.
+
+- Fixed a bug in :func:`metrics.classification._check_targets`
+  which would return ``'binary'`` if ``y_true`` and ``y_pred`` were
+  both ``'binary'`` but the union of ``y_true`` and ``y_pred`` was
+  ``'multiclass'``. :issue:`8377` by `Loic Esteve`_.
+
+- Fixed an integer overflow bug in :func:`metrics.confusion_matrix` and
+  hence :func:`metrics.cohen_kappa_score`. :issue:`8354`, :issue:`7929`
+  by `Joel Nothman`_ and :user:`Jon Crall `.
+
+- Fixed passing of the ``gamma`` parameter to the ``chi2`` kernel in
+  :func:`metrics.pairwise.pairwise_kernels`. :issue:`5211` by
+  :user:`Nick Rhinehart `,
+  :user:`Saurabh Bansod ` and `Andreas Müller`_.
+
+Miscellaneous
+
+- Fixed a bug where :func:`datasets.make_classification` failed
+  when generating more than 30 features. :issue:`8159` by
+  :user:`Herilalaina Rakotoarison `.
+
+- Fixed a bug where :func:`datasets.make_moons` gave an
+  incorrect result when ``n_samples`` was odd.
+  :issue:`8198` by :user:`Josh Levy `.
+
+- Some ``fetch_`` functions in :mod:`datasets` were ignoring the
+  ``download_if_missing`` keyword. :issue:`7944` by :user:`Ralf Gommers `.
+
+- Fixed estimators to accept a ``sample_weight`` parameter of type
+  ``pandas.Series`` in their ``fit`` function. :issue:`7825` by
+  `Kathleen Chen`_.
+
+- Fixed a bug in cases where ``numpy.cumsum`` may be numerically unstable,
+  raising an exception if instability is identified. :issue:`7376` and
+  :issue:`7331` by `Joel Nothman`_ and :user:`yangarbiter`.
+
+- Fixed a bug where :meth:`base.BaseEstimator.__getstate__`
+  obstructed pickling customizations of child-classes, when used in a
+  multiple inheritance context.
+  :issue:`8316` by :user:`Holger Peters `.
+
+- Updated Sphinx-Gallery from 0.1.4 to 0.1.7 for resolving links in
+  documentation builds with Sphinx>1.5. :issue:`8010`, :issue:`7986` by
+  :user:`Oscar Najera `.
+
+- Added ``data_home`` parameter to :func:`sklearn.datasets.fetch_kddcup99`.
+  :issue:`9289` by `Loic Esteve`_.
+
+- Fixed dataset loaders to use the Python 3 version of ``makedirs`` so that
+  they also work in Python 2. :issue:`9284` by :user:`Sebastin Santy `.
+
+- Several minor issues were fixed with thanks to the alerts of
+  `lgtm.com <http://lgtm.com>`_. :issue:`9278` by :user:`Jean Helie `,
+  among others.
+
+API changes summary
+-------------------
+
+Trees and ensembles
+
+- Gradient boosting base models are no longer estimators. By `Andreas Müller`_.
+
+- All tree based estimators now accept a ``min_impurity_decrease``
+  parameter in lieu of ``min_impurity_split``, which is now deprecated.
+  A node is now split only if the split decreases the weighted impurity
+  by at least ``min_impurity_decrease``. :issue:`8449` by `Raghav RV`_.
+
+Linear, kernelized and related models
+
+- The ``n_iter`` parameter is deprecated in :class:`linear_model.SGDClassifier`,
+  :class:`linear_model.SGDRegressor`,
+  :class:`linear_model.PassiveAggressiveClassifier`,
+  :class:`linear_model.PassiveAggressiveRegressor` and
+  :class:`linear_model.Perceptron`. By `Tom Dupre la Tour`_.
+
+Other predictors
+
+- :class:`neighbors.LSHForest` has been deprecated and will be
+  removed in 0.21 due to poor performance.
+  :issue:`9078` by :user:`Laurent Direr `.
+
+- :class:`neighbors.NearestCentroid` no longer purports to support
+  ``metric='precomputed'``, which now raises an error. :issue:`8515` by
+  :user:`Sergul Aydore `.
+
+- The ``alpha`` parameter of :class:`semi_supervised.LabelPropagation` now
+  has no effect and is deprecated; it will be removed in 0.21. :issue:`9239`
+  by :user:`Andre Ambrosio Boechat `, :user:`Utkarsh Upadhyay
+  `, and `Joel Nothman`_.
+
+Decomposition, manifold learning and clustering
+
+- Deprecate the ``doc_topic_distr`` argument of the ``perplexity`` method
+  in :class:`decomposition.LatentDirichletAllocation` because the
+  user no longer has access to the unnormalized document topic distribution
+  needed for the perplexity calculation. :issue:`7954` by
+  :user:`Gary Foreman `.
+
+- The ``n_topics`` parameter of :class:`decomposition.LatentDirichletAllocation`
+  has been renamed to ``n_components`` and will be removed in version 0.21.
+  :issue:`8922` by :user:`Attractadore`.
+
+- :meth:`decomposition.SparsePCA.transform`'s ``ridge_alpha`` parameter is
+  deprecated in favor of the class parameter.
+  :issue:`8137` by :user:`Naoya Kanai `.
+
+- :class:`cluster.DBSCAN` now has a ``metric_params`` parameter.
+  :issue:`8139` by :user:`Naoya Kanai `.
+
+Preprocessing and feature selection
+
+- :class:`feature_selection.SelectFromModel` now has a ``partial_fit``
+  method only if the underlying estimator does. By `Andreas Müller`_.
+
+- :class:`feature_selection.SelectFromModel` now validates the ``threshold``
+  parameter and sets the ``threshold_`` attribute during the call to
+  ``fit``, and no longer during the call to ``transform``. By `Andreas
+  Müller`_.
+
+- The ``non_negative`` parameter in :class:`feature_extraction.FeatureHasher`
+  has been deprecated, and replaced with a more principled alternative,
+  ``alternate_sign``.
+  :issue:`7565` by :user:`Roman Yurchak `.
+
+- :class:`linear_model.RandomizedLogisticRegression`
+  and :class:`linear_model.RandomizedLasso` have been deprecated and will
+  be removed in version 0.21.
+  :issue:`8995` by :user:`Ramana.S `.
+
+Model evaluation and meta-estimators
+
+- Deprecate the ``fit_params`` constructor input to
+  :class:`model_selection.GridSearchCV` and
+  :class:`model_selection.RandomizedSearchCV` in favor
+  of passing keyword parameters to the ``fit`` methods
+  of those classes.
+  Data-dependent parameters needed for model
+  training should be passed as keyword arguments to ``fit``,
+  and conforming to this convention will allow the hyperparameter
+  selection classes to be used with tools such as
+  :func:`model_selection.cross_val_predict`.
+  :issue:`2879` by :user:`Stephen Hoover `.
+
+- In version 0.21, the default behavior of splitters that use the
+  ``test_size`` and ``train_size`` parameters will change, such that
+  specifying ``train_size`` alone will cause ``test_size`` to be the
+  remainder. :issue:`7459` by :user:`Nelson Liu `.
+
+- :class:`multiclass.OneVsRestClassifier` now has ``partial_fit``,
+  ``decision_function`` and ``predict_proba`` methods only when the
+  underlying estimator does. :issue:`7812` by `Andreas Müller`_ and
+  :user:`Mikhail Korobov `.
+
+- :class:`multiclass.OneVsRestClassifier` now has a ``partial_fit`` method
+  only if the underlying estimator does. By `Andreas Müller`_.
+
+- The ``decision_function`` output shape for binary classification in
+  :class:`multiclass.OneVsRestClassifier` and
+  :class:`multiclass.OneVsOneClassifier` is now ``(n_samples,)`` to conform
+  to scikit-learn conventions. :issue:`9100` by `Andreas Müller`_.
+
+- The :func:`multioutput.MultiOutputClassifier.predict_proba`
+  function used to return a 3d array (``n_samples``, ``n_classes``,
+  ``n_outputs``). In the case where different target columns had different
+  numbers of classes, a ``ValueError`` would be raised on trying to stack
+  matrices with different dimensions. This function now returns a list of
+  arrays where the length of the list is ``n_outputs``, and each array is
+  (``n_samples``, ``n_classes``) for that particular output.
+  :issue:`8093` by :user:`Peter Bull `.
+
+- The ``named_steps`` attribute of :class:`pipeline.Pipeline` is now a
+  :class:`utils.Bunch` rather than a plain ``dict``, to enable tab
+  completion in interactive environments. Where a step name conflicts with
+  an existing ``dict`` method, the ``dict`` behavior takes precedence.
+  :issue:`8481` by :user:`Herilalaina Rakotoarison `.
+
+Miscellaneous
+
+- Deprecate the ``y`` parameter in ``transform`` and ``inverse_transform``.
+  These methods should not accept a ``y`` parameter, as they are used at
+  prediction time.
+  :issue:`8174` by :user:`Tahar Zanouda `, `Alexandre Gramfort`_
+  and `Raghav RV`_.
+
+- SciPy >= 0.13.3 and NumPy >= 1.8.2 are now the minimum supported versions
+  for scikit-learn. The following backported functions in
+  :mod:`utils` have been removed or deprecated accordingly.
+  :issue:`8854` and :issue:`8874` by :user:`Naoya Kanai `.
+
+- The ``store_covariances`` and ``covariances_`` parameters of
+  :class:`discriminant_analysis.QuadraticDiscriminantAnalysis`
+  have been renamed to ``store_covariance`` and ``covariance_`` to be
+  consistent with the corresponding parameter names of
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis`. They will be
+  removed in version 0.21.
:issue:`7998` by :user:`Jiacheng ` + + Removed in 0.19: + + - ``utils.fixes.argpartition`` + - ``utils.fixes.array_equal`` + - ``utils.fixes.astype`` + - ``utils.fixes.bincount`` + - ``utils.fixes.expit`` + - ``utils.fixes.frombuffer_empty`` + - ``utils.fixes.in1d`` + - ``utils.fixes.norm`` + - ``utils.fixes.rankdata`` + - ``utils.fixes.safe_copy`` + + Deprecated in 0.19, to be removed in 0.21: + + - ``utils.arpack.eigs`` + - ``utils.arpack.eigsh`` + - ``utils.arpack.svds`` + - ``utils.extmath.fast_dot`` + - ``utils.extmath.logsumexp`` + - ``utils.extmath.norm`` + - ``utils.extmath.pinvh`` + - ``utils.graph.graph_laplacian`` + - ``utils.random.choice`` + - ``utils.sparsetools.connected_components`` + - ``utils.stats.rankdata`` + +- Estimators with both methods ``decision_function`` and ``predict_proba`` + are now required to have a monotonic relation between them. The + method ``check_decision_proba_consistency`` has been added in + **utils.estimator_checks** to check their consistency. + :issue:`7578` by :user:`Shubham Bhardwaj ` + +- All checks in ``utils.estimator_checks``, in particular + :func:`utils.estimator_checks.check_estimator` now accept estimator + instances. Most other checks do not accept + estimator classes any more. :issue:`9019` by `Andreas Müller`_. + +- Ensure that estimators' attributes ending with ``_`` are not set + in the constructor but only in the ``fit`` method. Most notably, + ensemble estimators (deriving from :class:`ensemble.BaseEnsemble`) + now only have ``self.estimators_`` available after ``fit``. + :issue:`7464` by `Lars Buitinck`_ and `Loic Esteve`_. + + +Code and Documentation Contributors +----------------------------------- + +Thanks to everyone who has contributed to the maintenance and improvement of the +project since version 0.18, including: + +Joel Nothman, Loic Esteve, Andreas Mueller, Guillaume Lemaitre, Olivier Grisel, +Hanmin Qin, Raghav RV, Alexandre Gramfort, themrmax, Aman Dalmia, Gael +Varoquaux, Naoya Kanai, Tom Dupré la Tour, Rishikesh, Nelson Liu, Taehoon Lee, +Nelle Varoquaux, Aashil, Mikhail Korobov, Sebastin Santy, Joan Massich, Roman +Yurchak, RAKOTOARISON Herilalaina, Thierry Guillemot, Alexandre Abadie, Carol +Willing, Balakumaran Manoharan, Josh Karnofsky, Vlad Niculae, Utkarsh Upadhyay, +Dmitry Petrov, Minghui Liu, Srivatsan, Vincent Pham, Albert Thomas, Jake +VanderPlas, Attractadore, JC Liu, alexandercbooth, chkoar, Óscar Nájera, +Aarshay Jain, Kyle Gilliam, Ramana Subramanyam, CJ Carey, Clement Joudet, David +Robles, He Chen, Joris Van den Bossche, Karan Desai, Katie Luangkote, Leland +McInnes, Maniteja Nandana, Michele Lacchia, Sergei Lebedev, Shubham Bhardwaj, +akshay0724, omtcyfz, rickiepark, waterponey, Vathsala Achar, jbDelafosse, Ralf +Gommers, Ekaterina Krivich, Vivek Kumar, Ishank Gulati, Dave Elliott, ldirer, +Reiichiro Nakano, Levi John Wolf, Mathieu Blondel, Sid Kapur, Dougal J. +Sutherland, midinas, mikebenfield, Sourav Singh, Aseem Bansal, Ibraim Ganiev, +Stephen Hoover, AishwaryaRK, Steven C. Howell, Gary Foreman, Neeraj Gangwar, +Tahar, Jon Crall, dokato, Kathy Chen, ferria, Thomas Moreau, Charlie Brummitt, +Nicolas Goix, Adam Kleczewski, Sam Shleifer, Nikita Singh, Basil Beirouti, +Giorgio Patrini, Manoj Kumar, Rafael Possas, James Bourbeau, James A. 
Bednar, +Janine Harper, Jaye, Jean Helie, Jeremy Steward, Artsiom, John Wei, Jonathan +LIgo, Jonathan Rahn, seanpwilliams, Arthur Mensch, Josh Levy, Julian Kuhlmann, +Julien Aubert, Jörn Hees, Kai, shivamgargsya, Kat Hempstalk, Kaushik +Lakshmikanth, Kennedy, Kenneth Lyons, Kenneth Myers, Kevin Yap, Kirill Bobyrev, +Konstantin Podshumok, Arthur Imbert, Lee Murray, toastedcornflakes, Lera, Li +Li, Arthur Douillard, Mainak Jas, tobycheese, Manraj Singh, Manvendra Singh, +Marc Meketon, MarcoFalke, Matthew Brett, Matthias Gilch, Mehul Ahuja, Melanie +Goetz, Meng, Peng, Michael Dezube, Michal Baumgartner, vibrantabhi19, Artem +Golubin, Milen Paskov, Antonin Carette, Morikko, MrMjauh, NALEPA Emmanuel, +Namiya, Antoine Wendlinger, Narine Kokhlikyan, NarineK, Nate Guerin, Angus +Williams, Ang Lu, Nicole Vavrova, Nitish Pandey, Okhlopkov Daniil Olegovich, +Andy Craze, Om Prakash, Parminder Singh, Patrick Carlson, Patrick Pei, Paul +Ganssle, Paulo Haddad, Paweł Lorek, Peng Yu, Pete Bachant, Peter Bull, Peter +Csizsek, Peter Wang, Pieter Arthur de Jong, Ping-Yao, Chang, Preston Parry, +Puneet Mathur, Quentin Hibon, Andrew Smith, Andrew Jackson, 1kastner, Rameshwar +Bhaskaran, Rebecca Bilbro, Remi Rampin, Andrea Esuli, Rob Hall, Robert +Bradshaw, Romain Brault, Aman Pratik, Ruifeng Zheng, Russell Smith, Sachin +Agarwal, Sailesh Choyal, Samson Tan, Samuël Weber, Sarah Brown, Sebastian +Pölsterl, Sebastian Raschka, Sebastian Saeger, Alyssa Batula, Abhyuday Pratap +Singh, Sergey Feldman, Sergul Aydore, Sharan Yalburgi, willduan, Siddharth +Gupta, Sri Krishna, Almer, Stijn Tonk, Allen Riddell, Theofilos Papapanagiotou, +Alison, Alexis Mignon, Tommy Boucher, Tommy Löfstedt, Toshihiro Kamishima, +Tyler Folkman, Tyler Lanigan, Alexander Junge, Varun Shenoy, Victor Poughon, +Vilhelm von Ehrenheim, Aleksandr Sandrovskii, Alan Yee, Vlasios Vasileiou, +Warut Vijitbenjaronk, Yang Zhang, Yaroslav Halchenko, Yichuan Liu, Yuichi +Fujikawa, affanv14, aivision2020, xor, andreh7, brady salz, campustrampus, +Agamemnon Krasoulis, ditenberg, elena-sharova, filipj8, fukatani, gedeck, +guiniol, guoci, hakaa1, hongkahjun, i-am-xhy, jakirkham, jaroslaw-weber, +jayzed82, jeroko, jmontoyam, jonathan.striebel, josephsalmon, jschendel, +leereeves, martin-hahn, mathurinm, mehak-sachdeva, mlewis1729, mlliou112, +mthorrell, ndingwall, nuffe, yangarbiter, plagree, pldtc325, Breno Freitas, +Brett Olsen, Brian A. Alfano, Brian Burns, polmauri, Brandon Carter, Charlton +Austin, Chayant T15h, Chinmaya Pancholi, Christian Danielsen, Chung Yen, +Chyi-Kwei Yau, pravarmahajan, DOHMATOB Elvis, Daniel LeJeune, Daniel Hnyk, +Darius Morawiec, David DeTomaso, David Gasquez, David Haberthür, David +Heryanto, David Kirkby, David Nicholson, rashchedrin, Deborah Gertrude Digges, +Denis Engemann, Devansh D, Dickson, Bob Baxley, Don86, E. Lynch-Klarup, Ed +Rogers, Elizabeth Ferriss, Ellen-Co2, Fabian Egli, Fang-Chieh Chou, Bing Tian +Dai, Greg Stupp, Grzegorz Szpak, Bertrand Thirion, Hadrien Bertrand, Harizo +Rajaona, zxcvbnius, Henry Lin, Holger Peters, Icyblade Dai, Igor +Andriushchenko, Ilya, Isaac Laughlin, Iván Vallés, Aurélien Bellet, JPFrancoia, +Jacob Schreiber, Asish Mahapatra + +.. _changes_0_18_2: + +Version 0.18.2 +============== + +**June 20, 2017** + +.. topic:: Last release with Python 2.6 support + + Scikit-learn 0.18 is the last major release of scikit-learn to support Python 2.6. + Later versions of scikit-learn will require Python 2.7 or above. 
+
+
+Changelog
+---------
+
+- Fixes for compatibility with NumPy 1.13.0: :issue:`7946` :issue:`8355` by
+  `Loic Esteve`_.
+
+- Minor compatibility changes in the examples :issue:`9010` :issue:`8040`
+  :issue:`9149`.
+
+Code Contributors
+-----------------
+Aman Dalmia, Loic Esteve, Nate Guerin, Sergei Lebedev
+
+
+.. _changes_0_18_1:
+
+Version 0.18.1
+==============
+
+**November 11, 2016**
+
+Changelog
+---------
+
+Enhancements
+............
+
+- Improved ``sample_without_replacement`` speed by utilizing
+  ``numpy.random.permutation`` for most cases. As a result,
+  samples may differ in this release for a fixed random state.
+  Affected estimators:
+
+  - :class:`ensemble.BaggingClassifier`
+  - :class:`ensemble.BaggingRegressor`
+  - :class:`linear_model.RANSACRegressor`
+  - :class:`model_selection.RandomizedSearchCV`
+  - :class:`random_projection.SparseRandomProjection`
+
+  This also affects the :func:`datasets.make_classification`
+  function.
+
+Bug fixes
+.........
+
+- Fixed an issue where the ``min_grad_norm`` and ``n_iter_without_progress``
+  parameters were not being utilised by :class:`manifold.TSNE`.
+  :issue:`6497` by :user:`Sebastian Säger `.
+
+- Fixed a bug in SVM decision values when ``decision_function_shape``
+  is ``ovr`` in :class:`svm.SVC`.
+  :class:`svm.SVC`'s ``decision_function`` was incorrect from versions
+  0.17.0 through 0.18.0.
+  :issue:`7724` by `Bing Tian Dai`_.
+
+- The attribute ``explained_variance_ratio_`` of
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis` calculated
+  with the SVD and Eigen solvers is now of the same length. :issue:`7632`
+  by :user:`JPFrancoia `.
+
+- Fixed an issue in :ref:`univariate_feature_selection` where score
+  functions were not accepting multi-label targets. :issue:`7676`
+  by :user:`Mohammed Affan `.
+
+- Fixed setting parameters when calling ``fit`` multiple times on
+  :class:`feature_selection.SelectFromModel`. :issue:`7756` by `Andreas Müller`_.
+
+- Fixed an issue in the ``partial_fit`` method of
+  :class:`multiclass.OneVsRestClassifier` when the number of classes used in
+  ``partial_fit`` was less than the total number of classes in the
+  data. :issue:`7786` by `Srivatsan Ramesh`_.
+
+- Fixed an issue in :class:`calibration.CalibratedClassifierCV` where
+  the probabilities of each class for a sample did not sum to 1, and
+  ``CalibratedClassifierCV`` now handles the case where the training set
+  has fewer classes than the full data. :issue:`7799` by
+  `Srivatsan Ramesh`_.
+
+- Fixed a bug where :class:`sklearn.feature_selection.SelectFdr` did not
+  exactly implement the Benjamini-Hochberg procedure. It formerly may have
+  selected fewer features than it should.
+  :issue:`7490` by :user:`Peng Meng `.
+
+- :class:`sklearn.manifold.LocallyLinearEmbedding` now correctly handles
+  integer inputs. :issue:`6282` by `Jake Vanderplas`_.
+
+- The ``min_weight_fraction_leaf`` parameter of tree-based classifiers and
+  regressors now assumes uniform sample weights by default if the
+  ``sample_weight`` argument is not passed to the ``fit`` function.
+  Previously, the parameter was silently ignored. :issue:`7301`
+  by :user:`Nelson Liu `.
+
+- Fixed a numerical issue with :class:`linear_model.RidgeCV` on centered data
+  when ``n_features > n_samples``. :issue:`6178` by `Bertrand Thirion`_.
+
+- Tree splitting criterion classes' cloning/pickling is now memory safe.
+  :issue:`7680` by :user:`Ibraim Ganiev `.
+
+- Fixed a bug where :class:`decomposition.NMF` set its ``n_iter_``
+  attribute in ``transform()``. :issue:`7553` by :user:`Ekaterina
+  Krivich `.
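+
+  A minimal sketch of the intended behaviour (``n_iter_`` is set by ``fit``
+  and should no longer be overwritten by ``transform``)::
+
+      >>> import numpy as np
+      >>> from sklearn.decomposition import NMF
+      >>> X = np.random.RandomState(0).rand(20, 5)  # NMF needs non-negative input
+      >>> nmf = NMF(n_components=2, random_state=0).fit(X)
+      >>> n_iter = nmf.n_iter_
+      >>> W = nmf.transform(X)  # nmf.n_iter_ is left untouched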
+ +- :class:`sklearn.linear_model.LogisticRegressionCV` now correctly handles + string labels. :issue:`5874` by `Raghav RV`_. + +- Fixed a bug where :func:`sklearn.model_selection.train_test_split` raised + an error when ``stratify`` is a list of string labels. :issue:`7593` by + `Raghav RV`_. + +- Fixed a bug where :class:`sklearn.model_selection.GridSearchCV` and + :class:`sklearn.model_selection.RandomizedSearchCV` were not pickleable + because of a pickling bug in ``np.ma.MaskedArray``. :issue:`7594` by + `Raghav RV`_. + +- All cross-validation utilities in :mod:`sklearn.model_selection` now + permit one time cross-validation splitters for the ``cv`` parameter. Also + non-deterministic cross-validation splitters (where multiple calls to + ``split`` produce dissimilar splits) can be used as ``cv`` parameter. + The :class:`sklearn.model_selection.GridSearchCV` will cross-validate each + parameter setting on the split produced by the first ``split`` call + to the cross-validation splitter. :issue:`7660` by `Raghav RV`_. + +- Fix bug where :meth:`preprocessing.MultiLabelBinarizer.fit_transform` + returned an invalid CSR matrix. + :issue:`7750` by :user:`CJ Carey `. + +- Fixed a bug where :func:`metrics.pairwise.cosine_distances` could return a + small negative distance. :issue:`7732` by :user:`Artsion `. + +API changes summary +------------------- + +Trees and forests + +- The ``min_weight_fraction_leaf`` parameter of tree-based classifiers and + regressors now assumes uniform sample weights by default if the + ``sample_weight`` argument is not passed to the ``fit`` function. + Previously, the parameter was silently ignored. :issue:`7301` by :user:`Nelson + Liu `. + +- Tree splitting criterion classes' cloning/pickling is now memory safe. + :issue:`7680` by :user:`Ibraim Ganiev `. + + +Linear, kernelized and related models + +- Length of ``explained_variance_ratio`` of + :class:`discriminant_analysis.LinearDiscriminantAnalysis` + changed for both Eigen and SVD solvers. The attribute has now a length + of min(n_components, n_classes - 1). :issue:`7632` + by :user:`JPFrancoia ` + +- Numerical issue with :class:`linear_model.RidgeCV` on centered data when + ``n_features > n_samples``. :issue:`6178` by `Bertrand Thirion`_ + +.. _changes_0_18: + +Version 0.18 +============ + +**September 28, 2016** + +.. topic:: Last release with Python 2.6 support + + Scikit-learn 0.18 will be the last version of scikit-learn to support Python 2.6. + Later versions of scikit-learn will require Python 2.7 or above. + +.. _model_selection_changes: + +Model Selection Enhancements and API Changes +-------------------------------------------- + +- **The model_selection module** + + The new module :mod:`sklearn.model_selection`, which groups together the + functionalities of formerly :mod:`sklearn.cross_validation`, + :mod:`sklearn.grid_search` and :mod:`sklearn.learning_curve`, introduces new + possibilities such as nested cross-validation and better manipulation of + parameter searches with Pandas. + + Many things will stay the same but there are some key differences. Read + below to know more about the changes. + +- **Data-independent CV splitters enabling nested cross-validation** + + The new cross-validation splitters, defined in the + :mod:`sklearn.model_selection`, are no longer initialized with any + data-dependent parameters such as ``y``. Instead they expose a + :func:`split` method that takes in the data and yields a generator for the + different splits. 
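+
+  For example, a minimal sketch of the new splitter API::
+
+      >>> import numpy as np
+      >>> from sklearn.model_selection import KFold
+      >>> X, y = np.arange(20).reshape(10, 2), np.arange(10)
+      >>> cv = KFold(n_splits=5)  # no data-dependent arguments here
+      >>> for train_index, test_index in cv.split(X, y):
+      ...     pass  # train/test indices for each of the 5 splits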
+
+  This change makes it possible to use the cross-validation splitters to
+  perform nested cross-validation, facilitated by
+  :class:`model_selection.GridSearchCV` and
+  :class:`model_selection.RandomizedSearchCV` utilities.
+
+- **The enhanced cv_results_ attribute**
+
+  The new ``cv_results_`` attribute (of :class:`model_selection.GridSearchCV`
+  and :class:`model_selection.RandomizedSearchCV`) introduced in lieu of the
+  ``grid_scores_`` attribute is a dict of 1D arrays with elements in each
+  array corresponding to the parameter settings (i.e. search candidates).
+
+  The ``cv_results_`` dict can be easily imported into ``pandas`` as a
+  ``DataFrame`` for exploring the search results.
+
+  The ``cv_results_`` arrays include scores for each cross-validation split
+  (with keys such as ``'split0_test_score'``), as well as their mean
+  (``'mean_test_score'``) and standard deviation (``'std_test_score'``).
+
+  The ranks for the search candidates (based on their mean
+  cross-validation score) are available at ``cv_results_['rank_test_score']``.
+
+  The values for each parameter are stored separately as numpy
+  masked object arrays. The value for a given search candidate is masked if
+  the corresponding parameter is not applicable. Additionally a list of all
+  the parameter dicts is stored at ``cv_results_['params']``.
+
+- **Parameters n_folds and n_iter renamed to n_splits**
+
+  Some parameter names have changed:
+  The ``n_folds`` parameter in new :class:`model_selection.KFold`,
+  :class:`model_selection.GroupKFold` (see below for the name change),
+  and :class:`model_selection.StratifiedKFold` is now renamed to
+  ``n_splits``. The ``n_iter`` parameter in
+  :class:`model_selection.ShuffleSplit`, the new class
+  :class:`model_selection.GroupShuffleSplit` and
+  :class:`model_selection.StratifiedShuffleSplit` is now renamed to
+  ``n_splits``.
+
+- **Rename of splitter classes which accept group labels along with data**
+
+  The cross-validation splitters ``LabelKFold``,
+  ``LabelShuffleSplit``, ``LeaveOneLabelOut`` and ``LeavePLabelOut`` have
+  been renamed to :class:`model_selection.GroupKFold`,
+  :class:`model_selection.GroupShuffleSplit`,
+  :class:`model_selection.LeaveOneGroupOut` and
+  :class:`model_selection.LeavePGroupsOut` respectively.
+
+  Note the change from singular to plural form in
+  :class:`model_selection.LeavePGroupsOut`.
+
+- **Fit parameter labels renamed to groups**
+
+  The ``labels`` parameter in the :func:`split` method of the newly renamed
+  splitters :class:`model_selection.GroupKFold`,
+  :class:`model_selection.LeaveOneGroupOut`,
+  :class:`model_selection.LeavePGroupsOut` and
+  :class:`model_selection.GroupShuffleSplit` is renamed to ``groups``
+  following the new nomenclature of their class names.
+
+- **Parameter n_labels renamed to n_groups**
+
+  The parameter ``n_labels`` in the newly renamed
+  :class:`model_selection.LeavePGroupsOut` is changed to ``n_groups``.
+
+- **Training scores and Timing information**
+
+  ``cv_results_`` also includes the training scores for each
+  cross-validation split (with keys such as ``'split0_train_score'``), as
+  well as their mean (``'mean_train_score'``) and standard deviation
+  (``'std_train_score'``). To avoid the cost of evaluating training scores,
+  set ``return_train_score=False``.
+
+  Additionally the mean and standard deviation of the times taken to split,
+  train and score the model across all the cross-validation splits are
+  available at the keys ``'mean_time'`` and ``'std_time'`` respectively.
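+
+  A minimal sketch of exploring ``cv_results_`` with pandas (``grid_search``
+  is assumed to be an already fitted search object)::
+
+      >>> import pandas as pd
+      >>> results = pd.DataFrame(grid_search.cv_results_)
+      >>> cols = ['params', 'mean_test_score', 'rank_test_score']
+      >>> summary = results[cols]  # one row per search candidate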
+
+Changelog
+---------
+
+New features
+............
+
+Classifiers and Regressors
+
+- The Gaussian Process module has been reimplemented and now offers classification
+  and regression estimators through :class:`gaussian_process.GaussianProcessClassifier`
+  and :class:`gaussian_process.GaussianProcessRegressor`. Among other things, the new
+  implementation supports kernel engineering, gradient-based hyperparameter optimization and
+  sampling of functions from the GP prior and GP posterior. Extensive documentation and
+  examples are provided. By `Jan Hendrik Metzen`_.
+
+- Added new supervised learning algorithm: :ref:`Multi-layer Perceptron `.
+  :issue:`3204` by :user:`Issam H. Laradji `.
+
+- Added :class:`linear_model.HuberRegressor`, a linear model robust to outliers.
+  :issue:`5291` by `Manoj Kumar`_.
+
+- Added the :class:`multioutput.MultiOutputRegressor` meta-estimator. It
+  converts single output regressors to multi-output regressors by fitting
+  one regressor per output. By :user:`Tim Head `.
+
+Other estimators
+
+- New :class:`mixture.GaussianMixture` and :class:`mixture.BayesianGaussianMixture`
+  replace the former mixture models, employing faster inference
+  for sounder results. :issue:`7295` by :user:`Wei Xue ` and
+  :user:`Thierry Guillemot `.
+
+- Class :class:`decomposition.RandomizedPCA` is now factored into :class:`decomposition.PCA`
+  and is available by calling it with the parameter ``svd_solver='randomized'``.
+  The default number of ``n_iter`` for ``'randomized'`` has changed to 4. The old
+  behavior of PCA is recovered by ``svd_solver='full'``. An additional solver
+  calls ``arpack`` and performs a truncated (non-randomized) SVD. By default,
+  the best solver is selected depending on the size of the input and the
+  number of components requested. :issue:`5299` by :user:`Giorgio Patrini `.
+
+- Added two functions for mutual information estimation:
+  :func:`feature_selection.mutual_info_classif` and
+  :func:`feature_selection.mutual_info_regression`. These functions can be
+  used in :class:`feature_selection.SelectKBest` and
+  :class:`feature_selection.SelectPercentile` as score functions.
+  By :user:`Andrea Bravi ` and :user:`Nikolay Mayorov `.
+
+- Added the :class:`ensemble.IsolationForest` class for anomaly detection based on
+  random forests. By `Nicolas Goix`_.
+
+- Added ``algorithm="elkan"`` to :class:`cluster.KMeans` implementing
+  Elkan's fast K-Means algorithm. By `Andreas Müller`_.
+
+Model selection and evaluation
+
+- Added :func:`metrics.cluster.fowlkes_mallows_score`, the Fowlkes-Mallows
+  Index, which measures the similarity of two clusterings of a set of points.
+  By :user:`Arnaud Fouchet ` and :user:`Thierry Guillemot `.
+
+- Added :func:`metrics.calinski_harabaz_score`, which computes the Calinski
+  and Harabaz score to evaluate the resulting clustering of a set of points.
+  By :user:`Arnaud Fouchet ` and :user:`Thierry Guillemot `.
+
+- Added the new cross-validation splitter
+  :class:`model_selection.TimeSeriesSplit` to handle time series data.
+  :issue:`6586` by :user:`YenChen Lin `.
+
+- The cross-validation iterators are replaced by cross-validation splitters
+  available from :mod:`sklearn.model_selection`, allowing for nested
+  cross-validation. See :ref:`model_selection_changes` for more information.
+  :issue:`4294` by `Raghav RV`_.
+
+Enhancements
+............
+
+Trees and ensembles
+
+- Added a new splitting criterion for :class:`tree.DecisionTreeRegressor`,
+  the mean absolute error.
+  This criterion can also be used in
+  :class:`ensemble.ExtraTreesRegressor`,
+  :class:`ensemble.RandomForestRegressor`, and the gradient boosting
+  estimators. :issue:`6667` by :user:`Nelson Liu `.
+
+- Added a weighted impurity-based early stopping criterion for decision tree
+  growth. :issue:`6954` by :user:`Nelson Liu `.
+
+- The random forest, extra trees and decision tree estimators now have a
+  method ``decision_path`` which returns the decision path of samples in
+  the tree. By `Arnaud Joly`_.
+
+- A new example has been added unveiling the decision tree structure.
+  By `Arnaud Joly`_.
+
+- Random forest, extra trees, decision trees and gradient boosting estimators
+  now accept the parameters ``min_samples_split`` and ``min_samples_leaf``
+  provided as a percentage of the training samples. By :user:`yelite ` and `Arnaud Joly`_.
+
+- Gradient boosting estimators accept the parameter ``criterion`` to specify
+  the splitting criterion used when building decision trees.
+  :issue:`6667` by :user:`Nelson Liu `.
+
+- The memory footprint is reduced (sometimes greatly) for
+  :class:`ensemble.bagging.BaseBagging` and classes that inherit from it,
+  i.e., :class:`ensemble.BaggingClassifier`,
+  :class:`ensemble.BaggingRegressor`, and :class:`ensemble.IsolationForest`,
+  by dynamically generating the attribute ``estimators_samples_`` only when
+  it is needed. By :user:`David Staub `.
+
+- Added ``n_jobs`` and ``sample_weight`` parameters for
+  :class:`ensemble.VotingClassifier` to fit underlying estimators in parallel.
+  :issue:`5805` by :user:`Ibraim Ganiev `.
+
+Linear, kernelized and related models
+
+- In :class:`linear_model.LogisticRegression`, the SAG solver is now
+  available in the multinomial case. :issue:`5251` by `Tom Dupre la Tour`_.
+
+- :class:`linear_model.RANSACRegressor`, :class:`svm.LinearSVC` and
+  :class:`svm.LinearSVR` now support ``sample_weight``.
+  By :user:`Imaculate `.
+
+- Added the parameter ``loss`` to :class:`linear_model.RANSACRegressor` to
+  measure the error on the samples for every trial. By `Manoj Kumar`_.
+
+- Prediction of out-of-sample events with Isotonic Regression
+  (:class:`isotonic.IsotonicRegression`) is now much faster (over 1000x in
+  tests with synthetic data). By :user:`Jonathan Arfa `.
+
+- Isotonic regression (:class:`isotonic.IsotonicRegression`) now uses a
+  better algorithm to avoid ``O(n^2)`` behavior in pathological cases, and is
+  also generally faster (:issue:`6691`). By `Antony Lee`_.
+
+- :class:`naive_bayes.GaussianNB` now accepts data-independent class-priors
+  through the parameter ``priors``. By :user:`Guillaume Lemaitre `.
+
+- :class:`linear_model.ElasticNet` and :class:`linear_model.Lasso`
+  now work with ``np.float32`` input data without converting it
+  into ``np.float64``. This reduces memory
+  consumption. :issue:`6913` by :user:`YenChen Lin `.
+
+- :class:`semi_supervised.LabelPropagation` and :class:`semi_supervised.LabelSpreading`
+  now accept arbitrary kernel functions in addition to strings ``knn`` and ``rbf``.
+  :issue:`5762` by :user:`Utkarsh Upadhyay `.
+
+Decomposition, manifold learning and clustering
+
+- Added an ``inverse_transform`` function to :class:`decomposition.NMF` to
+  compute the data matrix of original shape. By :user:`Anish Shah `.
+
+- :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` now work
+  with ``np.float32`` and ``np.float64`` input data without converting it.
+  This reduces memory consumption when using ``np.float32``.
+  :issue:`6846` by :user:`Sebastian Säger ` and
+  :user:`YenChen Lin `.
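+
+  A minimal sketch of the dtype-preserving behaviour::
+
+      >>> import numpy as np
+      >>> from sklearn.cluster import KMeans
+      >>> X32 = np.random.RandomState(0).rand(100, 3).astype(np.float32)
+      >>> km = KMeans(n_clusters=2, random_state=0).fit(X32)
+      >>> centers_dtype = km.cluster_centers_.dtype  # float32, not upcast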
+ +Preprocessing and feature selection + +- :class:`preprocessing.RobustScaler` now accepts ``quantile_range`` parameter. + :issue:`5929` by :user:`Konstantin Podshumok `. + +- :class:`feature_extraction.FeatureHasher` now accepts string values. + :issue:`6173` by :user:`Ryad Zenine ` and + :user:`Devashish Deshpande `. + +- Keyword arguments can now be supplied to ``func`` in + :class:`preprocessing.FunctionTransformer` by means of the ``kw_args`` + parameter. By `Brian McFee`_. + +- :class:`feature_selection.SelectKBest` and :class:`feature_selection.SelectPercentile` + now accept score functions that take X, y as input and return only the scores. + By :user:`Nikolay Mayorov `. + +Model evaluation and meta-estimators + +- :class:`multiclass.OneVsOneClassifier` and :class:`multiclass.OneVsRestClassifier` + now support ``partial_fit``. By :user:`Asish Panda ` and + :user:`Philipp Dowling `. + +- Added support for substituting or disabling :class:`pipeline.Pipeline` + and :class:`pipeline.FeatureUnion` components using the ``set_params`` + interface that powers :mod:`sklearn.grid_search`. + See :ref:`sphx_glr_auto_examples_plot_compare_reduction.py` + By `Joel Nothman`_ and :user:`Robert McGibbon `. + +- The new ``cv_results_`` attribute of :class:`model_selection.GridSearchCV` + (and :class:`model_selection.RandomizedSearchCV`) can be easily imported + into pandas as a ``DataFrame``. Ref :ref:`model_selection_changes` for + more information. :issue:`6697` by `Raghav RV`_. + +- Generalization of :func:`model_selection.cross_val_predict`. + One can pass method names such as `predict_proba` to be used in the cross + validation framework instead of the default `predict`. + By :user:`Ori Ziv ` and :user:`Sears Merritt `. + +- The training scores and time taken for training followed by scoring for + each search candidate are now available at the ``cv_results_`` dict. + See :ref:`model_selection_changes` for more information. + :issue:`7325` by :user:`Eugene Chen ` and `Raghav RV`_. + +Metrics + +- Added ``labels`` flag to :class:`metrics.log_loss` to explicitly provide + the labels when the number of classes in ``y_true`` and ``y_pred`` differ. + :issue:`7239` by :user:`Hong Guangguo ` with help from + :user:`Mads Jensen ` and :user:`Nelson Liu `. + +- Support sparse contingency matrices in cluster evaluation + (:mod:`metrics.cluster.supervised`) to scale to a large number of + clusters. + :issue:`7419` by :user:`Gregory Stupp ` and `Joel Nothman`_. + +- Add ``sample_weight`` parameter to :func:`metrics.matthews_corrcoef`. + By :user:`Jatin Shah ` and `Raghav RV`_. + +- Speed up :func:`metrics.silhouette_score` by using vectorized operations. + By `Manoj Kumar`_. + +- Add ``sample_weight`` parameter to :func:`metrics.confusion_matrix`. + By :user:`Bernardo Stein `. + +Miscellaneous + +- Added ``n_jobs`` parameter to :class:`feature_selection.RFECV` to compute + the score on the test folds in parallel. By `Manoj Kumar`_ + +- Codebase does not contain C/C++ cython generated files: they are + generated during build. Distribution packages will still contain generated + C/C++ files. By :user:`Arthur Mensch `. + +- Reduce the memory usage for 32-bit float input arrays of + :func:`utils.sparse_func.mean_variance_axis` and + :func:`utils.sparse_func.incr_mean_variance_axis` by supporting cython + fused types. By :user:`YenChen Lin `. + +- The :func:`ignore_warnings` now accept a category argument to ignore only + the warnings of a specified type. By :user:`Thierry Guillemot `. 
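+
+  A minimal sketch of the new usage (assuming the helper lives in
+  ``sklearn.utils.testing``, as in this release)::
+
+      >>> from sklearn.utils.testing import ignore_warnings
+      >>> @ignore_warnings(category=DeprecationWarning)
+      ... def fit_with_deprecated_api():
+      ...     pass  # DeprecationWarnings raised here are suppressed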
+
+- Added the parameter ``return_X_y``, returning a ``(data, target)`` tuple,
+  to :func:`load_iris` (:issue:`7049`),
+  :func:`load_breast_cancer` (:issue:`7152`),
+  :func:`load_digits`,
+  :func:`load_diabetes`,
+  :func:`load_linnerud` and
+  :func:`load_boston` (:issue:`7154`)
+  by :user:`Manvendra Singh`.
+
+- Simplification of the ``clone`` function; support for estimators
+  that modify parameters in ``__init__`` is deprecated. :issue:`5540` by
+  `Andreas Müller`_.
+
+- When unpickling a scikit-learn estimator in a different version than the one
+  the estimator was trained with, a ``UserWarning`` is raised, see :ref:`the documentation
+  on model persistence ` for more details. (:issue:`7248`)
+  By `Andreas Müller`_.
+
+Bug fixes
+.........
+
+Trees and ensembles
+
+- Random forest, extra trees, decision trees and gradient boosting
+  no longer accept ``min_samples_split=1``, as at least 2 samples
+  are required to split a decision tree node. By `Arnaud Joly`_.
+
+- :class:`ensemble.VotingClassifier` now raises ``NotFittedError`` if ``predict``,
+  ``transform`` or ``predict_proba`` are called on the non-fitted estimator.
+  By `Sebastian Raschka`_.
+
+- Fixed a bug where :class:`ensemble.AdaBoostClassifier` and
+  :class:`ensemble.AdaBoostRegressor` would perform poorly if the
+  ``random_state`` was fixed
+  (:issue:`7411`). By `Joel Nothman`_.
+
+- Fixed a bug in ensembles with randomization where the ensemble would not
+  set ``random_state`` on base estimators in a pipeline or similar nesting
+  (:issue:`7411`). Note that results for :class:`ensemble.BaggingClassifier`,
+  :class:`ensemble.BaggingRegressor`, :class:`ensemble.AdaBoostClassifier`
+  and :class:`ensemble.AdaBoostRegressor` will now differ from previous
+  versions. By `Joel Nothman`_.
+
+Linear, kernelized and related models
+
+- Fixed incorrect gradient computation for ``loss='squared_epsilon_insensitive'`` in
+  :class:`linear_model.SGDClassifier` and :class:`linear_model.SGDRegressor`
+  (:issue:`6764`). By :user:`Wenhua Yang `.
+
+- Fixed a bug in :class:`linear_model.LogisticRegressionCV` where
+  ``solver='liblinear'`` did not accept ``class_weight='balanced'``
+  (:issue:`6817`). By `Tom Dupre la Tour`_.
+
+- Fixed a bug in :class:`neighbors.RadiusNeighborsClassifier` where an error
+  occurred when there were outliers being labelled and a weight function
+  specified (:issue:`6902`). By
+  `LeonieBorne `_.
+
+- Fixed the :class:`linear_model.ElasticNet` sparse decision function to
+  match the output of the dense version in the multioutput case.
+
+Decomposition, manifold learning and clustering
+
+- :class:`decomposition.RandomizedPCA`'s default number of ``iterated_power``
+  iterations is 4 instead of 3. :issue:`5141` by :user:`Giorgio Patrini `.
+
+- :func:`utils.extmath.randomized_svd` performs 4 power iterations by
+  default, instead of 0. In practice this is enough for obtaining a good
+  approximation of the true eigenvalues/vectors in the presence of noise.
+  When ``n_components`` is small (``< .1 * min(X.shape)``), ``n_iter`` is
+  set to 7, unless the user specifies a higher number. This improves
+  precision with few components. :issue:`5299` by :user:`Giorgio Patrini`.
+
+- The whiten/non-whiten inconsistency between components of
+  :class:`decomposition.PCA` and :class:`decomposition.RandomizedPCA` (now
+  factored into PCA, see the New features) is fixed. ``components_`` are
+  stored with no whitening.
+  :issue:`5299` by :user:`Giorgio Patrini `.
+
+- Fixed a bug in :func:`manifold.spectral_embedding` where the diagonal of the
+  unnormalized Laplacian matrix was incorrectly set to 1. :issue:`4995` by
+  :user:`Peter Fischer `.
+
+- Fixed incorrect initialization of :func:`utils.arpack.eigsh` on all
+  occurrences. Affects :class:`cluster.bicluster.SpectralBiclustering`,
+  :class:`decomposition.KernelPCA`, :class:`manifold.LocallyLinearEmbedding`,
+  and :class:`manifold.SpectralEmbedding` (:issue:`5012`). By
+  :user:`Peter Fischer `.
+
+- Attribute ``explained_variance_ratio_`` calculated with the SVD solver
+  of :class:`discriminant_analysis.LinearDiscriminantAnalysis` now returns
+  correct results. By :user:`JPFrancoia `.
+
+Preprocessing and feature selection
+
+- :func:`preprocessing.data._transform_selected` now always passes a copy
+  of ``X`` to the transform function when ``copy=True`` (:issue:`7194`). By
+  `Caio Oliveira `_.
+
+Model evaluation and meta-estimators
+
+- :class:`model_selection.StratifiedKFold` now raises an error if any class
+  has fewer than ``n_folds`` samples.
+  :issue:`6182` by :user:`Devashish Deshpande `.
+
+- Fixed a bug in :class:`model_selection.StratifiedShuffleSplit`
+  where train and test samples could overlap in some edge cases,
+  see :issue:`6121` for
+  more details. By `Loic Esteve`_.
+
+- Fix in :class:`sklearn.model_selection.StratifiedShuffleSplit` to
+  return splits of size ``train_size`` and ``test_size`` in all cases
+  (:issue:`6472`). By `Andreas Müller`_.
+
+- Cross-validation of :class:`OneVsOneClassifier` and
+  :class:`OneVsRestClassifier` now works with precomputed kernels.
+  :issue:`7350` by :user:`Russell Smith `.
+
+- Fixed incomplete ``predict_proba`` method delegation from
+  :class:`model_selection.GridSearchCV` to
+  :class:`linear_model.SGDClassifier` (:issue:`7159`)
+  by `Yichuan Liu `_.
+
+Metrics
+
+- Fixed a bug in :func:`metrics.silhouette_score` in which clusters of
+  size 1 were incorrectly scored. They should get a score of 0.
+  By `Joel Nothman`_.
+
+- Fixed a bug in :func:`metrics.silhouette_samples` so that it now works with
+  arbitrary labels, not just those ranging from 0 to ``n_clusters - 1``.
+
+- Fixed a bug where expected and adjusted mutual information were incorrect if
+  cluster contingency cells exceeded ``2**16``. By `Joel Nothman`_.
+
+- :func:`metrics.pairwise.pairwise_distances` now converts arrays to
+  boolean arrays when required in ``scipy.spatial.distance``.
+  :issue:`5460` by `Tom Dupre la Tour`_.
+
+- Fixed sparse input support in :func:`metrics.silhouette_score` as well as
+  in the example ``examples/text/document_clustering.py``.
+  By :user:`YenChen Lin `.
+
+- :func:`metrics.roc_curve` and :func:`metrics.precision_recall_curve` no
+  longer round ``y_score`` values when creating ROC curves; this was causing
+  problems for users with very small differences in scores (:issue:`7353`).
+
+Miscellaneous
+
+- :func:`model_selection.tests._search._check_param_grid` now works correctly
+  with all types that extend/implement ``Sequence`` (except strings),
+  including ``range`` (Python 3.x) and ``xrange`` (Python 2.x).
+  :issue:`7323` by Viacheslav Kovalevskyi.
+
+- :func:`utils.extmath.randomized_range_finder` is more numerically stable
+  when many power iterations are requested, since it applies LU normalization
+  by default. If ``n_iter < 2`` numerical issues are unlikely, thus no
+  normalization is applied. Other normalization options are available:
+  ``'none'``, ``'LU'`` and ``'QR'``.
+  :issue:`5141` by :user:`Giorgio Patrini `.
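+
+  A minimal sketch at the level of :func:`utils.extmath.randomized_svd`,
+  which relies on this range finder (the normalization option is the one
+  discussed above)::
+
+      >>> import numpy as np
+      >>> from sklearn.utils.extmath import randomized_svd
+      >>> M = np.random.RandomState(0).rand(50, 20)
+      >>> U, s, Vt = randomized_svd(M, n_components=5, n_iter=10,
+      ...                           power_iteration_normalizer='LU')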
+
+- Fixed a bug where some formats of ``scipy.sparse`` matrix, and estimators
+  with them as parameters, could not be passed to :func:`base.clone`.
+  By `Loic Esteve`_.
+
+- :func:`datasets.load_svmlight_file` is now able to read long int QID values.
+  :issue:`7101` by :user:`Ibraim Ganiev `.
+
+
+API changes summary
+-------------------
+
+Linear, kernelized and related models
+
+- ``residual_metric`` has been deprecated in :class:`linear_model.RANSACRegressor`.
+  Use ``loss`` instead. By `Manoj Kumar`_.
+
+- Access to public attributes ``.X_`` and ``.y_`` has been deprecated in
+  :class:`isotonic.IsotonicRegression`. By :user:`Jonathan Arfa `.
+
+Decomposition, manifold learning and clustering
+
+- The old :class:`mixture.DPGMM` is deprecated in favor of the new
+  :class:`mixture.BayesianGaussianMixture` (with the parameter
+  ``weight_concentration_prior_type='dirichlet_process'``).
+  The new class solves the computational
+  problems of the old class and computes the Gaussian mixture with a
+  Dirichlet process prior faster than before.
+  :issue:`7295` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.
+
+- The old :class:`mixture.VBGMM` is deprecated in favor of the new
+  :class:`mixture.BayesianGaussianMixture` (with the parameter
+  ``weight_concentration_prior_type='dirichlet_distribution'``).
+  The new class solves the computational
+  problems of the old class and computes the Variational Bayesian Gaussian
+  mixture faster than before.
+  :issue:`6651` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.
+
+- The old :class:`mixture.GMM` is deprecated in favor of the new
+  :class:`mixture.GaussianMixture`. The new class computes the Gaussian
+  mixture faster than before, and some of the computational problems of the
+  old class have been solved.
+  :issue:`6666` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.
+
+Model evaluation and meta-estimators
+
+- The :mod:`sklearn.cross_validation`, :mod:`sklearn.grid_search` and
+  :mod:`sklearn.learning_curve` modules have been deprecated and the classes
+  and functions have been reorganized into the :mod:`sklearn.model_selection`
+  module. Ref :ref:`model_selection_changes` for more information.
+  :issue:`4294` by `Raghav RV`_.
+
+- The ``grid_scores_`` attribute of :class:`model_selection.GridSearchCV`
+  and :class:`model_selection.RandomizedSearchCV` is deprecated in favor of
+  the attribute ``cv_results_``.
+  Ref :ref:`model_selection_changes` for more information.
+  :issue:`6697` by `Raghav RV`_.
+
+- The parameters ``n_iter`` or ``n_folds`` in old CV splitters are replaced
+  by the new parameter ``n_splits`` since it can provide a consistent
+  and unambiguous interface to represent the number of train-test splits.
+  :issue:`7187` by :user:`YenChen Lin `.
+
+- The ``classes`` parameter was renamed to ``labels`` in
+  :func:`metrics.hamming_loss`. :issue:`7260` by :user:`Sebastián Vanrell `.
+
+- The splitter classes ``LabelKFold``, ``LabelShuffleSplit``,
+  ``LeaveOneLabelOut`` and ``LeavePLabelOut`` are renamed to
+  :class:`model_selection.GroupKFold`,
+  :class:`model_selection.GroupShuffleSplit`,
+  :class:`model_selection.LeaveOneGroupOut`
+  and :class:`model_selection.LeavePGroupsOut` respectively.
+  Also the parameter ``labels`` in the :func:`split` method of the newly
+  renamed splitters :class:`model_selection.LeaveOneGroupOut` and
+  :class:`model_selection.LeavePGroupsOut` is renamed to
+  ``groups``. Additionally in :class:`model_selection.LeavePGroupsOut`,
+  the parameter ``n_labels`` is renamed to ``n_groups``.
+  :issue:`6660` by `Raghav RV`_.
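+
+  A minimal sketch of the renamed API (the arrays are illustrative)::
+
+      >>> import numpy as np
+      >>> from sklearn.model_selection import LeavePGroupsOut
+      >>> X, y = np.ones((6, 2)), np.arange(6)
+      >>> groups = np.array([1, 1, 2, 2, 3, 3])
+      >>> lpgo = LeavePGroupsOut(n_groups=2)  # formerly n_labels
+      >>> splits = list(lpgo.split(X, y, groups=groups))  # formerly labels=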
+ +- Error and loss names for ``scoring`` parameters are now prefixed by + ``'neg_'``, such as ``neg_mean_squared_error``. The unprefixed versions + are deprecated and will be removed in version 0.20. + :issue:`7261` by :user:`Tim Head `. + +Code Contributors +----------------- +Aditya Joshi, Alejandro, Alexander Fabisch, Alexander Loginov, Alexander +Minyushkin, Alexander Rudy, Alexandre Abadie, Alexandre Abraham, Alexandre +Gramfort, Alexandre Saint, alexfields, Alvaro Ulloa, alyssaq, Amlan Kar, +Andreas Mueller, andrew giessel, Andrew Jackson, Andrew McCulloh, Andrew +Murray, Anish Shah, Arafat, Archit Sharma, Ariel Rokem, Arnaud Joly, Arnaud +Rachez, Arthur Mensch, Ash Hoover, asnt, b0noI, Behzad Tabibian, Bernardo, +Bernhard Kratzwald, Bhargav Mangipudi, blakeflei, Boyuan Deng, Brandon Carter, +Brett Naul, Brian McFee, Caio Oliveira, Camilo Lamus, Carol Willing, Cass, +CeShine Lee, Charles Truong, Chyi-Kwei Yau, CJ Carey, codevig, Colin Ni, Dan +Shiebler, Daniel, Daniel Hnyk, David Ellis, David Nicholson, David Staub, David +Thaler, David Warshaw, Davide Lasagna, Deborah, definitelyuncertain, Didi +Bar-Zev, djipey, dsquareindia, edwinENSAE, Elias Kuthe, Elvis DOHMATOB, Ethan +White, Fabian Pedregosa, Fabio Ticconi, fisache, Florian Wilhelm, Francis, +Francis O'Donovan, Gael Varoquaux, Ganiev Ibraim, ghg, Gilles Louppe, Giorgio +Patrini, Giovanni Cherubin, Giovanni Lanzani, Glenn Qian, Gordon +Mohr, govin-vatsan, Graham Clenaghan, Greg Reda, Greg Stupp, Guillaume +Lemaitre, Gustav Mörtberg, halwai, Harizo Rajaona, Harry Mavroforakis, +hashcode55, hdmetor, Henry Lin, Hobson Lane, Hugo Bowne-Anderson, +Igor Andriushchenko, Imaculate, Inki Hwang, Isaac Sijaranamual, +Ishank Gulati, Issam Laradji, Iver Jordal, jackmartin, Jacob Schreiber, Jake +Vanderplas, James Fiedler, James Routley, Jan Zikes, Janna Brettingen, jarfa, Jason +Laska, jblackburne, jeff levesque, Jeffrey Blackburne, Jeffrey04, Jeremy Hintz, +jeremynixon, Jeroen, Jessica Yung, Jill-Jênn Vie, Jimmy Jia, Jiyuan Qian, Joel +Nothman, johannah, John, John Boersma, John Kirkham, John Moeller, +jonathan.striebel, joncrall, Jordi, Joseph Munoz, Joshua Cook, JPFrancoia, +jrfiedler, JulianKahnert, juliathebrave, kaichogami, KamalakerDadi, Kenneth +Lyons, Kevin Wang, kingjr, kjell, Konstantin Podshumok, Kornel Kielczewski, +Krishna Kalyan, krishnakalyan3, Kvle Putnam, Kyle Jackson, Lars Buitinck, +ldavid, LeiG, LeightonZhang, Leland McInnes, Liang-Chi Hsieh, Lilian Besson, +lizsz, Loic Esteve, Louis Tiao, Léonie Borne, Mads Jensen, Maniteja Nandana, +Manoj Kumar, Manvendra Singh, Marco, Mario Krell, Mark Bao, Mark Szepieniec, +Martin Madsen, MartinBpr, MaryanMorel, Massil, Matheus, Mathieu Blondel, +Mathieu Dubois, Matteo, Matthias Ekman, Max Moroz, Michael Scherer, michiaki +ariga, Mikhail Korobov, Moussa Taifi, mrandrewandrade, Mridul Seth, nadya-p, +Naoya Kanai, Nate George, Nelle Varoquaux, Nelson Liu, Nick James, +NickleDave, Nico, Nicolas Goix, Nikolay Mayorov, ningchi, nlathia, +okbalefthanded, Okhlopkov, Olivier Grisel, Panos Louridas, Paul Strickland, +Perrine Letellier, pestrickland, Peter Fischer, Pieter, Ping-Yao, Chang, +practicalswift, Preston Parry, Qimu Zheng, Rachit Kansal, Raghav RV, +Ralf Gommers, Ramana.S, Rammig, Randy Olson, Rob Alexander, Robert Lutz, +Robin Schucker, Rohan Jain, Ruifeng Zheng, Ryan Yu, Rémy Léone, saihttam, +Saiwing Yeung, Sam Shleifer, Samuel St-Jean, Sartaj Singh, Sasank Chilamkurthy, +saurabh.bansod, Scott Andrews, Scott Lowe, seales, Sebastian Raschka, Sebastian +Saeger, Sebastián Vanrell, 
Sergei Lebedev, shagun Sodhani, shanmuga cv,
Shashank Shekhar, shawpan, shengxiduan, Shota, shuckle16, Skipper Seabold,
sklearn-ci, SmedbergM, srvanrell, Sébastien Lerique, Taranjeet, themrmax,
Thierry, Thierry Guillemot, Thomas, Thomas Hallock, Thomas Moreau, Tim Head,
tKammy, toastedcornflakes, Tom, TomDLT, Toshihiro Kamishima, tracer0tong, Trent
Hauck, trevorstephens, Tue Vo, Varun, Varun Jewalikar, Viacheslav, Vighnesh
Birodkar, Vikram, Villu Ruusmann, Vinayak Mehta, walter, waterponey, Wenhua
Yang, Wenjian Huang, Will Welch, wyseguy7, xyguo, yanlend, Yaroslav Halchenko,
yelite, Yen, YenChenLin, Yichuan Liu, Yoav Ram, Yoshiki, Zheng RuiFeng, zivori, Óscar Nájera

.. currentmodule:: sklearn

.. _changes_0_17_1:

Version 0.17.1
==============

**February 18, 2016**

Changelog
---------

Bug fixes
.........


- Upgraded vendored joblib to version 0.9.4, which fixes an important bug in
  ``joblib.Parallel`` that could silently yield wrong results when working
  on datasets larger than 1MB:
  https://github.com/joblib/joblib/blob/0.9.4/CHANGES.rst

- Fixed reading of Bunch pickles generated with scikit-learn
  version <= 0.16. This can affect users who have already
  downloaded a dataset with scikit-learn 0.16 and are loading it
  with scikit-learn 0.17. See :issue:`6196` for
  how this affected :func:`datasets.fetch_20newsgroups`. By `Loic
  Esteve`_.

- Fixed a bug that prevented using ROC AUC score to perform grid search on
  several CPUs / cores on large arrays. See :issue:`6147`.
  By `Olivier Grisel`_.

- Fixed a bug that prevented the ``presort`` parameter from being set properly
  in :class:`ensemble.GradientBoostingRegressor`. See :issue:`5857`.
  By Andrew McCulloh.

- Fixed a joblib error when evaluating the perplexity of a
  :class:`decomposition.LatentDirichletAllocation` model. See :issue:`6258`.
  By Chyi-Kwei Yau.


.. _changes_0_17:

Version 0.17
============

**November 5, 2015**

Changelog
---------

New features
............

- All the Scaler classes except :class:`preprocessing.RobustScaler` can be
  fitted online by calling ``partial_fit``. By :user:`Giorgio Patrini `.

- The new class :class:`ensemble.VotingClassifier` implements a
  "majority rule" / "soft voting" ensemble classifier to combine
  estimators for classification. By `Sebastian Raschka`_.

- The new class :class:`preprocessing.RobustScaler` provides an
  alternative to :class:`preprocessing.StandardScaler` for feature-wise
  centering and range normalization that is robust to outliers.
  By :user:`Thomas Unterthiner `.

- The new class :class:`preprocessing.MaxAbsScaler` provides an
  alternative to :class:`preprocessing.MinMaxScaler` for feature-wise
  range normalization when the data is already centered or sparse.
  By :user:`Thomas Unterthiner `.

- The new class :class:`preprocessing.FunctionTransformer` turns a Python
  function into a ``Pipeline``-compatible transformer object.
  By Joe Jevnik.

- The new classes :class:`cross_validation.LabelKFold` and
  :class:`cross_validation.LabelShuffleSplit` generate train-test folds,
  respectively similar to :class:`cross_validation.KFold` and
  :class:`cross_validation.ShuffleSplit`, except that the folds are
  conditioned on a label array. By `Brian McFee`_, :user:`Jean
  Kossaifi ` and `Gilles Louppe`_.

- :class:`decomposition.LatentDirichletAllocation` implements the Latent
  Dirichlet Allocation topic model with online variational
  inference.
  By :user:`Chyi-Kwei Yau `, with code based on an implementation
  by Matt Hoffman. (:issue:`3659`)

- The new solver ``sag`` implements a Stochastic Average Gradient descent
  and is available in both :class:`linear_model.LogisticRegression` and
  :class:`linear_model.Ridge`. This solver is very efficient for large
  datasets. By :user:`Danny Sullivan ` and `Tom Dupre la Tour`_.
  (:issue:`4738`)

- The new solver ``cd`` implements a Coordinate Descent in
  :class:`decomposition.NMF`. The previous solver, based on Projected
  Gradient, is still available by setting the new parameter ``solver`` to
  ``pg``, but is deprecated and will be removed in 0.19, along with
  :class:`decomposition.ProjectedGradientNMF` and parameters ``sparseness``,
  ``eta``, ``beta`` and ``nls_max_iter``. New parameters ``alpha`` and
  ``l1_ratio`` control L1 and L2 regularization, and ``shuffle`` adds a
  shuffling step in the ``cd`` solver.
  By `Tom Dupre la Tour`_ and `Mathieu Blondel`_.

Enhancements
............
- :class:`manifold.TSNE` now supports approximate optimization via the
  Barnes-Hut method, leading to much faster fitting. By Christopher Erick Moody.
  (:issue:`4025`)

- :class:`cluster.MeanShift` now supports parallel execution,
  as implemented in the ``mean_shift`` function. By :user:`Martino
  Sorbaro `.

- :class:`naive_bayes.GaussianNB` now supports fitting with ``sample_weight``.
  By `Jan Hendrik Metzen`_.

- :class:`dummy.DummyClassifier` now supports a prior fitting strategy.
  By `Arnaud Joly`_.

- Added a ``fit_predict`` method for :class:`mixture.GMM` and subclasses.
  By :user:`Cory Lorenz `.

- Added the :func:`metrics.label_ranking_loss` metric.
  By `Arnaud Joly`_.

- Added the :func:`metrics.cohen_kappa_score` metric.

- Added a ``warm_start`` constructor parameter to the bagging ensemble
  models to increase the size of the ensemble. By :user:`Tim Head `.

- Added option to use multi-output regression metrics without averaging.
  By Konstantin Shmelkov and :user:`Michael Eickenberg`.

- Added ``stratify`` option to :func:`cross_validation.train_test_split`
  for stratified splitting. By Miroslav Batchkarov.

- The :func:`tree.export_graphviz` function now supports aesthetic
  improvements for :class:`tree.DecisionTreeClassifier` and
  :class:`tree.DecisionTreeRegressor`, including options for coloring nodes
  by their majority class or impurity, showing variable names, and using
  node proportions instead of raw sample counts. By `Trevor Stephens`_.

- Improved speed of the ``newton-cg`` solver in
  :class:`linear_model.LogisticRegression`, by avoiding loss computation.
  By `Mathieu Blondel`_ and `Tom Dupre la Tour`_.

- The ``class_weight="auto"`` heuristic in classifiers supporting
  ``class_weight`` was deprecated and replaced by the ``class_weight="balanced"``
  option, which has a simpler formula and interpretation.
  By `Hanna Wallach`_ and `Andreas Müller`_.

- Add ``class_weight`` parameter to automatically weight samples by class
  frequency for :class:`linear_model.PassiveAggressiveClassifier`. By
  `Trevor Stephens`_.

- Added backlinks from the API reference pages to the user guide. By
  `Andreas Müller`_.

- The ``labels`` parameter to :func:`sklearn.metrics.f1_score`,
  :func:`sklearn.metrics.fbeta_score`,
  :func:`sklearn.metrics.recall_score` and
  :func:`sklearn.metrics.precision_score` has been extended.
  It is now possible to ignore one or more labels, such as where
  a multiclass problem has a majority class to ignore.
  By `Joel Nothman`_.

- Add ``sample_weight`` support to :class:`linear_model.RidgeClassifier`.
  By `Trevor Stephens`_.

- Provide an option for sparse output from
  :func:`sklearn.metrics.pairwise.cosine_similarity`. By
  :user:`Jaidev Deshpande `.

- Add :func:`minmax_scale` to provide a function interface for
  :class:`MinMaxScaler`. By :user:`Thomas Unterthiner `.

- ``dump_svmlight_file`` now handles multi-label datasets.
  By Chih-Wei Chang.

- RCV1 dataset loader (:func:`sklearn.datasets.fetch_rcv1`).
  By `Tom Dupre la Tour`_.

- The "Wisconsin Breast Cancer" classical two-class classification dataset
  is now included in scikit-learn, available with
  :func:`sklearn.datasets.load_breast_cancer`.

- Upgraded to joblib 0.9.3 to benefit from the new automatic batching of
  short tasks. This makes it possible for scikit-learn to benefit from
  parallelism when many very short tasks are executed in parallel, for
  instance by the :class:`grid_search.GridSearchCV` meta-estimator
  with ``n_jobs > 1`` used with a large grid of parameters on a small
  dataset. By `Vlad Niculae`_, `Olivier Grisel`_ and `Loic Esteve`_.

- For more details about changes in joblib 0.9.3 see the release notes:
  https://github.com/joblib/joblib/blob/master/CHANGES.rst#release-093

- Improved speed (3 times per iteration) of
  :class:`decomposition.DictionaryLearning` with the coordinate descent
  method from :class:`linear_model.Lasso`. By :user:`Arthur Mensch `.

- Parallel processing (threaded) for queries of nearest neighbors
  (using the ball-tree) by Nikolay Mayorov.

- Allow :func:`datasets.make_multilabel_classification` to output
  a sparse ``y``. By Kashif Rasul.

- :class:`cluster.DBSCAN` now accepts a sparse matrix of precomputed
  distances, allowing memory-efficient distance precomputation. By
  `Joel Nothman`_.

- :class:`tree.DecisionTreeClassifier` now exposes an ``apply`` method
  for retrieving the leaf indices that samples are predicted as. By
  :user:`Daniel Galvez ` and `Gilles Louppe`_.

- Speed up decision tree regressors, random forest regressors, extra trees
  regressors and gradient boosting estimators by computing a proxy
  of the impurity improvement during the tree growth. The proxy quantity is
  such that the split that maximizes this value also maximizes the impurity
  improvement. By `Arnaud Joly`_, :user:`Jacob Schreiber `
  and `Gilles Louppe`_.

- Speed up tree based methods by reducing the number of computations needed
  when computing the impurity measure taking into account linear
  relationship of the computed statistics. The effect is particularly
  visible with extra trees and on datasets with categorical or sparse
  features. By `Arnaud Joly`_.

- :class:`ensemble.GradientBoostingRegressor` and
  :class:`ensemble.GradientBoostingClassifier` now expose an ``apply``
  method for retrieving the leaf indices each sample ends up in under
  each tree. By :user:`Jacob Schreiber `.

- Add ``sample_weight`` support to :class:`linear_model.LinearRegression`.
  By Sonny Hu. (:issue:`4881`)

- Add ``n_iter_without_progress`` to :class:`manifold.TSNE` to control
  the stopping criterion. By Santi Villalba. (:issue:`5186`)

- Added optional parameter ``random_state`` in :class:`linear_model.Ridge`
  to set the seed of the pseudo random generator used in the ``sag`` solver.
  By `Tom Dupre la Tour`_.

- Added optional parameter ``warm_start`` in
  :class:`linear_model.LogisticRegression`.
  If set to True, the solvers
  ``lbfgs``, ``newton-cg`` and ``sag`` will be initialized with the
  coefficients computed in the previous fit. By `Tom Dupre la Tour`_.

- Added ``sample_weight`` support to :class:`linear_model.LogisticRegression` for
  the ``lbfgs``, ``newton-cg``, and ``sag`` solvers. By `Valentin Stolbunov`_.
  Support added to the ``liblinear`` solver. By `Manoj Kumar`_.

- Added optional parameter ``presort`` to :class:`ensemble.GradientBoostingRegressor`
  and :class:`ensemble.GradientBoostingClassifier`, keeping default behavior
  the same. This allows gradient boosters to turn off presorting when building
  deep trees or using sparse data. By :user:`Jacob Schreiber `.

- Altered :func:`metrics.roc_curve` to drop unnecessary thresholds by
  default. By :user:`Graham Clenaghan `.

- Added :class:`feature_selection.SelectFromModel` meta-transformer which can
  be used along with estimators that have a ``coef_`` or ``feature_importances_``
  attribute to select important features of the input data. By
  :user:`Maheshakya Wijewardena `, `Joel Nothman`_ and `Manoj Kumar`_.

- Added :func:`metrics.pairwise.laplacian_kernel`. By Clyde Fare.

- :class:`covariance.GraphLasso` allows separate control of the convergence criterion
  for the Elastic-Net subproblem via the ``enet_tol`` parameter.

- Improved verbosity in :class:`decomposition.DictionaryLearning`.

- :class:`ensemble.RandomForestClassifier` and
  :class:`ensemble.RandomForestRegressor` no longer explicitly store the
  samples used in bagging, resulting in a much reduced memory footprint for
  storing random forest models.

- Added ``positive`` option to :class:`linear_model.Lars` and
  :func:`linear_model.lars_path` to force coefficients to be positive.
  (:issue:`5131`)

- Added the ``X_norm_squared`` parameter to :func:`metrics.pairwise.euclidean_distances`
  to provide precomputed squared norms for ``X``.

- Added the ``fit_predict`` method to :class:`pipeline.Pipeline`.

- Added the :func:`preprocessing.min_max_scale` function.

Bug fixes
.........

- Fixed non-determinism in :class:`dummy.DummyClassifier` with sparse
  multi-label output. By `Andreas Müller`_.

- Fixed the output shape of :class:`linear_model.RANSACRegressor` to
  ``(n_samples, )``. By `Andreas Müller`_.

- Fixed bug in :class:`decomposition.DictionaryLearning` when ``n_jobs < 0``. By
  `Andreas Müller`_.

- Fixed bug where :class:`grid_search.RandomizedSearchCV` could consume a
  lot of memory for large discrete grids. By `Joel Nothman`_.

- Fixed bug in :class:`linear_model.LogisticRegressionCV` where ``penalty`` was ignored
  in the final fit. By `Manoj Kumar`_.

- Fixed bug in :class:`ensemble.forest.ForestClassifier` when computing
  ``oob_score`` and ``X`` is a ``sparse.csc_matrix``. By :user:`Ankur Ankan `.

- All regressors now consistently handle and warn when given ``y`` that is of
  shape ``(n_samples, 1)``. By `Andreas Müller`_ and Henry Lin.
  (:issue:`5431`)

- Fix in :class:`cluster.KMeans` cluster reassignment for sparse input by
  `Lars Buitinck`_.

- Fixed a bug in :class:`lda.LDA` that could cause asymmetric covariance
  matrices when using shrinkage. By `Martin Billinger`_.

- Fixed :func:`cross_validation.cross_val_predict` for estimators with
  sparse predictions. By Buddha Prakash.

- Fixed the ``predict_proba`` method of :class:`linear_model.LogisticRegression`
  to use soft-max instead of one-vs-rest normalization. By `Manoj Kumar`_.
  (:issue:`5182`)

- Fixed the :func:`partial_fit` method of :class:`linear_model.SGDClassifier`
  when called with ``average=True``. By :user:`Andrew Lamb `.
  (:issue:`5282`)

- Dataset fetchers use different filenames under Python 2 and Python 3 to
  avoid pickling compatibility issues. By `Olivier Grisel`_.
  (:issue:`5355`)

- Fixed a bug in :class:`naive_bayes.GaussianNB` which caused classification
  results to depend on scale. By `Jake Vanderplas`_.

- Temporarily fixed :class:`linear_model.Ridge`, which was incorrect
  when fitting the intercept in the case of sparse data. The fix
  automatically changes the solver to 'sag' in this case.
  :issue:`5360` by `Tom Dupre la Tour`_.

- Fixed a performance bug in :class:`decomposition.RandomizedPCA` on data
  with a large number of features and fewer samples. (:issue:`4478`)
  By `Andreas Müller`_, `Loic Esteve`_ and :user:`Giorgio Patrini `.

- Fixed bug in :class:`cross_decomposition.PLS` that yielded unstable and
  platform dependent output, and failed on ``fit_transform``.
  By :user:`Arthur Mensch `.

- Fixes to the ``Bunch`` class used to store datasets.

- Fixed :func:`ensemble.plot_partial_dependence` ignoring the
  ``percentiles`` parameter.

- Providing a ``set`` as vocabulary in ``CountVectorizer`` no longer
  leads to inconsistent results when pickling.

- Fixed the conditions on when a precomputed Gram matrix needs to
  be recomputed in :class:`linear_model.LinearRegression`,
  :class:`linear_model.OrthogonalMatchingPursuit`,
  :class:`linear_model.Lasso` and :class:`linear_model.ElasticNet`.

- Fixed inconsistent memory layout in the coordinate descent solver
  that affected :class:`decomposition.DictionaryLearning` and
  :class:`covariance.GraphLasso`. (:issue:`5337`)
  By `Olivier Grisel`_.

- :class:`manifold.LocallyLinearEmbedding` no longer ignores the ``reg``
  parameter.

- Nearest Neighbor estimators with custom distance metrics can now be pickled.
  (:issue:`4362`)

- Fixed a bug in :class:`pipeline.FeatureUnion` where ``transformer_weights``
  were not properly handled when performing grid-searches.

- Fixed a bug in :class:`linear_model.LogisticRegression` and
  :class:`linear_model.LogisticRegressionCV` when using
  ``class_weight='balanced'`` or ``class_weight='auto'``.
  By `Tom Dupre la Tour`_.

- Fixed :issue:`5495`, a bug when doing
  ``OVR(SVC(decision_function_shape="ovr"))``. Fixed by
  :user:`Elvis Dohmatob `.


API changes summary
-------------------
- Attributes ``data_min``, ``data_max`` and ``data_range`` in
  :class:`preprocessing.MinMaxScaler` are deprecated and won't be available
  from 0.19. Instead, the class now exposes ``data_min_``, ``data_max_``
  and ``data_range_``. By :user:`Giorgio Patrini `.

- All Scaler classes now have a ``scale_`` attribute, the feature-wise
  rescaling applied by their ``transform`` methods. The old attribute ``std_``
  in :class:`preprocessing.StandardScaler` is deprecated and superseded
  by ``scale_``; it won't be available in 0.19. By :user:`Giorgio Patrini `.

- :class:`svm.SVC` and :class:`svm.NuSVC` now have a ``decision_function_shape``
  parameter to make their decision function of shape ``(n_samples, n_classes)``
  by setting ``decision_function_shape='ovr'``. This will be the default behavior
  starting in 0.19. By `Andreas Müller`_.

- Passing 1D data arrays as input to estimators is now deprecated as it
  caused confusion in how the array elements should be interpreted
  as features or as samples.
  All data arrays are now expected
  to be explicitly shaped ``(n_samples, n_features)``.
  By :user:`Vighnesh Birodkar `.

- :class:`lda.LDA` and :class:`qda.QDA` have been moved to
  :class:`discriminant_analysis.LinearDiscriminantAnalysis` and
  :class:`discriminant_analysis.QuadraticDiscriminantAnalysis`.

- The ``store_covariance`` and ``tol`` parameters have been moved from
  the fit method to the constructor in
  :class:`discriminant_analysis.LinearDiscriminantAnalysis` and the
  ``store_covariances`` and ``tol`` parameters have been moved from the
  fit method to the constructor in
  :class:`discriminant_analysis.QuadraticDiscriminantAnalysis`.

- Models inheriting from ``_LearntSelectorMixin`` will no longer support the
  transform methods (i.e., RandomForests, GradientBoosting, LogisticRegression,
  DecisionTrees, SVMs and SGD related models). Wrap these models with the
  meta-transformer :class:`feature_selection.SelectFromModel` to remove
  features (according to ``coefs_`` or ``feature_importances_``)
  which are below a certain threshold value instead.

- :class:`cluster.KMeans` re-runs cluster-assignments in case of non-convergence,
  to ensure consistency of ``predict(X)`` and ``labels_``. By
  :user:`Vighnesh Birodkar `.

- Classifier and Regressor models are now tagged as such using the
  ``_estimator_type`` attribute.

- Cross-validation iterators always provide indices into training and test set,
  not boolean masks.

- The ``decision_function`` on all regressors was deprecated and will be
  removed in 0.19. Use ``predict`` instead.

- :func:`datasets.load_lfw_pairs` is deprecated and will be removed in 0.19.
  Use :func:`datasets.fetch_lfw_pairs` instead.

- The deprecated ``hmm`` module was removed.

- The deprecated ``Bootstrap`` cross-validation iterator was removed.

- The deprecated ``Ward`` and ``WardAgglomeration`` classes have been removed.
  Use :class:`cluster.AgglomerativeClustering` instead.

- :func:`cross_validation.check_cv` is now a public function.

- The property ``residues_`` of :class:`linear_model.LinearRegression` is deprecated
  and will be removed in 0.19.

- The deprecated ``n_jobs`` parameter of :class:`linear_model.LinearRegression` has been moved
  to the constructor.

- Removed deprecated ``class_weight`` parameter from :class:`linear_model.SGDClassifier`'s ``fit``
  method. Use the construction parameter instead.

- The deprecated support for the sequence of sequences (or list of lists) multilabel
  format was removed. To convert to and from the supported binary
  indicator matrix format, use
  :class:`MultiLabelBinarizer `.

- The behavior of calling the ``inverse_transform`` method of ``pipeline.Pipeline`` will
  change in 0.19. It will no longer reshape one-dimensional input to two-dimensional input.

- The deprecated attributes ``indicator_matrix_``, ``multilabel_`` and ``classes_`` of
  :class:`preprocessing.LabelBinarizer` were removed.

- Using ``gamma=0`` in :class:`svm.SVC` and :class:`svm.SVR` to automatically set the
  gamma to ``1. / n_features`` is deprecated and will be removed in 0.19.
  Use ``gamma="auto"`` instead.
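
  A hedged sketch of the migration::

      from sklearn.svm import SVC

      # Deprecated spelling, to be removed in 0.19:
      # clf = SVC(gamma=0)

      # Explicit replacement; gamma becomes 1. / n_features at fit time:
      clf = SVC(gamma="auto")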

Code Contributors
-----------------
Aaron Schumacher, Adithya Ganesh, akitty, Alexandre Gramfort, Alexey Grigorev,
Ali Baharev, Allen Riddell, Ando Saabas, Andreas Mueller, Andrew Lamb, Anish
Shah, Ankur Ankan, Anthony Erlinger, Ari Rouvinen, Arnaud Joly, Arnaud Rachez,
Arthur Mensch, banilo, Barmaley.exe, benjaminirving, Boyuan Deng, Brett Naul,
Brian McFee, Buddha Prakash, Chi Zhang, Chih-Wei Chang, Christof Angermueller,
Christoph Gohlke, Christophe Bourguignat, Christopher Erick Moody, Chyi-Kwei
Yau, Cindy Sridharan, CJ Carey, Clyde-fare, Cory Lorenz, Dan Blanchard, Daniel
Galvez, Daniel Kronovet, Danny Sullivan, Data1010, David, David D Lowe, David
Dotson, djipey, Dmitry Spikhalskiy, Donne Martin, Dougal J. Sutherland, Dougal
Sutherland, edson duarte, Eduardo Caro, Eric Larson, Eric Martin, Erich
Schubert, Fernando Carrillo, Frank C. Eckert, Frank Zalkow, Gael Varoquaux,
Ganiev Ibraim, Gilles Louppe, Giorgio Patrini, giorgiop, Graham Clenaghan,
Gryllos Prokopis, gwulfs, Henry Lin, Hsuan-Tien Lin, Immanuel Bayer, Ishank
Gulati, Jack Martin, Jacob Schreiber, Jaidev Deshpande, Jake Vanderplas, Jan
Hendrik Metzen, Jean Kossaifi, Jeffrey04, Jeremy, jfraj, Jiali Mei,
Joe Jevnik, Joel Nothman, John Kirkham, John Wittenauer, Joseph, Joshua Loyal,
Jungkook Park, KamalakerDadi, Kashif Rasul, Keith Goodman, Kian Ho, Konstantin
Shmelkov, Kyler Brown, Lars Buitinck, Lilian Besson, Loic Esteve, Louis Tiao,
maheshakya, Maheshakya Wijewardena, Manoj Kumar, MarkTab marktab.net, Martin
Ku, Martin Spacek, MartinBpr, martinosorb, MaryanMorel, Masafumi Oyamada,
Mathieu Blondel, Matt Krump, Matti Lyra, Maxim Kolganov, mbillinger, mhg,
Michael Heilman, Michael Patterson, Miroslav Batchkarov, Nelle Varoquaux,
Nicolas, Nikolay Mayorov, Olivier Grisel, Omer Katz, Óscar Nájera, Pauli
Virtanen, Peter Fischer, Peter Prettenhofer, Phil Roth, pianomania, Preston
Parry, Raghav RV, Rob Zinkov, Robert Layton, Rohan Ramanath, Saket Choudhary,
Sam Zhang, santi, saurabh.bansod, scls19fr, Sebastian Raschka, Sebastian
Saeger, Shivan Sornarajah, SimonPL, sinhrks, Skipper Seabold, Sonny Hu, sseg,
Stephen Hoover, Steven De Gryze, Steven Seguin, Theodore Vasiloudis, Thomas
Unterthiner, Tiago Freitas Pereira, Tian Wang, Tim Head, Timothy Hopper,
tokoroten, Tom Dupré la Tour, Trevor Stephens, Valentin Stolbunov, Vighnesh
Birodkar, Vinayak Mehta, Vincent, Vincent Michel, vstolbunov, wangz10, Wei Xue,
Yucheng Low, Yury Zhauniarovich, Zac Stewart, zhai_pro, Zichen Wang

.. _changes_0_16_1:

Version 0.16.1
===============

**April 14, 2015**

Changelog
---------

Bug fixes
.........

- Allow input data larger than ``block_size`` in
  :class:`covariance.LedoitWolf` by `Andreas Müller`_.

- Fix a bug in :class:`isotonic.IsotonicRegression` deduplication that
  caused unstable result in :class:`calibration.CalibratedClassifierCV` by
  `Jan Hendrik Metzen`_.

- Fix sorting of labels in :func:`preprocessing.label_binarize` by Michael Heilman.

- Fix several stability and convergence issues in
  :class:`cross_decomposition.CCA` and
  :class:`cross_decomposition.PLSCanonical` by `Andreas Müller`_.

- Fix a bug in :class:`cluster.KMeans` when ``precompute_distances=False``
  on fortran-ordered data.

- Fix a speed regression in :class:`ensemble.RandomForestClassifier`'s ``predict``
  and ``predict_proba`` by `Andreas Müller`_.

- Fix a regression where ``utils.shuffle`` converted lists and dataframes to
  arrays, by `Olivier Grisel`_.

.. 
_changes_0_16:

Version 0.16
============

**March 26, 2015**

Highlights
-----------

- Speed improvements (notably in :class:`cluster.DBSCAN`), reduced memory
  requirements, bug-fixes and better default settings.

- Multinomial Logistic regression and a path algorithm in
  :class:`linear_model.LogisticRegressionCV`.

- Out-of-core learning of PCA via :class:`decomposition.IncrementalPCA`.

- Probability calibration of classifiers using
  :class:`calibration.CalibratedClassifierCV`.

- :class:`cluster.Birch` clustering method for large-scale datasets.

- Scalable approximate nearest neighbors search with Locality-sensitive
  hashing forests in :class:`neighbors.LSHForest`.

- Improved error messages and better validation when using malformed input data.

- More robust integration with pandas dataframes.

Changelog
---------

New features
............

- The new :class:`neighbors.LSHForest` implements locality-sensitive hashing
  for approximate nearest neighbors search. By :user:`Maheshakya Wijewardena`.

- Added :class:`svm.LinearSVR`. This class uses the liblinear implementation
  of Support Vector Regression which is much faster for large
  sample sizes than :class:`svm.SVR` with linear kernel. By
  `Fabian Pedregosa`_ and Qiang Luo.

- Incremental fit for :class:`GaussianNB `.

- Added ``sample_weight`` support to :class:`dummy.DummyClassifier` and
  :class:`dummy.DummyRegressor`. By `Arnaud Joly`_.

- Added the :func:`metrics.label_ranking_average_precision_score` metric.
  By `Arnaud Joly`_.

- Add the :func:`metrics.coverage_error` metric. By `Arnaud Joly`_.

- Added :class:`linear_model.LogisticRegressionCV`. By
  `Manoj Kumar`_, `Fabian Pedregosa`_, `Gael Varoquaux`_
  and `Alexandre Gramfort`_.

- Added ``warm_start`` constructor parameter to make it possible for any
  trained forest model to grow additional trees incrementally. By
  :user:`Laurent Direr`.

- Added ``sample_weight`` support to :class:`ensemble.GradientBoostingClassifier` and
  :class:`ensemble.GradientBoostingRegressor`. By `Peter Prettenhofer`_.

- Added :class:`decomposition.IncrementalPCA`, an implementation of the PCA
  algorithm that supports out-of-core learning with a ``partial_fit``
  method. By `Kyle Kastner`_.

- Averaged SGD for :class:`SGDClassifier `
  and :class:`SGDRegressor `. By
  :user:`Danny Sullivan `.

- Added :func:`cross_val_predict `
  function which computes cross-validated estimates. By `Luis Pedro Coelho`_.

- Added :class:`linear_model.TheilSenRegressor`, a robust
  generalized-median-based estimator. By :user:`Florian Wilhelm `.

- Added :func:`metrics.median_absolute_error`, a robust metric.
  By `Gael Varoquaux`_ and :user:`Florian Wilhelm `.

- Add :class:`cluster.Birch`, an online clustering algorithm. By
  `Manoj Kumar`_, `Alexandre Gramfort`_ and `Joel Nothman`_.

- Added shrinkage support to :class:`discriminant_analysis.LinearDiscriminantAnalysis`
  using two new solvers. By :user:`Clemens Brunner ` and `Martin Billinger`_.

- Added :class:`kernel_ridge.KernelRidge`, an implementation of
  kernelized ridge regression.
  By `Mathieu Blondel`_ and `Jan Hendrik Metzen`_.

- All solvers in :class:`linear_model.Ridge` now support ``sample_weight``.
  By `Mathieu Blondel`_.

- Added :class:`cross_validation.PredefinedSplit` cross-validation
  for fixed user-provided cross-validation folds.
  By :user:`Thomas Unterthiner `.
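
  For instance (a hedged sketch; ``-1`` in ``test_fold`` keeps a sample in
  the training set of every split)::

      import numpy as np
      from sklearn.cross_validation import PredefinedSplit

      # samples 0 and 3 are never used for testing; samples 1 and 2 form
      # one test fold and samples 4 and 5 the other
      ps = PredefinedSplit(test_fold=np.array([-1, 0, 0, -1, 1, 1]))
      for train_index, test_index in ps:
          print(train_index, test_index)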

- Added :class:`calibration.CalibratedClassifierCV`, an approach for
  calibrating the predicted probabilities of a classifier.
  By `Alexandre Gramfort`_, `Jan Hendrik Metzen`_, `Mathieu Blondel`_
  and :user:`Balazs Kegl `.


Enhancements
............

- Add option ``return_distance`` in :func:`hierarchical.ward_tree`
  to return distances between nodes for both structured and unstructured
  versions of the algorithm. By `Matteo Visconti di Oleggio Castello`_.
  The same option was added in :func:`hierarchical.linkage_tree`.
  By `Manoj Kumar`_.

- Add support for sample weights in scorer objects. Metrics with sample
  weight support will automatically benefit from it. By `Noel Dawe`_ and
  `Vlad Niculae`_.

- Added ``newton-cg`` and ``lbfgs`` solver support in
  :class:`linear_model.LogisticRegression`. By `Manoj Kumar`_.

- Add ``selection="random"`` parameter to implement stochastic coordinate
  descent for :class:`linear_model.Lasso`, :class:`linear_model.ElasticNet`
  and related. By `Manoj Kumar`_.

- Add ``sample_weight`` parameter to
  :func:`metrics.jaccard_similarity_score` and :func:`metrics.log_loss`.
  By :user:`Jatin Shah `.

- Support sparse multilabel indicator representation in
  :class:`preprocessing.LabelBinarizer` and
  :class:`multiclass.OneVsRestClassifier` (by :user:`Hamzeh Alsalhi ` with thanks
  to Rohit Sivaprasad), as well as evaluation metrics (by
  `Joel Nothman`_).

- Add support for multiclass in ``metrics.hinge_loss``. Added ``labels=None``
  as optional parameter. By Saurabh Jha.

- Add ``sample_weight`` parameter to ``metrics.hinge_loss``.
  By Saurabh Jha.

- Add ``multi_class="multinomial"`` option in
  :class:`linear_model.LogisticRegression` to implement a Logistic
  Regression solver that minimizes the cross-entropy or multinomial loss
  instead of the default One-vs-Rest setting. Supports ``lbfgs`` and
  ``newton-cg`` solvers. By `Lars Buitinck`_ and `Manoj Kumar`_. Solver option
  ``newton-cg`` by Simon Wu.

- ``DictVectorizer`` can now perform ``fit_transform`` on an iterable in a
  single pass, when giving the option ``sort=False``. By :user:`Dan
  Blanchard `.

- :class:`GridSearchCV` and :class:`RandomizedSearchCV` can now be
  configured to work with estimators that may fail and raise errors on
  individual folds. This option is controlled by the ``error_score``
  parameter. This does not affect errors raised on re-fit. By
  :user:`Michal Romaniuk `.

- Add ``digits`` parameter to ``metrics.classification_report`` to allow
  report to show different precision of floating point numbers. By
  :user:`Ian Gilmore `.

- Add a quantile prediction strategy to the :class:`dummy.DummyRegressor`.
  By :user:`Aaron Staple `.

- Add ``handle_unknown`` option to :class:`preprocessing.OneHotEncoder` to
  handle unknown categorical features more gracefully during transform.
  By `Manoj Kumar`_.

- Added support for sparse input data to decision trees and their ensembles.
  By `Fares Hedyati`_ and `Arnaud Joly`_.

- Optimized :class:`cluster.AffinityPropagation` by reducing the number of
  memory allocations of large temporary data-structures. By `Antony Lee`_.

- Parallelization of the computation of feature importances in random forest.
  By `Olivier Grisel`_ and `Arnaud Joly`_.

- Add ``n_iter_`` attribute to estimators that accept a ``max_iter`` parameter
  in their constructor. By `Manoj Kumar`_.
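
  For example (a hedged sketch; availability of ``n_iter_`` on a given
  estimator follows this entry)::

      from sklearn.datasets import make_classification
      from sklearn.linear_model import LogisticRegression

      X, y = make_classification(random_state=0)
      clf = LogisticRegression(max_iter=100).fit(X, y)
      print(clf.n_iter_)  # iterations actually run by the solver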

- Added decision function for :class:`multiclass.OneVsOneClassifier`.
  By `Raghav RV`_ and :user:`Kyle Beauchamp `.

- :func:`neighbors.kneighbors_graph` and :func:`radius_neighbors_graph`
  support non-Euclidean metrics. By `Manoj Kumar`_.

- Parameter ``connectivity`` in :class:`cluster.AgglomerativeClustering`
  and family now accepts callables that return a connectivity matrix.
  By `Manoj Kumar`_.

- Sparse support for :func:`paired_distances`. By `Joel Nothman`_.

- :class:`cluster.DBSCAN` now supports sparse input and sample weights and
  has been optimized: the inner loop has been rewritten in Cython and
  radius neighbors queries are now computed in batch. By `Joel Nothman`_
  and `Lars Buitinck`_.

- Add ``class_weight`` parameter to automatically weight samples by class
  frequency for :class:`ensemble.RandomForestClassifier`,
  :class:`tree.DecisionTreeClassifier`, :class:`ensemble.ExtraTreesClassifier`
  and :class:`tree.ExtraTreeClassifier`. By `Trevor Stephens`_.

- :class:`grid_search.RandomizedSearchCV` now does sampling without
  replacement if all parameters are given as lists. By `Andreas Müller`_.

- Parallelized calculation of :func:`pairwise_distances` is now supported
  for scipy metrics and custom callables. By `Joel Nothman`_.

- Allow the fitting and scoring of all clustering algorithms in
  :class:`pipeline.Pipeline`. By `Andreas Müller`_.

- More robust seeding and improved error messages in :class:`cluster.MeanShift`
  by `Andreas Müller`_.

- Make the stopping criterion for :class:`mixture.GMM`,
  :class:`mixture.DPGMM` and :class:`mixture.VBGMM` less dependent on the
  number of samples by thresholding the average log-likelihood change
  instead of its sum over all samples. By `Hervé Bredin`_.

- The outcome of :func:`manifold.spectral_embedding` was made deterministic
  by flipping the sign of eigenvectors. By :user:`Hasil Sharma `.

- Significant performance and memory usage improvements in
  :class:`preprocessing.PolynomialFeatures`. By `Eric Martin`_.

- Numerical stability improvements for :class:`preprocessing.StandardScaler`
  and :func:`preprocessing.scale`. By `Nicolas Goix`_.

- :class:`svm.SVC` fitted on sparse input now implements ``decision_function``.
  By `Rob Zinkov`_ and `Andreas Müller`_.

- :func:`cross_validation.train_test_split` now preserves the input type,
  instead of converting to numpy arrays.


Documentation improvements
..........................

- Added example of using :class:`FeatureUnion` for heterogeneous input.
  By :user:`Matt Terry `.

- Documentation on scorers was improved, to highlight the handling of loss
  functions. By :user:`Matt Pico `.

- A discrepancy between liblinear output and scikit-learn's wrappers
  is now noted. By `Manoj Kumar`_.

- Improved documentation generation: examples referring to a class or
  function are now shown in a gallery on the class/function's API reference
  page. By `Joel Nothman`_.

- More explicit documentation of sample generators and of data
  transformation. By `Joel Nothman`_.

- :class:`sklearn.neighbors.BallTree` and :class:`sklearn.neighbors.KDTree`
  used to point to empty pages stating that they are aliases of BinaryTree.
  This has been fixed to show the correct class docs. By `Manoj Kumar`_.

- Added silhouette plots for analysis of KMeans clustering using
  :func:`metrics.silhouette_samples` and :func:`metrics.silhouette_score`.
  See :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py`.

Bug fixes
.........
- Metaestimators now support ducktyping for the presence of ``decision_function``,
  ``predict_proba`` and other methods. This fixes behavior of
  :class:`grid_search.GridSearchCV`,
  :class:`grid_search.RandomizedSearchCV`, :class:`pipeline.Pipeline`,
  :class:`feature_selection.RFE`, :class:`feature_selection.RFECV` when nested.
  By `Joel Nothman`_.

- The ``scoring`` attribute of grid-search and cross-validation methods is no longer
  ignored when a :class:`grid_search.GridSearchCV` is given as a base estimator or
  the base estimator does not implement ``predict``.

- The function :func:`hierarchical.ward_tree` now returns the children in
  the same order for both the structured and unstructured versions. By
  `Matteo Visconti di Oleggio Castello`_.

- :class:`feature_selection.RFECV` now correctly handles cases when
  ``step`` is not equal to 1. By :user:`Nikolay Mayorov `.

- The :class:`decomposition.PCA` now undoes whitening in its
  ``inverse_transform``. Also, its ``components_`` now always have unit
  length. By :user:`Michael Eickenberg `.

- Fix incomplete download of the dataset when
  :func:`datasets.download_20newsgroups` is called. By `Manoj Kumar`_.

- Various fixes to the Gaussian processes subpackage by Vincent Dubourg
  and Jan Hendrik Metzen.

- Calling ``partial_fit`` with ``class_weight='auto'`` raises an
  appropriate error message and suggests a workaround.
  By :user:`Danny Sullivan `.

- :class:`RBFSampler ` with ``gamma=g``
  formerly approximated :func:`rbf_kernel `
  with ``gamma=g/2.``; the definition of ``gamma`` is now consistent,
  which may substantially change your results if you use a fixed value.
  (If you cross-validated over ``gamma``, it probably doesn't matter
  too much.) By :user:`Dougal Sutherland `.

- Pipeline objects delegate the ``classes_`` attribute to the underlying
  estimator. This allows, for instance, making a bagging ensemble of a
  pipeline object. By `Arnaud Joly`_.

- :class:`neighbors.NearestCentroid` now uses the median as the centroid
  when metric is set to ``manhattan``. It was using the mean before.
  By `Manoj Kumar`_.

- Fix numerical stability issues in :class:`linear_model.SGDClassifier`
  and :class:`linear_model.SGDRegressor` by clipping large gradients and
  ensuring that weight decay rescaling is always positive (for large
  l2 regularization and large learning rate values).
  By `Olivier Grisel`_.

- When ``compute_full_tree`` is set to "auto", the full tree was built when
  ``n_clusters`` was high and stopped early when ``n_clusters`` was low,
  while the behavior should be vice-versa in
  :class:`cluster.AgglomerativeClustering` (and friends).
  This has been fixed by `Manoj Kumar`_.

- Fix lazy centering of data in :func:`linear_model.enet_path` and
  :func:`linear_model.lasso_path`. It was centered around one. It has
  been changed to be centered around the origin. By `Manoj Kumar`_.

- Fix handling of precomputed affinity matrices in
  :class:`cluster.AgglomerativeClustering` when using connectivity
  constraints. By :user:`Cathy Deng `.

- Correct ``partial_fit`` handling of ``class_prior`` for
  :class:`sklearn.naive_bayes.MultinomialNB` and
  :class:`sklearn.naive_bayes.BernoulliNB`. By `Trevor Stephens`_.

- Fixed a crash in :func:`metrics.precision_recall_fscore_support`
  when using unsorted ``labels`` in the multi-label setting.
  By `Andreas Müller`_.
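
  For instance (a hedged sketch of the previously crashing call; the data
  is illustrative)::

      import numpy as np
      from sklearn.metrics import precision_recall_fscore_support

      y_true = np.array([[1, 0, 1], [0, 1, 0]])
      y_pred = np.array([[1, 0, 0], [0, 1, 1]])
      # unsorted ``labels`` in the multi-label setting used to crash
      scores = precision_recall_fscore_support(
          y_true, y_pred, labels=[2, 0, 1], average="macro")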

- Avoid skipping the first nearest neighbor in the methods ``radius_neighbors``,
  ``kneighbors``, ``kneighbors_graph`` and ``radius_neighbors_graph`` in
  :class:`sklearn.neighbors.NearestNeighbors` and family, when the query
  data is not the same as fit data. By `Manoj Kumar`_.

- Fix log-density calculation in the :class:`mixture.GMM` with
  tied covariance. By `Will Dawson`_.

- Fixed a scaling error in :class:`feature_selection.SelectFdr`
  where a factor ``n_features`` was missing. By `Andrew Tulloch`_.

- Fix zero division in :class:`neighbors.KNeighborsRegressor` and related
  classes when using distance weighting and having identical data points.
  By Garrett-R.

- Fixed round-off errors with non-positive-definite covariance matrices
  in GMM. By :user:`Alexis Mignon `.

- Fixed an error in the computation of conditional probabilities in
  :class:`naive_bayes.BernoulliNB`. By `Hanna Wallach`_.

- Make the method ``radius_neighbors`` of
  :class:`neighbors.NearestNeighbors` return the samples lying on the
  boundary for ``algorithm='brute'``. By `Yan Yi`_.

- Flip sign of ``dual_coef_`` of :class:`svm.SVC`
  to make it consistent with the documentation and
  ``decision_function``. By Artem Sobolev.

- Fixed handling of ties in :class:`isotonic.IsotonicRegression`.
  We now use the weighted average of targets (secondary method). By
  `Andreas Müller`_ and Michael Bommarito.

API changes summary
-------------------

- :class:`GridSearchCV ` and
  :func:`cross_val_score ` and other
  meta-estimators no longer convert pandas DataFrames into arrays,
  allowing DataFrame-specific operations in custom estimators.

- :func:`multiclass.fit_ovr`, :func:`multiclass.predict_ovr`,
  :func:`multiclass.predict_proba_ovr`,
  :func:`multiclass.fit_ovo`, :func:`multiclass.predict_ovo`,
  :func:`multiclass.fit_ecoc` and :func:`multiclass.predict_ecoc`
  are deprecated. Use the underlying estimators instead.

- Nearest neighbors estimators used to take arbitrary keyword arguments
  and pass these to their distance metric. This will no longer be supported
  in scikit-learn 0.18; use the ``metric_params`` argument instead.

- The ``n_jobs`` parameter of the fit method was moved to the constructor of
  the :class:`linear_model.LinearRegression` class.

- The ``predict_proba`` method of :class:`multiclass.OneVsRestClassifier`
  now returns two probabilities per sample in the binary case; this
  is consistent with other estimators and with the method's documentation,
  but previous versions accidentally returned only the positive
  probability. Fixed by Will Lamond and `Lars Buitinck`_.

- Change default value of ``precompute`` in :class:`ElasticNet` and :class:`Lasso`
  to False. Setting precompute to "auto" was found to be slower when
  ``n_samples > n_features`` since the computation of the Gram matrix is
  computationally expensive and outweighs the benefit of fitting the Gram
  for just one alpha.
  ``precompute="auto"`` is now deprecated and will be removed in 0.18.
  By `Manoj Kumar`_.

- Expose ``positive`` option in :func:`linear_model.enet_path` and
  :func:`linear_model.lasso_path` which constrains coefficients to be
  positive. By `Manoj Kumar`_.

- Users should now supply an explicit ``average`` parameter to
  :func:`sklearn.metrics.f1_score`, :func:`sklearn.metrics.fbeta_score`,
  :func:`sklearn.metrics.recall_score` and
  :func:`sklearn.metrics.precision_score` when performing multiclass
  or multilabel (i.e. not binary) classification. By `Joel Nothman`_.
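
  For example (a hedged sketch)::

      from sklearn.metrics import f1_score

      y_true = [0, 1, 2, 2]
      y_pred = [0, 2, 2, 1]
      # multiclass targets now require an explicit ``average``
      print(f1_score(y_true, y_pred, average="macro"))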

- The ``scoring`` parameter for cross validation now accepts ``'f1_micro'``,
  ``'f1_macro'`` or ``'f1_weighted'``. ``'f1'`` is now for binary classification
  only. Similar changes apply to ``'precision'`` and ``'recall'``.
  By `Joel Nothman`_.

- The ``fit_intercept``, ``normalize`` and ``return_models`` parameters in
  :func:`linear_model.enet_path` and :func:`linear_model.lasso_path` have
  been removed. They had been deprecated since 0.14.

- From now onwards, all estimators will uniformly raise ``NotFittedError``
  (:class:`utils.validation.NotFittedError`), when any of the ``predict``-like
  methods are called before the model is fit. By `Raghav RV`_.

- Input data validation was refactored for more consistent input
  validation. The ``check_arrays`` function was replaced by ``check_array``
  and ``check_X_y``. By `Andreas Müller`_.

- Allow ``X=None`` in the methods ``radius_neighbors``, ``kneighbors``,
  ``kneighbors_graph`` and ``radius_neighbors_graph`` in
  :class:`sklearn.neighbors.NearestNeighbors` and family. If set to None,
  then for every sample this avoids setting the sample itself as the
  first nearest neighbor. By `Manoj Kumar`_.

- Add parameter ``include_self`` in :func:`neighbors.kneighbors_graph`
  and :func:`neighbors.radius_neighbors_graph` which has to be explicitly
  set by the user. If set to True, then the sample itself is considered
  as the first nearest neighbor.

- The ``thresh`` parameter is deprecated in favor of the new ``tol`` parameter
  in :class:`GMM`, :class:`DPGMM` and :class:`VBGMM`. See the `Enhancements`
  section for details. By `Hervé Bredin`_.

- Estimators will treat input with dtype object as numeric when possible.
  By `Andreas Müller`_.

- Estimators now raise ``ValueError`` consistently when fitted on empty
  data (less than 1 sample or less than 1 feature for 2D input).
  By `Olivier Grisel`_.


- The ``shuffle`` option of :class:`linear_model.SGDClassifier`,
  :class:`linear_model.SGDRegressor`, :class:`linear_model.Perceptron`,
  :class:`linear_model.PassiveAggressiveClassifier` and
  :class:`linear_model.PassiveAggressiveRegressor` now defaults to ``True``.

- :class:`cluster.DBSCAN` now uses a deterministic initialization. The
  ``random_state`` parameter is deprecated. By :user:`Erich Schubert `.

Code Contributors
-----------------
A. Flaxman, Aaron Schumacher, Aaron Staple, abhishek thakur, Akshay, akshayah3,
Aldrian Obaja, Alexander Fabisch, Alexandre Gramfort, Alexis Mignon, Anders
Aagaard, Andreas Mueller, Andreas van Cranenburgh, Andrew Tulloch, Andrew
Walker, Antony Lee, Arnaud Joly, banilo, Barmaley.exe, Ben Davies, Benedikt
Koehler, bhsu, Boris Feld, Borja Ayerdi, Boyuan Deng, Brent Pedersen, Brian
Wignall, Brooke Osborn, Calvin Giles, Cathy Deng, Celeo, cgohlke, chebee7i,
Christian Stade-Schuldt, Christof Angermueller, Chyi-Kwei Yau, CJ Carey,
Clemens Brunner, Daiki Aminaka, Dan Blanchard, danfrankj, Danny Sullivan, David
Fletcher, Dmitrijs Milajevs, Dougal J.
Sutherland, Erich Schubert, Fabian
Pedregosa, Florian Wilhelm, floydsoft, Félix-Antoine Fortin, Gael Varoquaux,
Garrett-R, Gilles Louppe, gpassino, gwulfs, Hampus Bengtsson, Hamzeh Alsalhi,
Hanna Wallach, Harry Mavroforakis, Hasil Sharma, Helder, Herve Bredin,
Hsiang-Fu Yu, Hugues SALAMIN, Ian Gilmore, Ilambharathi Kanniah, Imran Haque,
isms, Jake VanderPlas, Jan Dlabal, Jan Hendrik Metzen, Jatin Shah, Javier López
Peña, jdcaballero, Jean Kossaifi, Jeff Hammerbacher, Joel Nothman, Jonathan
Helmus, Joseph, Kaicheng Zhang, Kevin Markham, Kyle Beauchamp, Kyle Kastner,
Lagacherie Matthieu, Lars Buitinck, Laurent Direr, leepei, Loic Esteve, Luis
Pedro Coelho, Lukas Michelbacher, maheshakya, Manoj Kumar, Manuel, Mario
Michael Krell, Martin, Martin Billinger, Martin Ku, Mateusz Susik, Mathieu
Blondel, Matt Pico, Matt Terry, Matteo Visconti dOC, Matti Lyra, Max Linke,
Mehdi Cherti, Michael Bommarito, Michael Eickenberg, Michal Romaniuk, MLG,
mr.Shu, Nelle Varoquaux, Nicola Montecchio, Nicolas, Nikolay Mayorov, Noel
Dawe, Okal Billy, Olivier Grisel, Óscar Nájera, Paolo Puggioni, Peter
Prettenhofer, Pratap Vardhan, pvnguyen, queqichao, Rafael Carrascosa, Raghav R
V, Rahiel Kasim, Randall Mason, Rob Zinkov, Robert Bradshaw, Saket Choudhary,
Sam Nicholls, Samuel Charron, Saurabh Jha, sethdandridge, sinhrks, snuderl,
Stefan Otte, Stefan van der Walt, Steve Tjoa, swu, Sylvain Zimmer, tejesh95,
terrycojones, Thomas Delteil, Thomas Unterthiner, Tomas Kazmar, trevorstephens,
tttthomasssss, Tzu-Ming Kuo, ugurcaliskan, ugurthemaster, Vinayak Mehta,
Vincent Dubourg, Vjacheslav Murashkin, Vlad Niculae, wadawson, Wei Xue, Will
Lamond, Wu Jiang, x0l, Xinfan Meng, Yan Yi, Yu-Chin

.. _changes_0_15_2:

Version 0.15.2
==============

**September 4, 2014**

Bug fixes
---------

- Fixed handling of the ``p`` parameter of the Minkowski distance that was
  previously ignored in nearest neighbors models. By :user:`Nikolay
  Mayorov `.

- Fixed duplicated alphas in :class:`linear_model.LassoLars` with early
  stopping on 32 bit Python. By `Olivier Grisel`_ and `Fabian Pedregosa`_.

- Fixed the build under Windows when scikit-learn is built with MSVC while
  NumPy is built with MinGW. By `Olivier Grisel`_ and :user:`Federico
  Vaggi `.

- Fixed an array index overflow bug in the coordinate descent solver. By
  `Gael Varoquaux`_.

- Better handling of numpy 1.9 deprecation warnings. By `Gael Varoquaux`_.

- Removed unnecessary data copy in :class:`cluster.KMeans`.
  By `Gael Varoquaux`_.

- Explicitly close open files to avoid ``ResourceWarnings`` under Python 3.
  By Calvin Giles.

- The ``transform`` of :class:`discriminant_analysis.LinearDiscriminantAnalysis`
  now projects the input on the most discriminant directions. By Martin Billinger.

- Fixed potential overflow in ``_tree.safe_realloc`` by `Lars Buitinck`_.

- Performance optimization in :class:`isotonic.IsotonicRegression`.
  By Robert Bradshaw.

- ``nose`` is no longer a runtime dependency to import ``sklearn``, only for
  running the tests. By `Joel Nothman`_.

- Many documentation and website fixes by `Joel Nothman`_, `Lars Buitinck`_,
  :user:`Matt Pico `, and others.

.. _changes_0_15_1:

Version 0.15.1
==============

**August 1, 2014**

Bug fixes
---------

- Made :func:`cross_validation.cross_val_score` use
  :class:`cross_validation.KFold` instead of
  :class:`cross_validation.StratifiedKFold` on multi-output classification
  problems. By :user:`Nikolay Mayorov `.
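
  A hedged sketch of an affected call (estimator and data are illustrative,
  not from the changelog)::

      import numpy as np
      from sklearn.cross_validation import cross_val_score
      from sklearn.tree import DecisionTreeClassifier

      rng = np.random.RandomState(0)
      X = rng.rand(12, 3)
      Y = rng.randint(0, 2, size=(12, 2))  # multi-output classification targets
      # 2-D targets cannot be stratified, so KFold is used instead
      scores = cross_val_score(DecisionTreeClassifier(random_state=0), X, Y, cv=3)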

- Support unseen labels in :class:`preprocessing.LabelBinarizer` to restore
  the default behavior of 0.14.1 for backward compatibility. By
  :user:`Hamzeh Alsalhi `.

- Fixed the :class:`cluster.KMeans` stopping criterion that prevented early
  convergence detection. By Edward Raff and `Gael Varoquaux`_.

- Fixed the behavior of :class:`multiclass.OneVsOneClassifier`
  in case of ties at the per-class vote level by computing the correct
  per-class sum of prediction scores. By `Andreas Müller`_.

- Made :func:`cross_validation.cross_val_score` and
  :class:`grid_search.GridSearchCV` accept Python lists as input data.
  This is especially useful for cross-validation and model selection of
  text processing pipelines. By `Andreas Müller`_.

- Fixed data input checks of most estimators to accept input data that
  implements the NumPy ``__array__`` protocol. This is the case for
  ``pandas.Series`` and ``pandas.DataFrame`` in recent versions of
  pandas. By `Gael Varoquaux`_.

- Fixed a regression for :class:`linear_model.SGDClassifier` with
  ``class_weight="auto"`` on data with non-contiguous labels. By
  `Olivier Grisel`_.


.. _changes_0_15:

Version 0.15
============

**July 15, 2014**

Highlights
-----------

- Many speed and memory improvements all across the code

- Huge speed and memory improvements to random forests (and extra
  trees) that also benefit better from parallel computing.

- Incremental fit to :class:`BernoulliRBM `

- Added :class:`cluster.AgglomerativeClustering` for hierarchical
  agglomerative clustering with average linkage, complete linkage and
  ward strategies.

- Added :class:`linear_model.RANSACRegressor` for robust regression
  models.

- Added dimensionality reduction with :class:`manifold.TSNE` which can be
  used to visualize high-dimensional data.


Changelog
---------

New features
............

- Added :class:`ensemble.BaggingClassifier` and
  :class:`ensemble.BaggingRegressor` meta-estimators for ensembling
  any kind of base estimator. See the :ref:`Bagging ` section of
  the user guide for details and examples. By `Gilles Louppe`_.

- New unsupervised feature selection algorithm
  :class:`feature_selection.VarianceThreshold`, by `Lars Buitinck`_.

- Added :class:`linear_model.RANSACRegressor` meta-estimator for the robust
  fitting of regression models. By :user:`Johannes Schönberger `.

- Added :class:`cluster.AgglomerativeClustering` for hierarchical
  agglomerative clustering with average linkage, complete linkage and
  ward strategies, by `Nelle Varoquaux`_ and `Gael Varoquaux`_.

- Shorthand constructors :func:`pipeline.make_pipeline` and
  :func:`pipeline.make_union` were added by `Lars Buitinck`_.

- Shuffle option for :class:`cross_validation.StratifiedKFold`.
  By :user:`Jeffrey Blackburne `.

- Incremental learning (``partial_fit``) for Gaussian Naive Bayes by
  Imran Haque.

- Added ``partial_fit`` to :class:`BernoulliRBM
  `. By :user:`Danny Sullivan `.

- Added :func:`learning_curve ` utility to
  chart performance with respect to training size. See
  :ref:`sphx_glr_auto_examples_model_selection_plot_learning_curve.py`. By Alexander Fabisch.

- Add ``positive`` option in :class:`LassoCV ` and
  :class:`ElasticNetCV `.
  By Brian Wignall and `Alexandre Gramfort`_.

- Added :class:`linear_model.MultiTaskElasticNetCV` and
  :class:`linear_model.MultiTaskLassoCV`. By `Manoj Kumar`_.

- Added :class:`manifold.TSNE`. By Alexander Fabisch.

Enhancements
............

- Add sparse input support to :class:`ensemble.AdaBoostClassifier` and
  :class:`ensemble.AdaBoostRegressor` meta-estimators.
  By :user:`Hamzeh Alsalhi `.

- Memory improvements of decision trees, by `Arnaud Joly`_.

- Decision trees can now be built in best-first manner by using ``max_leaf_nodes``
  as the stopping criterion. Refactored the tree code to use either a
  stack or a priority queue for tree building.
  By `Peter Prettenhofer`_ and `Gilles Louppe`_.

- Decision trees can now be fitted on fortran- and c-style arrays, and
  non-contiguous arrays without the need to make a copy.
  If the input array has a different dtype than ``np.float32``, a fortran-
  style copy will be made since fortran-style memory layout has speed
  advantages. By `Peter Prettenhofer`_ and `Gilles Louppe`_.

- Speed improvement of regression trees by optimizing the computation
  of the mean square error criterion. This led to speed improvements in
  the tree, forest and gradient boosting tree modules. By `Arnaud Joly`_.

- The ``img_to_graph`` and ``grid_to_graph`` functions in
  :mod:`sklearn.feature_extraction.image` now return ``np.ndarray``
  instead of ``np.matrix`` when ``return_as=np.ndarray``. See the
  Notes section for more information on compatibility.

- Changed the internal storage of decision trees to use a struct array.
  This fixed some small bugs, while improving code and providing a small
  speed gain. By `Joel Nothman`_.

- Reduce memory usage and overhead when fitting and predicting with forests
  of randomized trees in parallel with ``n_jobs != 1`` by leveraging the new
  threading backend of joblib 0.8 and releasing the GIL in the tree fitting
  Cython code. By `Olivier Grisel`_ and `Gilles Louppe`_.

- Speed improvement of the :mod:`sklearn.ensemble.gradient_boosting` module.
  By `Gilles Louppe`_ and `Peter Prettenhofer`_.

- Various enhancements to the :mod:`sklearn.ensemble.gradient_boosting`
  module: a ``warm_start`` argument to fit additional trees,
  a ``max_leaf_nodes`` argument to fit GBM style trees,
  a ``monitor`` fit argument to inspect the estimator during training, and
  refactoring of the verbose code. By `Peter Prettenhofer`_.

- Faster :class:`sklearn.ensemble.ExtraTrees` by caching feature values.
  By `Arnaud Joly`_.

- Faster depth-based tree building algorithms such as decision tree,
  random forest, extra trees or gradient tree boosting (with depth based
  growing strategy) by avoiding trying to split on constant features found
  in the sample subset. By `Arnaud Joly`_.

- Add ``min_weight_fraction_leaf`` pre-pruning parameter to tree-based
  methods: the minimum weighted fraction of the input samples required to be
  at a leaf node. By `Noel Dawe`_.

- Added :func:`metrics.pairwise_distances_argmin_min`, by Philippe Gervais.

- Added predict method to :class:`cluster.AffinityPropagation` and
  :class:`cluster.MeanShift`, by `Mathieu Blondel`_.

- Vector and matrix multiplications have been optimised throughout the
  library by `Denis Engemann`_, and `Alexandre Gramfort`_.
  In particular, they should take less memory with older NumPy versions
  (prior to 1.7.2).

- Precision-recall and ROC examples now use train_test_split, and have more
  explanation of why these metrics are useful. By `Kyle Kastner`_.

- The training algorithm for :class:`decomposition.NMF` is faster for
  sparse matrices and has much lower memory complexity, meaning it will
  scale up gracefully to large datasets. By `Lars Buitinck`_.
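
  A minimal sketch (shapes and parameters are illustrative)::

      import scipy.sparse as sp
      from sklearn.decomposition import NMF

      X = sp.rand(100, 50, density=0.05, random_state=0)  # sparse, non-negative
      W = NMF(n_components=5, random_state=0).fit_transform(X)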

- Added an ``svd_method`` option with default value ``"randomized"`` to
  :class:`decomposition.FactorAnalysis` to save memory and
  significantly speed up computation by `Denis Engemann`_, and
  `Alexandre Gramfort`_.

- Changed :class:`cross_validation.StratifiedKFold` to try and
  preserve as much of the original ordering of samples as possible so as
  not to hide overfitting on datasets with a non-negligible level of
  sample dependency.
  By `Daniel Nouri`_ and `Olivier Grisel`_.

- Add multi-output support to :class:`gaussian_process.GaussianProcess`
  by John Novak.

- Support for precomputed distance matrices in nearest neighbor estimators
  by `Robert Layton`_ and `Joel Nothman`_.

- Norm computations optimized for NumPy 1.6 and later versions by
  `Lars Buitinck`_. In particular, the k-means algorithm no longer
  needs a temporary data structure the size of its input.

- :class:`dummy.DummyClassifier` can now be used to predict a constant
  output value. By `Manoj Kumar`_.

- :class:`dummy.DummyRegressor` now has a ``strategy`` parameter which allows
  predicting the mean, the median of the training set, or a constant
  output value. By :user:`Maheshakya Wijewardena `.

- Multi-label classification output in multilabel indicator format
  is now supported by :func:`metrics.roc_auc_score` and
  :func:`metrics.average_precision_score` by `Arnaud Joly`_.

- Significant performance improvements (more than 100x speedup for
  large problems) in :class:`isotonic.IsotonicRegression` by
  `Andrew Tulloch`_.

- Speed and memory usage improvements to the SGD algorithm for linear
  models: it now uses threads, not separate processes, when ``n_jobs>1``.
  By `Lars Buitinck`_.

- Grid search and cross validation allow NaNs in the input arrays so that
  preprocessors such as :class:`preprocessing.Imputer
  ` can be trained within the cross validation loop,
  avoiding potentially skewed results.

- Ridge regression can now deal with sample weights in feature space
  (previously only in sample space). By :user:`Michael Eickenberg `.
  Both solutions are provided by the Cholesky solver.

- Several classification and regression metrics now support weighted
  samples with the new ``sample_weight`` argument:
  :func:`metrics.accuracy_score`,
  :func:`metrics.zero_one_loss`,
  :func:`metrics.precision_score`,
  :func:`metrics.average_precision_score`,
  :func:`metrics.f1_score`,
  :func:`metrics.fbeta_score`,
  :func:`metrics.recall_score`,
  :func:`metrics.roc_auc_score`,
  :func:`metrics.explained_variance_score`,
  :func:`metrics.mean_squared_error`,
  :func:`metrics.mean_absolute_error`,
  :func:`metrics.r2_score`.
  By `Noel Dawe`_.

- Speed up of the sample generator
  :func:`datasets.make_multilabel_classification`. By `Joel Nothman`_.

Documentation improvements
...........................

- The :ref:`Working With Text Data ` tutorial
  has now been worked into the main documentation's tutorial section.
  Includes exercises and skeletons for tutorial presentation.
  Original tutorial created by several authors including
  `Olivier Grisel`_, Lars Buitinck and many others.
  Tutorial integration into the scikit-learn documentation
  by `Jaques Grobler`_.

- Added :ref:`Computational Performance `
  documentation. Discussion and examples of prediction latency / throughput
  and different factors that have influence over speed. Additional tips for
  building faster models and choosing a relevant compromise between speed
  and predictive power.
  By :user:`Eustache Diemert `.
+ +Bug fixes +......... + +- Fixed bug in :class:`decomposition.MiniBatchDictionaryLearning` : + ``partial_fit`` was not working properly. + +- Fixed bug in :class:`linear_model.stochastic_gradient` : + ``l1_ratio`` was used as ``(1.0 - l1_ratio)`` . + +- Fixed bug in :class:`multiclass.OneVsOneClassifier` with string + labels + +- Fixed a bug in :class:`LassoCV ` and + :class:`ElasticNetCV `: they would not + pre-compute the Gram matrix with ``precompute=True`` or + ``precompute="auto"`` and ``n_samples > n_features``. By `Manoj Kumar`_. + +- Fixed incorrect estimation of the degrees of freedom in + :func:`feature_selection.f_regression` when variates are not centered. + By :user:`Virgile Fritsch `. + +- Fixed a race condition in parallel processing with + ``pre_dispatch != "all"`` (for instance, in ``cross_val_score``). + By `Olivier Grisel`_. + +- Raise error in :class:`cluster.FeatureAgglomeration` and + :class:`cluster.WardAgglomeration` when no samples are given, + rather than returning meaningless clustering. + +- Fixed bug in :class:`gradient_boosting.GradientBoostingRegressor` with + ``loss='huber'``: ``gamma`` might have not been initialized. + +- Fixed feature importances as computed with a forest of randomized trees + when fit with ``sample_weight != None`` and/or with ``bootstrap=True``. + By `Gilles Louppe`_. + +API changes summary +------------------- + +- :mod:`sklearn.hmm` is deprecated. Its removal is planned + for the 0.17 release. + +- Use of :class:`covariance.EllipticEnvelop` has now been removed after + deprecation. + Please use :class:`covariance.EllipticEnvelope` instead. + +- :class:`cluster.Ward` is deprecated. Use + :class:`cluster.AgglomerativeClustering` instead. + +- :class:`cluster.WardClustering` is deprecated. Use +- :class:`cluster.AgglomerativeClustering` instead. + +- :class:`cross_validation.Bootstrap` is deprecated. + :class:`cross_validation.KFold` or + :class:`cross_validation.ShuffleSplit` are recommended instead. + +- Direct support for the sequence of sequences (or list of lists) multilabel + format is deprecated. To convert to and from the supported binary + indicator matrix format, use + :class:`MultiLabelBinarizer `. + By `Joel Nothman`_. + +- Add score method to :class:`PCA ` following the model of + probabilistic PCA and deprecate + :class:`ProbabilisticPCA ` model whose + score implementation is not correct. The computation now also exploits the + matrix inversion lemma for faster computation. By `Alexandre Gramfort`_. + +- The score method of :class:`FactorAnalysis ` + now returns the average log-likelihood of the samples. Use score_samples + to get log-likelihood of each sample. By `Alexandre Gramfort`_. + +- Generating boolean masks (the setting ``indices=False``) + from cross-validation generators is deprecated. + Support for masks will be removed in 0.17. + The generators have produced arrays of indices by default since 0.10. + By `Joel Nothman`_. + +- 1-d arrays containing strings with ``dtype=object`` (as used in Pandas) + are now considered valid classification targets. This fixes a regression + from version 0.13 in some classifiers. By `Joel Nothman`_. + +- Fix wrong ``explained_variance_ratio_`` attribute in + :class:`RandomizedPCA `. + By `Alexandre Gramfort`_. + +- Fit alphas for each ``l1_ratio`` instead of ``mean_l1_ratio`` in + :class:`linear_model.ElasticNetCV` and :class:`linear_model.LassoCV`. 
+ This changes the shape of ``alphas_`` from ``(n_alphas,)`` to + ``(n_l1_ratio, n_alphas)`` if the ``l1_ratio`` provided is a 1-D array like + object of length greater than one. + By `Manoj Kumar`_. + +- Fix :class:`linear_model.ElasticNetCV` and :class:`linear_model.LassoCV` + when fitting intercept and input data is sparse. The automatic grid + of alphas was not computed correctly and the scaling with normalize + was wrong. By `Manoj Kumar`_. + +- Fix wrong maximal number of features drawn (``max_features``) at each split + for decision trees, random forests and gradient tree boosting. + Previously, the count for the number of drawn features started only after + one non constant features in the split. This bug fix will affect + computational and generalization performance of those algorithms in the + presence of constant features. To get back previous generalization + performance, you should modify the value of ``max_features``. + By `Arnaud Joly`_. + +- Fix wrong maximal number of features drawn (``max_features``) at each split + for :class:`ensemble.ExtraTreesClassifier` and + :class:`ensemble.ExtraTreesRegressor`. Previously, only non constant + features in the split was counted as drawn. Now constant features are + counted as drawn. Furthermore at least one feature must be non constant + in order to make a valid split. This bug fix will affect + computational and generalization performance of extra trees in the + presence of constant features. To get back previous generalization + performance, you should modify the value of ``max_features``. + By `Arnaud Joly`_. + +- Fix :func:`utils.compute_class_weight` when ``class_weight=="auto"``. + Previously it was broken for input of non-integer ``dtype`` and the + weighted array that was returned was wrong. By `Manoj Kumar`_. + +- Fix :class:`cross_validation.Bootstrap` to return ``ValueError`` + when ``n_train + n_test > n``. By :user:`Ronald Phlypo `. + + +People +------ + +List of contributors for release 0.15 by number of commits. 
+ +* 312 Olivier Grisel +* 275 Lars Buitinck +* 221 Gael Varoquaux +* 148 Arnaud Joly +* 134 Johannes Schönberger +* 119 Gilles Louppe +* 113 Joel Nothman +* 111 Alexandre Gramfort +* 95 Jaques Grobler +* 89 Denis Engemann +* 83 Peter Prettenhofer +* 83 Alexander Fabisch +* 62 Mathieu Blondel +* 60 Eustache Diemert +* 60 Nelle Varoquaux +* 49 Michael Bommarito +* 45 Manoj-Kumar-S +* 28 Kyle Kastner +* 26 Andreas Mueller +* 22 Noel Dawe +* 21 Maheshakya Wijewardena +* 21 Brooke Osborn +* 21 Hamzeh Alsalhi +* 21 Jake VanderPlas +* 21 Philippe Gervais +* 19 Bala Subrahmanyam Varanasi +* 12 Ronald Phlypo +* 10 Mikhail Korobov +* 8 Thomas Unterthiner +* 8 Jeffrey Blackburne +* 8 eltermann +* 8 bwignall +* 7 Ankit Agrawal +* 7 CJ Carey +* 6 Daniel Nouri +* 6 Chen Liu +* 6 Michael Eickenberg +* 6 ugurthemaster +* 5 Aaron Schumacher +* 5 Baptiste Lagarde +* 5 Rajat Khanduja +* 5 Robert McGibbon +* 5 Sergio Pascual +* 4 Alexis Metaireau +* 4 Ignacio Rossi +* 4 Virgile Fritsch +* 4 Sebastian Säger +* 4 Ilambharathi Kanniah +* 4 sdenton4 +* 4 Robert Layton +* 4 Alyssa +* 4 Amos Waterland +* 3 Andrew Tulloch +* 3 murad +* 3 Steven Maude +* 3 Karol Pysniak +* 3 Jacques Kvam +* 3 cgohlke +* 3 cjlin +* 3 Michael Becker +* 3 hamzeh +* 3 Eric Jacobsen +* 3 john collins +* 3 kaushik94 +* 3 Erwin Marsi +* 2 csytracy +* 2 LK +* 2 Vlad Niculae +* 2 Laurent Direr +* 2 Erik Shilts +* 2 Raul Garreta +* 2 Yoshiki Vázquez Baeza +* 2 Yung Siang Liau +* 2 abhishek thakur +* 2 James Yu +* 2 Rohit Sivaprasad +* 2 Roland Szabo +* 2 amormachine +* 2 Alexis Mignon +* 2 Oscar Carlsson +* 2 Nantas Nardelli +* 2 jess010 +* 2 kowalski87 +* 2 Andrew Clegg +* 2 Federico Vaggi +* 2 Simon Frid +* 2 Félix-Antoine Fortin +* 1 Ralf Gommers +* 1 t-aft +* 1 Ronan Amicel +* 1 Rupesh Kumar Srivastava +* 1 Ryan Wang +* 1 Samuel Charron +* 1 Samuel St-Jean +* 1 Fabian Pedregosa +* 1 Skipper Seabold +* 1 Stefan Walk +* 1 Stefan van der Walt +* 1 Stephan Hoyer +* 1 Allen Riddell +* 1 Valentin Haenel +* 1 Vijay Ramesh +* 1 Will Myers +* 1 Yaroslav Halchenko +* 1 Yoni Ben-Meshulam +* 1 Yury V. Zaytsev +* 1 adrinjalali +* 1 ai8rahim +* 1 alemagnani +* 1 alex +* 1 benjamin wilson +* 1 chalmerlowe +* 1 dzikie drożdże +* 1 jamestwebber +* 1 matrixorz +* 1 popo +* 1 samuela +* 1 François Boulogne +* 1 Alexander Measure +* 1 Ethan White +* 1 Guilherme Trein +* 1 Hendrik Heuer +* 1 IvicaJovic +* 1 Jan Hendrik Metzen +* 1 Jean Michel Rouly +* 1 Eduardo Ariño de la Rubia +* 1 Jelle Zijlstra +* 1 Eddy L O Jansson +* 1 Denis +* 1 John +* 1 John Schmidt +* 1 Jorge Cañardo Alastuey +* 1 Joseph Perla +* 1 Joshua Vredevoogd +* 1 José Ricardo +* 1 Julien Miotte +* 1 Kemal Eren +* 1 Kenta Sato +* 1 David Cournapeau +* 1 Kyle Kelley +* 1 Daniele Medri +* 1 Laurent Luce +* 1 Laurent Pierron +* 1 Luis Pedro Coelho +* 1 DanielWeitzenfeld +* 1 Craig Thompson +* 1 Chyi-Kwei Yau +* 1 Matthew Brett +* 1 Matthias Feurer +* 1 Max Linke +* 1 Chris Filo Gorgolewski +* 1 Charles Earl +* 1 Michael Hanke +* 1 Michele Orrù +* 1 Bryan Lunt +* 1 Brian Kearns +* 1 Paul Butler +* 1 Paweł Mandera +* 1 Peter +* 1 Andrew Ash +* 1 Pietro Zambelli +* 1 staubda + + +.. _changes_0_14: + +Version 0.14 +=============== + +**August 7, 2013** + +Changelog +--------- + +- Missing values with sparse and dense matrices can be imputed with the + transformer :class:`preprocessing.Imputer` by `Nicolas Trésegnie`_. + +- The core implementation of decisions trees has been rewritten from + scratch, allowing for faster tree induction and lower memory + consumption in all tree-based estimators. 
By `Gilles Louppe`_. + +- Added :class:`ensemble.AdaBoostClassifier` and + :class:`ensemble.AdaBoostRegressor`, by `Noel Dawe`_ and + `Gilles Louppe`_. See the :ref:`AdaBoost ` section of the user + guide for details and examples. + +- Added :class:`grid_search.RandomizedSearchCV` and + :class:`grid_search.ParameterSampler` for randomized hyperparameter + optimization. By `Andreas Müller`_. + +- Added :ref:`biclustering ` algorithms + (:class:`sklearn.cluster.bicluster.SpectralCoclustering` and + :class:`sklearn.cluster.bicluster.SpectralBiclustering`), data + generation methods (:func:`sklearn.datasets.make_biclusters` and + :func:`sklearn.datasets.make_checkerboard`), and scoring metrics + (:func:`sklearn.metrics.consensus_score`). By `Kemal Eren`_. + +- Added :ref:`Restricted Boltzmann Machines` + (:class:`neural_network.BernoulliRBM`). By `Yann Dauphin`_. + +- Python 3 support by :user:`Justin Vincent `, `Lars Buitinck`_, + :user:`Subhodeep Moitra ` and `Olivier Grisel`_. All tests now pass under + Python 3.3. + +- Ability to pass one penalty (alpha value) per target in + :class:`linear_model.Ridge`, by @eickenberg and `Mathieu Blondel`_. + +- Fixed :mod:`sklearn.linear_model.stochastic_gradient.py` L2 regularization + issue (minor practical significance). + By :user:`Norbert Crombach ` and `Mathieu Blondel`_ . + +- Added an interactive version of `Andreas Müller`_'s + `Machine Learning Cheat Sheet (for scikit-learn) + `_ + to the documentation. See :ref:`Choosing the right estimator `. + By `Jaques Grobler`_. + +- :class:`grid_search.GridSearchCV` and + :func:`cross_validation.cross_val_score` now support the use of advanced + scoring function such as area under the ROC curve and f-beta scores. + See :ref:`scoring_parameter` for details. By `Andreas Müller`_ + and `Lars Buitinck`_. + Passing a function from :mod:`sklearn.metrics` as ``score_func`` is + deprecated. + +- Multi-label classification output is now supported by + :func:`metrics.accuracy_score`, :func:`metrics.zero_one_loss`, + :func:`metrics.f1_score`, :func:`metrics.fbeta_score`, + :func:`metrics.classification_report`, + :func:`metrics.precision_score` and :func:`metrics.recall_score` + by `Arnaud Joly`_. + +- Two new metrics :func:`metrics.hamming_loss` and + :func:`metrics.jaccard_similarity_score` + are added with multi-label support by `Arnaud Joly`_. + +- Speed and memory usage improvements in + :class:`feature_extraction.text.CountVectorizer` and + :class:`feature_extraction.text.TfidfVectorizer`, + by Jochen Wersdörfer and Roman Sinayev. + +- The ``min_df`` parameter in + :class:`feature_extraction.text.CountVectorizer` and + :class:`feature_extraction.text.TfidfVectorizer`, which used to be 2, + has been reset to 1 to avoid unpleasant surprises (empty vocabularies) + for novice users who try it out on tiny document collections. + A value of at least 2 is still recommended for practical use. + +- :class:`svm.LinearSVC`, :class:`linear_model.SGDClassifier` and + :class:`linear_model.SGDRegressor` now have a ``sparsify`` method that + converts their ``coef_`` into a sparse matrix, meaning stored models + trained using these estimators can be made much more compact. + +- :class:`linear_model.SGDClassifier` now produces multiclass probability + estimates when trained under log loss or modified Huber loss. + +- Hyperlinks to documentation in example code on the website by + :user:`Martin Luessi `. 
+
- Fixed bug in :class:`preprocessing.MinMaxScaler` causing incorrect scaling
  of the features for non-default ``feature_range`` settings. By `Andreas
  Müller`_.
+
- ``max_features`` in :class:`tree.DecisionTreeClassifier`,
  :class:`tree.DecisionTreeRegressor` and all derived ensemble estimators
  now supports percentage values. By `Gilles Louppe`_.
+
- Performance improvements in :class:`isotonic.IsotonicRegression` by
  `Nelle Varoquaux`_.
+
- :func:`metrics.accuracy_score` has an option ``normalize`` to return
  the fraction or the number of correctly classified samples
  by `Arnaud Joly`_.
+
- Added :func:`metrics.log_loss` that computes log loss, aka cross-entropy
  loss. By Jochen Wersdörfer and `Lars Buitinck`_.
+
- A bug that caused :class:`ensemble.AdaBoostClassifier` to output
  incorrect probabilities has been fixed.
+
- Feature selectors now share a mixin providing consistent ``transform``,
  ``inverse_transform`` and ``get_support`` methods. By `Joel Nothman`_.
+
- A fitted :class:`grid_search.GridSearchCV` or
  :class:`grid_search.RandomizedSearchCV` can now generally be pickled.
  By `Joel Nothman`_.
+
- Refactored and vectorized implementation of :func:`metrics.roc_curve`
  and :func:`metrics.precision_recall_curve`. By `Joel Nothman`_.
+
- The new estimator :class:`sklearn.decomposition.TruncatedSVD`
  performs dimensionality reduction using SVD on sparse matrices,
  and can be used for latent semantic analysis (LSA).
  By `Lars Buitinck`_.
+
- Added self-contained example of out-of-core learning on text data
  :ref:`sphx_glr_auto_examples_applications_plot_out_of_core_classification.py`.
  By :user:`Eustache Diemert <oddskool>`.
+
- The default number of components for
  :class:`sklearn.decomposition.RandomizedPCA` is now correctly documented
  to be ``n_features``. This was the default behavior, so programs using it
  will continue to work as they did.
+
- :class:`sklearn.cluster.KMeans` now fits several orders of magnitude
  faster on sparse data (the speedup depends on the sparsity). By
  `Lars Buitinck`_.
+
- Reduced memory footprint of FastICA by `Denis Engemann`_ and
  `Alexandre Gramfort`_.
+
- Verbose output in :mod:`sklearn.ensemble.gradient_boosting` now uses
  a column format and prints progress with decreasing frequency.
  It also shows the remaining time. By `Peter Prettenhofer`_.
+
- :mod:`sklearn.ensemble.gradient_boosting` provides out-of-bag improvement
  :attr:`~sklearn.ensemble.GradientBoostingRegressor.oob_improvement_`
  rather than the OOB score for model selection. An example that shows
  how to use OOB estimates to select the number of trees was added.
  By `Peter Prettenhofer`_.
+
- Most metrics now support string labels for multiclass classification
  by `Arnaud Joly`_ and `Lars Buitinck`_.
+
- New OrthogonalMatchingPursuitCV class by `Alexandre Gramfort`_
  and `Vlad Niculae`_.
+
- Fixed a bug in :class:`sklearn.covariance.GraphLassoCV`: the
  'alphas' parameter now works as expected when given a list of
  values. By Philippe Gervais.
+
- Fixed an important bug in :class:`sklearn.covariance.GraphLassoCV`
  that prevented all folds provided by a CV object from being used (only
  the first 3 were used). When providing a CV object, execution
  time may thus increase significantly compared to the previous
  version (the results are now correct). By Philippe Gervais.
+
- :func:`cross_validation.cross_val_score` and the :mod:`grid_search`
  module are now tested with multi-output data by `Arnaud Joly`_.
+
- :func:`datasets.make_multilabel_classification` can now return
  the output in label indicator multilabel format by `Arnaud Joly`_.
+
- K-nearest neighbors estimators, :class:`neighbors.KNeighborsClassifier`
  and :class:`neighbors.KNeighborsRegressor`,
  and radius neighbors estimators, :class:`neighbors.RadiusNeighborsClassifier`
  and :class:`neighbors.RadiusNeighborsRegressor`, support multioutput data
  by `Arnaud Joly`_.
+
- Random state in LibSVM-based estimators (:class:`svm.SVC`, :class:`svm.NuSVC`,
  :class:`svm.OneClassSVM`, :class:`svm.SVR`, :class:`svm.NuSVR`) can now be
  controlled. This is useful to ensure consistency in the probability
  estimates for the classifiers trained with ``probability=True``. By
  `Vlad Niculae`_.
+
- Out-of-core learning support for discrete naive Bayes classifiers
  :class:`sklearn.naive_bayes.MultinomialNB` and
  :class:`sklearn.naive_bayes.BernoulliNB` by adding the ``partial_fit``
  method by `Olivier Grisel`_.
+
- New website design and navigation by `Gilles Louppe`_, `Nelle Varoquaux`_,
  Vincent Michel and `Andreas Müller`_.
+
- Improved documentation on :ref:`multi-class, multi-label and multi-output
  classification <multiclass>` by `Yannick Schwartz`_ and `Arnaud Joly`_.
+
- Better input and error handling in the :mod:`metrics` module by
  `Arnaud Joly`_ and `Joel Nothman`_.
+
- Speed optimization of the :mod:`hmm` module by :user:`Mikhail Korobov <kmike>`.
+
- Significant speed improvements for :class:`sklearn.cluster.DBSCAN`
  by `cleverless <https://github.com/cleverless>`_.
+

API changes summary
-------------------
+
- The :func:`auc_score` was renamed :func:`roc_auc_score`.
+
- Testing scikit-learn with ``sklearn.test()`` is deprecated. Use
  ``nosetests sklearn`` from the command line.
+
- Feature importances in :class:`tree.DecisionTreeClassifier`,
  :class:`tree.DecisionTreeRegressor` and all derived ensemble estimators
  are now computed on the fly when accessing the ``feature_importances_``
  attribute. Setting ``compute_importances=True`` is no longer required.
  By `Gilles Louppe`_.
+
- :func:`linear_model.lasso_path` and
  :func:`linear_model.enet_path` can return their results in the same
  format as that of :func:`linear_model.lars_path`. This is done by
  setting the ``return_models`` parameter to ``False``. By
  `Jaques Grobler`_ and `Alexandre Gramfort`_.
+
- :class:`grid_search.IterGrid` was renamed to
  :class:`grid_search.ParameterGrid`.
+
- Fixed bug in :class:`KFold` causing imperfect class balance in some
  cases. By `Alexandre Gramfort`_ and Tadej Janež.
+
- :class:`sklearn.neighbors.BallTree` has been refactored, and a
  :class:`sklearn.neighbors.KDTree` has been
  added which shares the same interface. The Ball Tree now works with
  a wide variety of distance metrics. Both classes have many new
  methods, including single-tree and dual-tree queries, breadth-first
  and depth-first searching, and more advanced queries such as
  kernel density estimation and 2-point correlation functions.
  By `Jake Vanderplas`_.
+
- Support for ``scipy.spatial.cKDTree`` within neighbors queries has been
  removed, and the functionality replaced with the new :class:`KDTree`
  class.
+
- :class:`sklearn.neighbors.KernelDensity` has been added, which performs
  efficient kernel density estimation with a variety of kernels.
+
- :class:`sklearn.decomposition.KernelPCA` now always returns output with
  ``n_components`` components, unless the new parameter ``remove_zero_eig``
  is set to ``True``.
This new behavior is consistent with the way + kernel PCA was always documented; previously, the removal of components + with zero eigenvalues was tacitly performed on all data. + +- ``gcv_mode="auto"`` no longer tries to perform SVD on a densified + sparse matrix in :class:`sklearn.linear_model.RidgeCV`. + +- Sparse matrix support in :class:`sklearn.decomposition.RandomizedPCA` + is now deprecated in favor of the new ``TruncatedSVD``. + +- :class:`cross_validation.KFold` and + :class:`cross_validation.StratifiedKFold` now enforce `n_folds >= 2` + otherwise a ``ValueError`` is raised. By `Olivier Grisel`_. + +- :func:`datasets.load_files`'s ``charset`` and ``charset_errors`` + parameters were renamed ``encoding`` and ``decode_errors``. + +- Attribute ``oob_score_`` in :class:`sklearn.ensemble.GradientBoostingRegressor` + and :class:`sklearn.ensemble.GradientBoostingClassifier` + is deprecated and has been replaced by ``oob_improvement_`` . + +- Attributes in OrthogonalMatchingPursuit have been deprecated + (copy_X, Gram, ...) and precompute_gram renamed precompute + for consistency. See #2224. + +- :class:`sklearn.preprocessing.StandardScaler` now converts integer input + to float, and raises a warning. Previously it rounded for dense integer + input. + +- :class:`sklearn.multiclass.OneVsRestClassifier` now has a + ``decision_function`` method. This will return the distance of each + sample from the decision boundary for each class, as long as the + underlying estimators implement the ``decision_function`` method. + By `Kyle Kastner`_. + +- Better input validation, warning on unexpected shapes for y. + +People +------ +List of contributors for release 0.14 by number of commits. + + * 277 Gilles Louppe + * 245 Lars Buitinck + * 187 Andreas Mueller + * 124 Arnaud Joly + * 112 Jaques Grobler + * 109 Gael Varoquaux + * 107 Olivier Grisel + * 102 Noel Dawe + * 99 Kemal Eren + * 79 Joel Nothman + * 75 Jake VanderPlas + * 73 Nelle Varoquaux + * 71 Vlad Niculae + * 65 Peter Prettenhofer + * 64 Alexandre Gramfort + * 54 Mathieu Blondel + * 38 Nicolas Trésegnie + * 35 eustache + * 27 Denis Engemann + * 25 Yann N. Dauphin + * 19 Justin Vincent + * 17 Robert Layton + * 15 Doug Coleman + * 14 Michael Eickenberg + * 13 Robert Marchman + * 11 Fabian Pedregosa + * 11 Philippe Gervais + * 10 Jim Holmström + * 10 Tadej Janež + * 10 syhw + * 9 Mikhail Korobov + * 9 Steven De Gryze + * 8 sergeyf + * 7 Ben Root + * 7 Hrishikesh Huilgolkar + * 6 Kyle Kastner + * 6 Martin Luessi + * 6 Rob Speer + * 5 Federico Vaggi + * 5 Raul Garreta + * 5 Rob Zinkov + * 4 Ken Geis + * 3 A. 
Flaxman + * 3 Denton Cockburn + * 3 Dougal Sutherland + * 3 Ian Ozsvald + * 3 Johannes Schönberger + * 3 Robert McGibbon + * 3 Roman Sinayev + * 3 Szabo Roland + * 2 Diego Molla + * 2 Imran Haque + * 2 Jochen Wersdörfer + * 2 Sergey Karayev + * 2 Yannick Schwartz + * 2 jamestwebber + * 1 Abhijeet Kolhe + * 1 Alexander Fabisch + * 1 Bastiaan van den Berg + * 1 Benjamin Peterson + * 1 Daniel Velkov + * 1 Fazlul Shahriar + * 1 Felix Brockherde + * 1 Félix-Antoine Fortin + * 1 Harikrishnan S + * 1 Jack Hale + * 1 JakeMick + * 1 James McDermott + * 1 John Benediktsson + * 1 John Zwinck + * 1 Joshua Vredevoogd + * 1 Justin Pati + * 1 Kevin Hughes + * 1 Kyle Kelley + * 1 Matthias Ekman + * 1 Miroslav Shubernetskiy + * 1 Naoki Orii + * 1 Norbert Crombach + * 1 Rafael Cunha de Almeida + * 1 Rolando Espinoza La fuente + * 1 Seamus Abshere + * 1 Sergey Feldman + * 1 Sergio Medina + * 1 Stefano Lattarini + * 1 Steve Koch + * 1 Sturla Molden + * 1 Thomas Jarosch + * 1 Yaroslav Halchenko + +.. _changes_0_13_1: + +Version 0.13.1 +============== + +**February 23, 2013** + +The 0.13.1 release only fixes some bugs and does not add any new functionality. + +Changelog +--------- + +- Fixed a testing error caused by the function :func:`cross_validation.train_test_split` being + interpreted as a test by `Yaroslav Halchenko`_. + +- Fixed a bug in the reassignment of small clusters in the :class:`cluster.MiniBatchKMeans` + by `Gael Varoquaux`_. + +- Fixed default value of ``gamma`` in :class:`decomposition.KernelPCA` by `Lars Buitinck`_. + +- Updated joblib to ``0.7.0d`` by `Gael Varoquaux`_. + +- Fixed scaling of the deviance in :class:`ensemble.GradientBoostingClassifier` by `Peter Prettenhofer`_. + +- Better tie-breaking in :class:`multiclass.OneVsOneClassifier` by `Andreas Müller`_. + +- Other small improvements to tests and documentation. + +People +------ +List of contributors for release 0.13.1 by number of commits. + * 16 `Lars Buitinck`_ + * 12 `Andreas Müller`_ + * 8 `Gael Varoquaux`_ + * 5 Robert Marchman + * 3 `Peter Prettenhofer`_ + * 2 Hrishikesh Huilgolkar + * 1 Bastiaan van den Berg + * 1 Diego Molla + * 1 `Gilles Louppe`_ + * 1 `Mathieu Blondel`_ + * 1 `Nelle Varoquaux`_ + * 1 Rafael Cunha de Almeida + * 1 Rolando Espinoza La fuente + * 1 `Vlad Niculae`_ + * 1 `Yaroslav Halchenko`_ + + +.. _changes_0_13: + +Version 0.13 +============ + +**January 21, 2013** + +New Estimator Classes +--------------------- + +- :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor`, two + data-independent predictors by `Mathieu Blondel`_. Useful to sanity-check + your estimators. See :ref:`dummy_estimators` in the user guide. + Multioutput support added by `Arnaud Joly`_. + +- :class:`decomposition.FactorAnalysis`, a transformer implementing the + classical factor analysis, by `Christian Osendorfer`_ and `Alexandre + Gramfort`_. See :ref:`FA` in the user guide. + +- :class:`feature_extraction.FeatureHasher`, a transformer implementing the + "hashing trick" for fast, low-memory feature extraction from string fields + by `Lars Buitinck`_ and :class:`feature_extraction.text.HashingVectorizer` + for text documents by `Olivier Grisel`_ See :ref:`feature_hashing` and + :ref:`hashing_vectorizer` for the documentation and sample usage. + +- :class:`pipeline.FeatureUnion`, a transformer that concatenates + results of several other transformers by `Andreas Müller`_. See + :ref:`feature_union` in the user guide. 
+ +- :class:`random_projection.GaussianRandomProjection`, + :class:`random_projection.SparseRandomProjection` and the function + :func:`random_projection.johnson_lindenstrauss_min_dim`. The first two are + transformers implementing Gaussian and sparse random projection matrix + by `Olivier Grisel`_ and `Arnaud Joly`_. + See :ref:`random_projection` in the user guide. + +- :class:`kernel_approximation.Nystroem`, a transformer for approximating + arbitrary kernels by `Andreas Müller`_. See + :ref:`nystroem_kernel_approx` in the user guide. + +- :class:`preprocessing.OneHotEncoder`, a transformer that computes binary + encodings of categorical features by `Andreas Müller`_. See + :ref:`preprocessing_categorical_features` in the user guide. + +- :class:`linear_model.PassiveAggressiveClassifier` and + :class:`linear_model.PassiveAggressiveRegressor`, predictors implementing + an efficient stochastic optimization for linear models by `Rob Zinkov`_ and + `Mathieu Blondel`_. See :ref:`passive_aggressive` in the user + guide. + +- :class:`ensemble.RandomTreesEmbedding`, a transformer for creating high-dimensional + sparse representations using ensembles of totally random trees by `Andreas Müller`_. + See :ref:`random_trees_embedding` in the user guide. + +- :class:`manifold.SpectralEmbedding` and function + :func:`manifold.spectral_embedding`, implementing the "laplacian + eigenmaps" transformation for non-linear dimensionality reduction by Wei + Li. See :ref:`spectral_embedding` in the user guide. + +- :class:`isotonic.IsotonicRegression` by `Fabian Pedregosa`_, `Alexandre Gramfort`_ + and `Nelle Varoquaux`_, + + +Changelog +--------- + +- :func:`metrics.zero_one_loss` (formerly ``metrics.zero_one``) now has + option for normalized output that reports the fraction of + misclassifications, rather than the raw number of misclassifications. By + Kyle Beauchamp. + +- :class:`tree.DecisionTreeClassifier` and all derived ensemble models now + support sample weighting, by `Noel Dawe`_ and `Gilles Louppe`_. + +- Speedup improvement when using bootstrap samples in forests of randomized + trees, by `Peter Prettenhofer`_ and `Gilles Louppe`_. + +- Partial dependence plots for :ref:`gradient_boosting` in + :func:`ensemble.partial_dependence.partial_dependence` by `Peter + Prettenhofer`_. See :ref:`sphx_glr_auto_examples_ensemble_plot_partial_dependence.py` for an + example. + +- The table of contents on the website has now been made expandable by + `Jaques Grobler`_. + +- :class:`feature_selection.SelectPercentile` now breaks ties + deterministically instead of returning all equally ranked features. + +- :class:`feature_selection.SelectKBest` and + :class:`feature_selection.SelectPercentile` are more numerically stable + since they use scores, rather than p-values, to rank results. This means + that they might sometimes select different features than they did + previously. + +- Ridge regression and ridge classification fitting with ``sparse_cg`` solver + no longer has quadratic memory complexity, by `Lars Buitinck`_ and + `Fabian Pedregosa`_. + +- Ridge regression and ridge classification now support a new fast solver + called ``lsqr``, by `Mathieu Blondel`_. + +- Speed up of :func:`metrics.precision_recall_curve` by Conrad Lee. + +- Added support for reading/writing svmlight files with pairwise + preference attribute (qid in svmlight file format) in + :func:`datasets.dump_svmlight_file` and + :func:`datasets.load_svmlight_file` by `Fabian Pedregosa`_. 
+
- Faster and more robust :func:`metrics.confusion_matrix` and
  :ref:`clustering_evaluation` by Wei Li.
+
- :func:`cross_validation.cross_val_score` now works with precomputed kernels
  and affinity matrices, by `Andreas Müller`_.
+
- LARS algorithm made more numerically stable with heuristics to drop
  regressors that are too correlated as well as to stop the path when
  numerical noise becomes predominant, by `Gael Varoquaux`_.
+
- Faster implementation of :func:`metrics.precision_recall_curve` by
  Conrad Lee.
+
- New kernel :func:`metrics.chi2_kernel` by `Andreas Müller`_, often used
  in computer vision applications.
+
- Fixed a longstanding bug in :class:`naive_bayes.BernoulliNB`, by
  Shaun Jackman.
+
- Implemented ``predict_proba`` in :class:`multiclass.OneVsRestClassifier`,
  by Andrew Winterman.
+
- Improved consistency in gradient boosting: estimators
  :class:`ensemble.GradientBoostingRegressor` and
  :class:`ensemble.GradientBoostingClassifier` use the estimator
  :class:`tree.DecisionTreeRegressor` instead of the
  :class:`tree._tree.Tree` data structure by `Arnaud Joly`_.
+
- Fixed a floating point exception in the :ref:`decision trees <tree>`
  module, by Seberg.
+
- Fixed :func:`metrics.roc_curve` failing when ``y_true`` has only one class,
  by Wei Li.
+
- Added the :func:`metrics.mean_absolute_error` function which computes the
  mean absolute error. The :func:`metrics.mean_squared_error`,
  :func:`metrics.mean_absolute_error` and
  :func:`metrics.r2_score` metrics support multioutput by `Arnaud Joly`_.
+
- Fixed ``class_weight`` support in :class:`svm.LinearSVC` and
  :class:`linear_model.LogisticRegression` by `Andreas Müller`_. The meaning
  of ``class_weight`` was reversed, as erroneously higher weight meant fewer
  positives of a given class in earlier releases.
+
- Improved narrative documentation and consistency in
  :mod:`sklearn.metrics` for regression and classification metrics
  by `Arnaud Joly`_.
+
- Fixed a bug in :class:`sklearn.svm.SVC` when using CSR matrices with
  unsorted indices by Xinfan Meng and `Andreas Müller`_.
+
- :class:`MiniBatchKMeans`: Added random reassignment of cluster centers
  with few observations attached to them, by `Gael Varoquaux`_.
+

API changes summary
-------------------
- Renamed all occurrences of ``n_atoms`` to ``n_components`` for consistency.
  This applies to :class:`decomposition.DictionaryLearning`,
  :class:`decomposition.MiniBatchDictionaryLearning`,
  :func:`decomposition.dict_learning`, :func:`decomposition.dict_learning_online`.
+
- Renamed all occurrences of ``max_iters`` to ``max_iter`` for consistency.
  This applies to :class:`semi_supervised.LabelPropagation` and
  :class:`semi_supervised.label_propagation.LabelSpreading`.
+
- Renamed all occurrences of ``learn_rate`` to ``learning_rate`` for
  consistency in :class:`ensemble.BaseGradientBoosting` and
  :class:`ensemble.GradientBoostingRegressor`.
+
- The module ``sklearn.linear_model.sparse`` is gone. Sparse matrix support
  was already integrated into the "regular" linear models.
+
- :func:`sklearn.metrics.mean_square_error`, which incorrectly returned the
  accumulated error, was removed. Use ``mean_squared_error`` instead.
+
- Passing ``class_weight`` parameters to ``fit`` methods is no longer
  supported. Pass them to estimator constructors instead.
+
- GMMs no longer have ``decode`` and ``rvs`` methods. Use the ``score``,
  ``predict`` or ``sample`` methods instead.
+
- The ``solver`` fit option in Ridge regression and classification is now
  deprecated and will be removed in v0.14. Use the constructor option
  instead.
+
- :class:`feature_extraction.DictVectorizer` now returns sparse
  matrices in the CSR format, instead of COO.
+
- Renamed ``k`` in :class:`cross_validation.KFold` and
  :class:`cross_validation.StratifiedKFold` to ``n_folds``, renamed
  ``n_bootstraps`` to ``n_iter`` in ``cross_validation.Bootstrap``.
+
- Renamed all occurrences of ``n_iterations`` to ``n_iter`` for consistency.
  This applies to :class:`cross_validation.ShuffleSplit`,
  :class:`cross_validation.StratifiedShuffleSplit`,
  :func:`utils.randomized_range_finder` and :func:`utils.randomized_svd`.
+
- Replaced ``rho`` in :class:`linear_model.ElasticNet` and
  :class:`linear_model.SGDClassifier` by ``l1_ratio``. The ``rho`` parameter
  had different meanings; ``l1_ratio`` was introduced to avoid confusion.
  It has the same meaning as previously ``rho`` in
  :class:`linear_model.ElasticNet` and ``(1-rho)`` in
  :class:`linear_model.SGDClassifier`.
+
- :class:`linear_model.LassoLars` and :class:`linear_model.Lars` now
  store a list of paths in the case of multiple targets, rather than
  an array of paths.
+
- The attribute ``gmm`` of :class:`hmm.GMMHMM` was renamed to ``gmm_``
  to adhere more strictly to the API.
+
- :func:`cluster.spectral_embedding` was moved to
  :func:`manifold.spectral_embedding`.
+
- Renamed ``eig_tol`` in :func:`manifold.spectral_embedding`,
  :class:`cluster.SpectralClustering` to ``eigen_tol``, renamed ``mode``
  to ``eigen_solver``.
+
- Renamed ``mode`` in :func:`manifold.spectral_embedding` and
  :class:`cluster.SpectralClustering` to ``eigen_solver``.
+
- ``classes_`` and ``n_classes_`` attributes of
  :class:`tree.DecisionTreeClassifier` and all derived ensemble models are
  now flat in case of single output problems and nested in case of
  multi-output problems.
+
- The ``estimators_`` attribute of
  :class:`ensemble.gradient_boosting.GradientBoostingRegressor` and
  :class:`ensemble.gradient_boosting.GradientBoostingClassifier` is now an
  array of :class:`tree.DecisionTreeRegressor`.
+
- Renamed ``chunk_size`` to ``batch_size`` in
  :class:`decomposition.MiniBatchDictionaryLearning` and
  :class:`decomposition.MiniBatchSparsePCA` for consistency.
+
- :class:`svm.SVC` and :class:`svm.NuSVC` now provide a ``classes_``
  attribute and support arbitrary dtypes for labels ``y``.
  Also, the dtype returned by ``predict`` now reflects the dtype of
  ``y`` during ``fit`` (used to be ``np.float``).
+
- Changed default ``test_size`` in :func:`cross_validation.train_test_split`
  to None, added possibility to infer ``test_size`` from ``train_size`` in
  :class:`cross_validation.ShuffleSplit` and
  :class:`cross_validation.StratifiedShuffleSplit`.
+
- Renamed function :func:`sklearn.metrics.zero_one` to
  :func:`sklearn.metrics.zero_one_loss`. Be aware that the default behavior
  in :func:`sklearn.metrics.zero_one_loss` is different from
  :func:`sklearn.metrics.zero_one`: ``normalize=False`` is changed to
  ``normalize=True``.
+
- Renamed function :func:`metrics.zero_one_score` to
  :func:`metrics.accuracy_score`.
+
- :func:`datasets.make_circles` now has the same number of inner and outer points.
+
- In the Naive Bayes classifiers, the ``class_prior`` parameter was moved
  from ``fit`` to ``__init__``.
+
People
------
List of contributors for release 0.13 by number of commits.
+ + * 364 `Andreas Müller`_ + * 143 `Arnaud Joly`_ + * 137 `Peter Prettenhofer`_ + * 131 `Gael Varoquaux`_ + * 117 `Mathieu Blondel`_ + * 108 `Lars Buitinck`_ + * 106 Wei Li + * 101 `Olivier Grisel`_ + * 65 `Vlad Niculae`_ + * 54 `Gilles Louppe`_ + * 40 `Jaques Grobler`_ + * 38 `Alexandre Gramfort`_ + * 30 `Rob Zinkov`_ + * 19 Aymeric Masurelle + * 18 Andrew Winterman + * 17 `Fabian Pedregosa`_ + * 17 Nelle Varoquaux + * 16 `Christian Osendorfer`_ + * 14 `Daniel Nouri`_ + * 13 :user:`Virgile Fritsch ` + * 13 syhw + * 12 `Satrajit Ghosh`_ + * 10 Corey Lynch + * 10 Kyle Beauchamp + * 9 Brian Cheung + * 9 Immanuel Bayer + * 9 mr.Shu + * 8 Conrad Lee + * 8 `James Bergstra`_ + * 7 Tadej Janež + * 6 Brian Cajes + * 6 `Jake Vanderplas`_ + * 6 Michael + * 6 Noel Dawe + * 6 Tiago Nunes + * 6 cow + * 5 Anze + * 5 Shiqiao Du + * 4 Christian Jauvin + * 4 Jacques Kvam + * 4 Richard T. Guy + * 4 `Robert Layton`_ + * 3 Alexandre Abraham + * 3 Doug Coleman + * 3 Scott Dickerson + * 2 ApproximateIdentity + * 2 John Benediktsson + * 2 Mark Veronda + * 2 Matti Lyra + * 2 Mikhail Korobov + * 2 Xinfan Meng + * 1 Alejandro Weinstein + * 1 `Alexandre Passos`_ + * 1 Christoph Deil + * 1 Eugene Nizhibitsky + * 1 Kenneth C. Arnold + * 1 Luis Pedro Coelho + * 1 Miroslav Batchkarov + * 1 Pavel + * 1 Sebastian Berg + * 1 Shaun Jackman + * 1 Subhodeep Moitra + * 1 bob + * 1 dengemann + * 1 emanuele + * 1 x006 + + +.. _changes_0_12.1: + +Version 0.12.1 +=============== + +**October 8, 2012** + +The 0.12.1 release is a bug-fix release with no additional features, but is +instead a set of bug fixes + +Changelog +---------- + +- Improved numerical stability in spectral embedding by `Gael + Varoquaux`_ + +- Doctest under windows 64bit by `Gael Varoquaux`_ + +- Documentation fixes for elastic net by `Andreas Müller`_ and + `Alexandre Gramfort`_ + +- Proper behavior with fortran-ordered NumPy arrays by `Gael Varoquaux`_ + +- Make GridSearchCV work with non-CSR sparse matrix by `Lars Buitinck`_ + +- Fix parallel computing in MDS by `Gael Varoquaux`_ + +- Fix Unicode support in count vectorizer by `Andreas Müller`_ + +- Fix MinCovDet breaking with X.shape = (3, 1) by :user:`Virgile Fritsch ` + +- Fix clone of SGD objects by `Peter Prettenhofer`_ + +- Stabilize GMM by :user:`Virgile Fritsch ` + +People +------ + + * 14 `Peter Prettenhofer`_ + * 12 `Gael Varoquaux`_ + * 10 `Andreas Müller`_ + * 5 `Lars Buitinck`_ + * 3 :user:`Virgile Fritsch ` + * 1 `Alexandre Gramfort`_ + * 1 `Gilles Louppe`_ + * 1 `Mathieu Blondel`_ + +.. _changes_0_12: + +Version 0.12 +============ + +**September 4, 2012** + +Changelog +--------- + +- Various speed improvements of the :ref:`decision trees ` module, by + `Gilles Louppe`_. + +- :class:`ensemble.GradientBoostingRegressor` and + :class:`ensemble.GradientBoostingClassifier` now support feature subsampling + via the ``max_features`` argument, by `Peter Prettenhofer`_. + +- Added Huber and Quantile loss functions to + :class:`ensemble.GradientBoostingRegressor`, by `Peter Prettenhofer`_. + +- :ref:`Decision trees ` and :ref:`forests of randomized trees ` + now support multi-output classification and regression problems, by + `Gilles Louppe`_. + +- Added :class:`preprocessing.LabelEncoder`, a simple utility class to + normalize labels or transform non-numerical labels, by `Mathieu Blondel`_. + +- Added the epsilon-insensitive loss and the ability to make probabilistic + predictions with the modified huber loss in :ref:`sgd`, by + `Mathieu Blondel`_. 
+
- Added :ref:`multidimensional_scaling`, by Nelle Varoquaux.
+
- SVMlight file format loader now detects compressed (gzip/bzip2) files and
  decompresses them on the fly, by `Lars Buitinck`_.
+
- SVMlight file format serializer now preserves double precision floating
  point values, by `Olivier Grisel`_.
+
- A common testing framework for all estimators was added, by `Andreas Müller`_.
+
- Understandable error messages for estimators that do not accept
  sparse input by `Gael Varoquaux`_.
+
- Speedups in hierarchical clustering by `Gael Varoquaux`_. In
  particular, building the tree now supports early stopping. This is
  useful when the number of clusters is not small compared to the
  number of samples.
+
- Add MultiTaskLasso and MultiTaskElasticNet for joint feature selection,
  by `Alexandre Gramfort`_.
+
- Added :func:`metrics.auc_score` and
  :func:`metrics.average_precision_score` convenience functions by `Andreas
  Müller`_.
+
- Improved sparse matrix support in the :ref:`feature_selection`
  module by `Andreas Müller`_.
+
- New word-boundaries-aware character n-gram analyzer for the
  :ref:`text_feature_extraction` module by :user:`@kernc <kernc>`.
+
- Fixed bug in spectral clustering that led to single point clusters
  by `Andreas Müller`_.
+
- In :class:`feature_extraction.text.CountVectorizer`, added an option to
  ignore infrequent words, ``min_df``, by `Andreas Müller`_.
+
- Add support for multiple targets in some linear models (ElasticNet, Lasso
  and OrthogonalMatchingPursuit) by `Vlad Niculae`_ and
  `Alexandre Gramfort`_.
+
- Fixes in :class:`decomposition.ProbabilisticPCA` score function by Wei Li.
+
- Fixed feature importance computation in
  :ref:`gradient_boosting`.
+
API changes summary
-------------------
+
- The old ``scikits.learn`` package has disappeared; all code should import
  from ``sklearn`` instead, which was introduced in 0.9.
+
- In :func:`metrics.roc_curve`, the ``thresholds`` array is now returned
  with its order reversed, in order to keep it consistent with the order
  of the returned ``fpr`` and ``tpr``.
+
- In :mod:`hmm` objects, like :class:`hmm.GaussianHMM`,
  :class:`hmm.MultinomialHMM`, etc., all parameters must be passed to the
  object when initialising it and not through ``fit``. Now ``fit`` will
  only accept the data as an input parameter.
+
- For all SVM classes, a faulty behavior of ``gamma`` was fixed. Previously,
  the default gamma value was only computed the first time ``fit`` was called
  and then stored. It is now recalculated on every call to ``fit``.
+
- All ``Base`` classes are now abstract metaclasses so that they cannot be
  instantiated.
+
- :func:`cluster.ward_tree` now also returns the parent array. This is
  necessary for early stopping, in which case the tree is not
  completely built.
+
- In :class:`feature_extraction.text.CountVectorizer` the parameters
  ``min_n`` and ``max_n`` were joined to the parameter ``ngram_range`` to
  enable grid-searching both at once.
+
- In :class:`feature_extraction.text.CountVectorizer`, words that appear
  only in one document are now ignored by default. To reproduce
  the previous behavior, set ``min_df=1``.
+
- Fixed API inconsistency: :meth:`linear_model.SGDClassifier.predict_proba` now
  returns a 2d array when fit on two classes.
+
- Fixed API inconsistency: :meth:`discriminant_analysis.QuadraticDiscriminantAnalysis.decision_function`
  and :meth:`discriminant_analysis.LinearDiscriminantAnalysis.decision_function` now return 1d arrays
  when fit on two classes.
+
- Grid of alphas used for fitting :class:`linear_model.LassoCV` and
  :class:`linear_model.ElasticNetCV` is now stored
  in the attribute ``alphas_`` rather than overriding the init parameter
  ``alphas``.
+
- Linear models, when ``alpha`` is estimated by cross-validation, store
  the estimated value in the ``alpha_`` attribute rather than just
  ``alpha`` or ``best_alpha``.
+
- :class:`ensemble.GradientBoostingClassifier` now supports
  :meth:`ensemble.GradientBoostingClassifier.staged_predict_proba`, and
  :meth:`ensemble.GradientBoostingClassifier.staged_predict`.
+
- :class:`svm.sparse.SVC` and other sparse SVM classes are now deprecated.
  All classes in the :ref:`svm` module now automatically select the
  sparse or dense representation based on the input.
+
- All clustering algorithms now interpret the array ``X`` given to ``fit`` as
  input data, in particular :class:`cluster.SpectralClustering` and
  :class:`cluster.AffinityPropagation` which previously expected affinity matrices.
+
- For clustering algorithms that take the desired number of clusters as a parameter,
  this parameter is now called ``n_clusters``.
+

People
------
 * 267 `Andreas Müller`_
 * 94 `Gilles Louppe`_
 * 89 `Gael Varoquaux`_
 * 79 `Peter Prettenhofer`_
 * 60 `Mathieu Blondel`_
 * 57 `Alexandre Gramfort`_
 * 52 `Vlad Niculae`_
 * 45 `Lars Buitinck`_
 * 44 Nelle Varoquaux
 * 37 `Jaques Grobler`_
 * 30 Alexis Mignon
 * 30 Immanuel Bayer
 * 27 `Olivier Grisel`_
 * 16 Subhodeep Moitra
 * 13 Yannick Schwartz
 * 12 :user:`@kernc <kernc>`
 * 11 :user:`Virgile Fritsch <VirgileFritsch>`
 * 9 Daniel Duckworth
 * 9 `Fabian Pedregosa`_
 * 9 `Robert Layton`_
 * 8 John Benediktsson
 * 7 Marko Burjek
 * 5 `Nicolas Pinto`_
 * 4 Alexandre Abraham
 * 4 `Jake Vanderplas`_
 * 3 `Brian Holt`_
 * 3 `Edouard Duchesnay`_
 * 3 Florian Hoenig
 * 3 flyingimmidev
 * 2 Francois Savard
 * 2 Hannes Schulz
 * 2 Peter Welinder
 * 2 `Yaroslav Halchenko`_
 * 2 Wei Li
 * 1 Alex Companioni
 * 1 Brandyn A. White
 * 1 Bussonnier Matthias
 * 1 Charles-Pierre Astolfi
 * 1 Dan O'Huiginn
 * 1 David Cournapeau
 * 1 Keith Goodman
 * 1 Ludwig Schwardt
 * 1 Olivier Hervieu
 * 1 Sergio Medina
 * 1 Shiqiao Du
 * 1 Tim Sheerman-Chase
 * 1 buguen
+


.. _changes_0_11:

Version 0.11
============

**May 7, 2012**

Changelog
---------

Highlights
.............

- Gradient boosted regression trees (:ref:`gradient_boosting`)
  for classification and regression by `Peter Prettenhofer`_
  and `Scott White`_.

- Simple dict-based feature loader with support for categorical variables
  (:class:`feature_extraction.DictVectorizer`) by `Lars Buitinck`_.

- Added Matthews correlation coefficient (:func:`metrics.matthews_corrcoef`)
  and added macro and micro average options to
  :func:`metrics.precision_score`, :func:`metrics.recall_score` and
  :func:`metrics.f1_score` by `Satrajit Ghosh`_.

- :ref:`out_of_bag` of generalization error for :ref:`ensemble`
  by `Andreas Müller`_.

- Randomized sparse linear models for feature
  selection, by `Alexandre Gramfort`_ and `Gael Varoquaux`_.

- :ref:`label_propagation` for semi-supervised learning, by Clay
  Woolam. **Note** the semi-supervised API is still work in progress,
  and may change.
+ +- Added BIC/AIC model selection to classical :ref:`gmm` and unified + the API with the remainder of scikit-learn, by `Bertrand Thirion`_ + +- Added :class:`sklearn.cross_validation.StratifiedShuffleSplit`, which is + a :class:`sklearn.cross_validation.ShuffleSplit` with balanced splits, + by Yannick Schwartz. + +- :class:`sklearn.neighbors.NearestCentroid` classifier added, along with a + ``shrink_threshold`` parameter, which implements **shrunken centroid + classification**, by `Robert Layton`_. + +Other changes +.............. + +- Merged dense and sparse implementations of :ref:`sgd` module and + exposed utility extension types for sequential + datasets ``seq_dataset`` and weight vectors ``weight_vector`` + by `Peter Prettenhofer`_. + +- Added ``partial_fit`` (support for online/minibatch learning) and + warm_start to the :ref:`sgd` module by `Mathieu Blondel`_. + +- Dense and sparse implementations of :ref:`svm` classes and + :class:`linear_model.LogisticRegression` merged by `Lars Buitinck`_. + +- Regressors can now be used as base estimator in the :ref:`multiclass` + module by `Mathieu Blondel`_. + +- Added n_jobs option to :func:`metrics.pairwise.pairwise_distances` + and :func:`metrics.pairwise.pairwise_kernels` for parallel computation, + by `Mathieu Blondel`_. + +- :ref:`k_means` can now be run in parallel, using the ``n_jobs`` argument + to either :ref:`k_means` or :class:`KMeans`, by `Robert Layton`_. + +- Improved :ref:`cross_validation` and :ref:`grid_search` documentation + and introduced the new :func:`cross_validation.train_test_split` + helper function by `Olivier Grisel`_ + +- :class:`svm.SVC` members ``coef_`` and ``intercept_`` changed sign for + consistency with ``decision_function``; for ``kernel==linear``, + ``coef_`` was fixed in the one-vs-one case, by `Andreas Müller`_. + +- Performance improvements to efficient leave-one-out cross-validated + Ridge regression, esp. for the ``n_samples > n_features`` case, in + :class:`linear_model.RidgeCV`, by Reuben Fletcher-Costin. + +- Refactoring and simplification of the :ref:`text_feature_extraction` + API and fixed a bug that caused possible negative IDF, + by `Olivier Grisel`_. + +- Beam pruning option in :class:`_BaseHMM` module has been removed since it + is difficult to Cythonize. If you are interested in contributing a Cython + version, you can use the python version in the git history as a reference. + +- Classes in :ref:`neighbors` now support arbitrary Minkowski metric for + nearest neighbors searches. The metric can be specified by argument ``p``. + +API changes summary +------------------- + +- :class:`covariance.EllipticEnvelop` is now deprecated - Please use :class:`covariance.EllipticEnvelope` + instead. + +- ``NeighborsClassifier`` and ``NeighborsRegressor`` are gone in the module + :ref:`neighbors`. Use the classes :class:`KNeighborsClassifier`, + :class:`RadiusNeighborsClassifier`, :class:`KNeighborsRegressor` + and/or :class:`RadiusNeighborsRegressor` instead. + +- Sparse classes in the :ref:`sgd` module are now deprecated. + +- In :class:`mixture.GMM`, :class:`mixture.DPGMM` and :class:`mixture.VBGMM`, + parameters must be passed to an object when initialising it and not through + ``fit``. Now ``fit`` will only accept the data as an input parameter. + +- methods ``rvs`` and ``decode`` in :class:`GMM` module are now deprecated. + ``sample`` and ``score`` or ``predict`` should be used instead. + +- attribute ``_scores`` and ``_pvalues`` in univariate feature selection + objects are now deprecated. 
+
  ``scores_`` or ``pvalues_`` should be used instead.
+
- In :class:`LogisticRegression`, :class:`LinearSVC`, :class:`SVC` and
  :class:`NuSVC`, the ``class_weight`` parameter is now an initialization
  parameter, not a parameter to fit. This makes grid searches
  over this parameter possible.
+
- LFW ``data`` is now always shape ``(n_samples, n_features)`` to be
  consistent with the Olivetti faces dataset. Use the ``images`` and
  ``pairs`` attributes to access the natural image shapes instead.
+
- In :class:`svm.LinearSVC`, the meaning of the ``multi_class`` parameter
  changed. Options now are ``'ovr'`` and ``'crammer_singer'``, with
  ``'ovr'`` being the default. This does not change the default behavior
  but hopefully is less confusing.
+
- Class :class:`feature_extraction.text.Vectorizer` is deprecated and
  replaced by :class:`feature_extraction.text.TfidfVectorizer`.
+
- The preprocessor / analyzer nested structure for text feature
  extraction has been removed. All those features are
  now directly passed as flat constructor arguments
  to :class:`feature_extraction.text.TfidfVectorizer` and
  :class:`feature_extraction.text.CountVectorizer`, in particular the
  following parameters are now used:
+
  - ``analyzer`` can be ``'word'`` or ``'char'`` to switch the default
    analysis scheme, or use a specific python callable (as previously).
+
  - ``tokenizer`` and ``preprocessor`` have been introduced to make it
    still possible to customize those steps with the new API.
+
  - ``input`` explicitly controls how to interpret the sequence passed to
    ``fit`` and ``predict``: filenames, file objects or direct (byte or
    Unicode) strings.
+
  - Charset decoding is explicit and strict by default.
+
  - The ``vocabulary``, fitted or not, is now stored in the
    ``vocabulary_`` attribute to be consistent with the project
    conventions.
+
- Class :class:`feature_extraction.text.TfidfVectorizer` now derives directly
  from :class:`feature_extraction.text.CountVectorizer` to make grid
  search trivial.
+
- Methods ``rvs`` in :class:`_BaseHMM` are now deprecated.
  ``sample`` should be used instead.
+
- The beam pruning option in :class:`_BaseHMM` has been removed, since it is
  difficult to Cythonize. If you are interested, you can look at the
  older versions in the git history.
+
- The SVMlight format loader now supports files with both zero-based and
  one-based column indices, since both occur "in the wild".
+
- Arguments in class :class:`ShuffleSplit` are now consistent with
  :class:`StratifiedShuffleSplit`. Arguments ``test_fraction`` and
  ``train_fraction`` are deprecated and renamed to ``test_size`` and
  ``train_size`` and can accept both ``float`` and ``int``.
+
- Arguments in class :class:`Bootstrap` are now consistent with
  :class:`StratifiedShuffleSplit`. Arguments ``n_test`` and
  ``n_train`` are deprecated and renamed to ``test_size`` and
  ``train_size`` and can accept both ``float`` and ``int``.
+
- Argument ``p`` added to classes in :ref:`neighbors` to specify an
  arbitrary Minkowski metric for nearest neighbors searches.
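
As a quick illustration of the new ``p`` argument, here is a minimal
sketch (the data is made up purely for illustration)::

    import numpy as np
    from sklearn.neighbors import NearestNeighbors

    X = np.array([[0., 0.], [1., 1.], [2., 0.]])
    # p=1 requests the Manhattan (l1) metric; p=2, the default, is Euclidean.
    nn = NearestNeighbors(n_neighbors=2, p=1).fit(X)
    dist, idx = nn.kneighbors(np.array([[0.9, 0.9]]))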
+ + +People +------ + * 282 `Andreas Müller`_ + * 239 `Peter Prettenhofer`_ + * 198 `Gael Varoquaux`_ + * 129 `Olivier Grisel`_ + * 114 `Mathieu Blondel`_ + * 103 Clay Woolam + * 96 `Lars Buitinck`_ + * 88 `Jaques Grobler`_ + * 82 `Alexandre Gramfort`_ + * 50 `Bertrand Thirion`_ + * 42 `Robert Layton`_ + * 28 flyingimmidev + * 26 `Jake Vanderplas`_ + * 26 Shiqiao Du + * 21 `Satrajit Ghosh`_ + * 17 `David Marek`_ + * 17 `Gilles Louppe`_ + * 14 `Vlad Niculae`_ + * 11 Yannick Schwartz + * 10 `Fabian Pedregosa`_ + * 9 fcostin + * 7 Nick Wilson + * 5 Adrien Gaidon + * 5 `Nicolas Pinto`_ + * 4 `David Warde-Farley`_ + * 5 Nelle Varoquaux + * 5 Emmanuelle Gouillart + * 3 Joonas Sillanpää + * 3 Paolo Losi + * 2 Charles McCarthy + * 2 Roy Hyunjin Han + * 2 Scott White + * 2 ibayer + * 1 Brandyn White + * 1 Carlos Scheidegger + * 1 Claire Revillet + * 1 Conrad Lee + * 1 `Edouard Duchesnay`_ + * 1 Jan Hendrik Metzen + * 1 Meng Xinfan + * 1 `Rob Zinkov`_ + * 1 Shiqiao + * 1 Udi Weinsberg + * 1 Virgile Fritsch + * 1 Xinfan Meng + * 1 Yaroslav Halchenko + * 1 jansoe + * 1 Leon Palafox + + +.. _changes_0_10: + +Version 0.10 +============ + +**January 11, 2012** + +Changelog +--------- + +- Python 2.5 compatibility was dropped; the minimum Python version needed + to use scikit-learn is now 2.6. + +- :ref:`sparse_inverse_covariance` estimation using the graph Lasso, with + associated cross-validated estimator, by `Gael Varoquaux`_ + +- New :ref:`Tree ` module by `Brian Holt`_, `Peter Prettenhofer`_, + `Satrajit Ghosh`_ and `Gilles Louppe`_. The module comes with complete + documentation and examples. + +- Fixed a bug in the RFE module by `Gilles Louppe`_ (issue #378). + +- Fixed a memory leak in :ref:`svm` module by `Brian Holt`_ (issue #367). + +- Faster tests by `Fabian Pedregosa`_ and others. + +- Silhouette Coefficient cluster analysis evaluation metric added as + :func:`sklearn.metrics.silhouette_score` by Robert Layton. + +- Fixed a bug in :ref:`k_means` in the handling of the ``n_init`` parameter: + the clustering algorithm used to be run ``n_init`` times but the last + solution was retained instead of the best solution by `Olivier Grisel`_. + +- Minor refactoring in :ref:`sgd` module; consolidated dense and sparse + predict methods; Enhanced test time performance by converting model + parameters to fortran-style arrays after fitting (only multi-class). + +- Adjusted Mutual Information metric added as + :func:`sklearn.metrics.adjusted_mutual_info_score` by Robert Layton. + +- Models like SVC/SVR/LinearSVC/LogisticRegression from libsvm/liblinear + now support scaling of C regularization parameter by the number of + samples by `Alexandre Gramfort`_. + +- New :ref:`Ensemble Methods ` module by `Gilles Louppe`_ and + `Brian Holt`_. The module comes with the random forest algorithm and the + extra-trees method, along with documentation and examples. + +- :ref:`outlier_detection`: outlier and novelty detection, by + :user:`Virgile Fritsch `. + +- :ref:`kernel_approximation`: a transform implementing kernel + approximation for fast SGD on non-linear kernels by + `Andreas Müller`_. + +- Fixed a bug due to atom swapping in :ref:`OMP` by `Vlad Niculae`_. + +- :ref:`SparseCoder` by `Vlad Niculae`_. + +- :ref:`mini_batch_kmeans` performance improvements by `Olivier Grisel`_. + +- :ref:`k_means` support for sparse matrices by `Mathieu Blondel`_. + +- Improved documentation for developers and for the :mod:`sklearn.utils` + module, by `Jake Vanderplas`_. 
+ +- Vectorized 20newsgroups dataset loader + (:func:`sklearn.datasets.fetch_20newsgroups_vectorized`) by + `Mathieu Blondel`_. + +- :ref:`multiclass` by `Lars Buitinck`_. + +- Utilities for fast computation of mean and variance for sparse matrices + by `Mathieu Blondel`_. + +- Make :func:`sklearn.preprocessing.scale` and + :class:`sklearn.preprocessing.Scaler` work on sparse matrices by + `Olivier Grisel`_ + +- Feature importances using decision trees and/or forest of trees, + by `Gilles Louppe`_. + +- Parallel implementation of forests of randomized trees by + `Gilles Louppe`_. + +- :class:`sklearn.cross_validation.ShuffleSplit` can subsample the train + sets as well as the test sets by `Olivier Grisel`_. + +- Errors in the build of the documentation fixed by `Andreas Müller`_. + + +API changes summary +------------------- + +Here are the code migration instructions when upgrading from scikit-learn +version 0.9: + +- Some estimators that may overwrite their inputs to save memory previously + had ``overwrite_`` parameters; these have been replaced with ``copy_`` + parameters with exactly the opposite meaning. + + This particularly affects some of the estimators in :mod:`linear_model`. + The default behavior is still to copy everything passed in. + +- The SVMlight dataset loader :func:`sklearn.datasets.load_svmlight_file` no + longer supports loading two files at once; use ``load_svmlight_files`` + instead. Also, the (unused) ``buffer_mb`` parameter is gone. + +- Sparse estimators in the :ref:`sgd` module use dense parameter vector + ``coef_`` instead of ``sparse_coef_``. This significantly improves + test time performance. + +- The :ref:`covariance` module now has a robust estimator of + covariance, the Minimum Covariance Determinant estimator. + +- Cluster evaluation metrics in :mod:`metrics.cluster` have been refactored + but the changes are backwards compatible. They have been moved to the + :mod:`metrics.cluster.supervised`, along with + :mod:`metrics.cluster.unsupervised` which contains the Silhouette + Coefficient. + +- The ``permutation_test_score`` function now behaves the same way as + ``cross_val_score`` (i.e. uses the mean score across the folds.) + +- Cross Validation generators now use integer indices (``indices=True``) + by default instead of boolean masks. This make it more intuitive to + use with sparse matrix data. + +- The functions used for sparse coding, ``sparse_encode`` and + ``sparse_encode_parallel`` have been combined into + :func:`sklearn.decomposition.sparse_encode`, and the shapes of the arrays + have been transposed for consistency with the matrix factorization setting, + as opposed to the regression setting. + +- Fixed an off-by-one error in the SVMlight/LibSVM file format handling; + files generated using :func:`sklearn.datasets.dump_svmlight_file` should be + re-generated. (They should continue to work, but accidentally had one + extra column of zeros prepended.) + +- ``BaseDictionaryLearning`` class replaced by ``SparseCodingMixin``. + +- :func:`sklearn.utils.extmath.fast_svd` has been renamed + :func:`sklearn.utils.extmath.randomized_svd` and the default + oversampling is now fixed to 10 additional random vectors instead + of doubling the number of components to extract. The new behavior + follows the reference paper. 
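
A minimal sketch of the renamed helper (the random matrix is purely
illustrative)::

    import numpy as np
    from sklearn.utils.extmath import randomized_svd

    A = np.random.RandomState(0).rand(60, 40)
    # Requests 5 singular triplets; 10 extra oversampling vectors are
    # used by default under the new behavior described above.
    U, s, Vt = randomized_svd(A, n_components=5, random_state=0)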
+
+
+People
+------
+
+The following people contributed to scikit-learn since the last release:
+
+ * 246 `Andreas Müller`_
+ * 242 `Olivier Grisel`_
+ * 220 `Gilles Louppe`_
+ * 183 `Brian Holt`_
+ * 166 `Gael Varoquaux`_
+ * 144 `Lars Buitinck`_
+ * 73 `Vlad Niculae`_
+ * 65 `Peter Prettenhofer`_
+ * 64 `Fabian Pedregosa`_
+ * 60 Robert Layton
+ * 55 `Mathieu Blondel`_
+ * 52 `Jake Vanderplas`_
+ * 44 Noel Dawe
+ * 38 `Alexandre Gramfort`_
+ * 24 :user:`Virgile Fritsch `
+ * 23 `Satrajit Ghosh`_
+ * 3 Jan Hendrik Metzen
+ * 3 Kenneth C. Arnold
+ * 3 Shiqiao Du
+ * 3 Tim Sheerman-Chase
+ * 3 `Yaroslav Halchenko`_
+ * 2 Bala Subrahmanyam Varanasi
+ * 2 DraXus
+ * 2 Michael Eickenberg
+ * 1 Bogdan Trach
+ * 1 Félix-Antoine Fortin
+ * 1 Juan Manuel Caicedo Carvajal
+ * 1 Nelle Varoquaux
+ * 1 `Nicolas Pinto`_
+ * 1 Tiziano Zito
+ * 1 Xinfan Meng
+
+
+
+.. _changes_0_9:
+
+Version 0.9
+===========
+
+**September 21, 2011**
+
+scikit-learn 0.9 was released in September 2011, three months after the 0.8
+release, and includes the new modules :ref:`manifold` and
+:ref:`dirichlet_process`, as well as several new algorithms and documentation
+improvements.
+
+This release also includes the dictionary-learning work developed by
+`Vlad Niculae`_ as part of the `Google Summer of Code
+`_ program.
+
+
+
+.. |banner1| image:: ./auto_examples/manifold/images/thumb/sphx_glr_plot_compare_methods_thumb.png
+   :target: auto_examples/manifold/plot_compare_methods.html
+
+.. |banner2| image:: ./auto_examples/linear_model/images/thumb/sphx_glr_plot_omp_thumb.png
+   :target: auto_examples/linear_model/plot_omp.html
+
+.. |banner3| image:: ./auto_examples/decomposition/images/thumb/sphx_glr_plot_kernel_pca_thumb.png
+   :target: auto_examples/decomposition/plot_kernel_pca.html
+
+.. |center-div| raw:: html
+
+    <div style="text-align: center; margin: 0px 0 -5px 0;">
+
+.. |end-div| raw:: html
+
+    </div>
+
+
+|center-div| |banner2| |banner1| |banner3| |end-div|
+
+Changelog
+---------
+
+- New :ref:`manifold` module by `Jake Vanderplas`_ and
+  `Fabian Pedregosa`_.
+
+- New :ref:`Dirichlet Process ` Gaussian Mixture
+  Model by `Alexandre Passos`_.
+
+- :ref:`neighbors` module refactoring by `Jake Vanderplas`_:
+  general refactoring, support for sparse matrices in input, speed and
+  documentation improvements. See the next section for a full list of API
+  changes.
+
+- Improvements on the :ref:`feature_selection` module by
+  `Gilles Louppe`_: refactoring of the RFE classes, documentation
+  rewrite, increased efficiency and minor API changes.
+
+- :ref:`SparsePCA` by `Vlad Niculae`_, `Gael Varoquaux`_ and
+  `Alexandre Gramfort`_.
+
+- Printing an estimator now behaves independently of architecture
+  and Python version thanks to :user:`Jean Kossaifi `.
+
+- :ref:`Loader for libsvm/svmlight format ` by
+  `Mathieu Blondel`_ and `Lars Buitinck`_.
+
+- Documentation improvements: thumbnails in
+  the example gallery by `Fabian Pedregosa`_.
+
+- Important bugfixes in the :ref:`svm` module (segfaults, bad
+  performance) by `Fabian Pedregosa`_.
+
+- Added :ref:`multinomial_naive_bayes` and :ref:`bernoulli_naive_bayes`
+  by `Lars Buitinck`_.
+
+- Text feature extraction optimizations by Lars Buitinck.
+
+- Chi-Square feature selection
+  (:func:`feature_selection.univariate_selection.chi2`) by `Lars Buitinck`_.
+
+- :ref:`sample_generators` module refactoring by `Gilles Louppe`_.
+
+- :ref:`multiclass` by `Mathieu Blondel`_.
+
+- Ball tree rewrite by `Jake Vanderplas`_.
+
+- Implementation of the :ref:`dbscan` algorithm by Robert Layton.
+
+- K-means ``predict`` and ``transform`` by Robert Layton.
+
+- Preprocessing module refactoring by `Olivier Grisel`_.
+
+- Faster mean shift by Conrad Lee.
+
+- New ``Bootstrap``, :ref:`ShuffleSplit` and various other
+  improvements in cross validation schemes by `Olivier Grisel`_ and
+  `Gael Varoquaux`_.
+
+- Adjusted Rand index and V-Measure clustering evaluation metrics by
+  `Olivier Grisel`_.
+
+- Added :class:`Orthogonal Matching Pursuit ` by `Vlad Niculae`_.
+
+- Added 2D-patch extractor utilities in the :ref:`feature_extraction`
+  module by `Vlad Niculae`_.
+
+- Implementation of :class:`linear_model.LassoLarsCV`
+  (cross-validated Lasso solver using the Lars algorithm) and
+  :class:`linear_model.LassoLarsIC` (BIC/AIC model
+  selection in Lars) by `Gael Varoquaux`_
+  and `Alexandre Gramfort`_.
+
+- Scalability improvements to :func:`metrics.roc_curve` by Olivier Hervieu.
+
+- Distance helper functions :func:`metrics.pairwise.pairwise_distances`
+  and :func:`metrics.pairwise.pairwise_kernels` by Robert Layton.
+
+- :class:`Mini-Batch K-Means ` by Nelle Varoquaux and
+  Peter Prettenhofer.
+
+- :ref:`mldata` utilities by Pietro Berkes.
+
+- :ref:`olivetti_faces` by `David Warde-Farley`_.
+
+
+API changes summary
+-------------------
+
+Here are the code migration instructions when upgrading from scikit-learn
+version 0.8:
+
+- The ``scikits.learn`` package was renamed ``sklearn``. There is
+  still a ``scikits.learn`` package alias for backward compatibility.
+
+  Third-party projects with a dependency on scikit-learn 0.9+ should
+  upgrade their codebase. For instance, under Linux / MacOSX just run
+  (make a backup first!)::
+
+    find -name "*.py" | xargs sed -i 's/\bscikits.learn\b/sklearn/g'
+
+- Estimators no longer accept model parameters as ``fit`` arguments:
+  instead all parameters must only be passed as constructor
+  arguments or using the now public ``set_params`` method inherited
+  from :class:`base.BaseEstimator`.
+
+  Some estimators can still accept keyword arguments on ``fit``,
+  but this is restricted to data-dependent values (e.g. a Gram matrix
+  or an affinity matrix that is precomputed from the ``X`` data matrix).
+
+- The ``cross_val`` package has been renamed to ``cross_validation``,
+  although there is also a ``cross_val`` package alias in place for
+  backward compatibility.
+
+  Third-party projects with a dependency on scikit-learn 0.9+ should
+  upgrade their codebase. For instance, under Linux / MacOSX just run
+  (make a backup first!)::
+
+    find -name "*.py" | xargs sed -i 's/\bcross_val\b/cross_validation/g'
+
+- The ``score_func`` argument of the
+  ``sklearn.cross_validation.cross_val_score`` function is now expected
+  to accept ``y_test`` and ``y_predicted`` as its only arguments for
+  classification and regression tasks, or ``X_test`` for unsupervised
+  estimators.
+
+- The ``gamma`` parameter for support vector machine algorithms is set
+  to ``1 / n_features`` by default, instead of ``1 / n_samples``.
+
+- The ``sklearn.hmm`` module has been marked as orphaned: it will be removed
+  from scikit-learn in version 0.11 unless someone steps up to
+  contribute documentation, examples and fix lurking numerical
+  stability issues.
+
+- ``sklearn.neighbors`` has been made into a submodule. The two previously
+  available estimators, ``NeighborsClassifier`` and ``NeighborsRegressor``,
+  have been marked as deprecated. Their functionality has been divided
+  among five new classes: ``NearestNeighbors`` for unsupervised neighbors
+  searches, ``KNeighborsClassifier`` & ``RadiusNeighborsClassifier``
+  for supervised classification problems, and ``KNeighborsRegressor``
+  & ``RadiusNeighborsRegressor`` for supervised regression problems.
+
+- ``sklearn.ball_tree.BallTree`` has been moved to
+  ``sklearn.neighbors.BallTree``. Using the former will generate a warning.
+
+- ``sklearn.linear_model.LARS()`` has been renamed to
+  ``sklearn.linear_model.Lars()``, and related classes (LassoLARS,
+  LassoLARSCV, etc.) have been renamed accordingly.
+
+- All distance metrics and kernels in ``sklearn.metrics.pairwise`` now have a Y
+  parameter, which by default is None. If not given, the result is the distance
+  (or kernel similarity) between each pair of samples in X. If given, the
+  result is the pairwise distance (or kernel similarity) between samples in X
+  and Y.
+
+- ``sklearn.metrics.pairwise.l1_distance`` is now called ``manhattan_distance``,
+  and by default returns the pairwise distance. For the component-wise distance,
+  set the parameter ``sum_over_features`` to ``False``.
+
+Backward compatibility package aliases and other deprecated classes and
+functions will be removed in version 0.11.
+
+
+People
+------
+
+38 people contributed to this release.
+
+- 387 `Vlad Niculae`_
+- 320 `Olivier Grisel`_
+- 192 `Lars Buitinck`_
+- 179 `Gael Varoquaux`_
+- 168 `Fabian Pedregosa`_ (`INRIA`_, `Parietal Team`_)
+- 127 `Jake Vanderplas`_
+- 120 `Mathieu Blondel`_
+- 85 `Alexandre Passos`_
+- 67 `Alexandre Gramfort`_
+- 57 `Peter Prettenhofer`_
+- 56 `Gilles Louppe`_
+- 42 Robert Layton
+- 38 Nelle Varoquaux
+- 32 :user:`Jean Kossaifi `
+- 30 Conrad Lee
+- 22 Pietro Berkes
+- 18 andy
+- 17 David Warde-Farley
+- 12 Brian Holt
+- 11 Robert
+- 8 Amit Aides
+- 8 :user:`Virgile Fritsch `
+- 7 `Yaroslav Halchenko`_
+- 6 Salvatore Masecchia
+- 5 Paolo Losi
+- 4 Vincent Schut
+- 3 Alexis Metaireau
+- 3 Bryan Silverthorn
+- 3 `Andreas Müller`_
+- 2 Minwoo Jake Lee
+- 1 Emmanuelle Gouillart
+- 1 Keith Goodman
+- 1 Lucas Wiman
+- 1 `Nicolas Pinto`_
+- 1 Thouis (Ray) Jones
+- 1 Tim Sheerman-Chase
+
+
+.. _changes_0_8:
+
+Version 0.8
+===========
+
+**May 11, 2011**
+
+scikit-learn 0.8 was released in May 2011, one month after the first
+"international" `scikit-learn coding sprint
+`_ and is
+marked by the inclusion of important modules (:ref:`hierarchical_clustering`,
+:ref:`cross_decomposition` and :ref:`NMF`), by initial support for Python 3,
+and by important enhancements and bug fixes.
+
+
+Changelog
+---------
+
+Several new modules were introduced during this release:
+
+- New :ref:`hierarchical_clustering` module by Vincent Michel,
+  `Bertrand Thirion`_, `Alexandre Gramfort`_ and `Gael Varoquaux`_.
+
+- :ref:`kernel_pca` implementation by `Mathieu Blondel`_.
+
+- :ref:`labeled_faces_in_the_wild` by `Olivier Grisel`_.
+
+- New :ref:`cross_decomposition` module by `Edouard Duchesnay`_.
+
+- :ref:`NMF` module by `Vlad Niculae`_.
+
+- Implementation of the :ref:`oracle_approximating_shrinkage` algorithm by
+  :user:`Virgile Fritsch ` in the :ref:`covariance` module.
+
+
+Some other modules benefited from significant improvements or cleanups.
+
+
+- Initial support for Python 3: builds and imports cleanly, with some
+  modules usable while others have failing tests, by `Fabian Pedregosa`_.
+
+- :class:`decomposition.PCA` is now usable from the Pipeline object by `Olivier Grisel`_.
+
+- Guide :ref:`performance-howto` by `Olivier Grisel`_.
+
+- Fixes for memory leaks in the libsvm bindings and a 64-bit safer BallTree
+  by Lars Buitinck.
+
+- Bug and style fixes in the :ref:`k_means` algorithm by Jan Schlüter.
+
+- Added a ``converged`` attribute to Gaussian Mixture Models by Vincent Schut.
+
+- Implemented ``transform`` and ``predict_log_proba`` in
+  :class:`discriminant_analysis.LinearDiscriminantAnalysis` by `Mathieu Blondel`_.
+
+- Refactoring in the :ref:`svm` module and bug fixes by `Fabian Pedregosa`_,
+  `Gael Varoquaux`_ and Amit Aides.
+
+- Refactored SGD module (removed code duplication, better variable naming),
+  added interface for sample weight by `Peter Prettenhofer`_.
+
+- Wrapped BallTree with Cython by Thouis (Ray) Jones.
+
+- Added function :func:`svm.l1_min_c` by Paolo Losi (a short usage sketch
+  follows this list).
+
+- Typos, doc style, etc. by `Yaroslav Halchenko`_, `Gael Varoquaux`_,
+  `Olivier Grisel`_, Yann Malet, `Nicolas Pinto`_, Lars Buitinck and
+  `Fabian Pedregosa`_.
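+
+To make the new helper concrete, here is a minimal usage sketch of
+:func:`svm.l1_min_c`; the iris data and the ``10 * c_min`` value are just
+illustrative choices::
+
+    from sklearn.datasets import load_iris
+    from sklearn.svm import LinearSVC, l1_min_c
+
+    X, y = load_iris(return_X_y=True)
+
+    # Smallest C at which an L1-penalized linear model keeps at least one
+    # non-zero coefficient; a grid of C values should start at this bound.
+    c_min = l1_min_c(X, y)
+    clf = LinearSVC(C=10 * c_min, penalty='l1', dual=False).fit(X, y)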
+
+
+People
+------
+
+People that made this release possible, preceded by number of commits:
+
+
+- 159 `Olivier Grisel`_
+- 96 `Gael Varoquaux`_
+- 96 `Vlad Niculae`_
+- 94 `Fabian Pedregosa`_
+- 36 `Alexandre Gramfort`_
+- 32 Paolo Losi
+- 31 `Edouard Duchesnay`_
+- 30 `Mathieu Blondel`_
+- 25 `Peter Prettenhofer`_
+- 22 `Nicolas Pinto`_
+- 11 :user:`Virgile Fritsch `
+- 7 Lars Buitinck
+- 6 Vincent Michel
+- 5 `Bertrand Thirion`_
+- 4 Thouis (Ray) Jones
+- 4 Vincent Schut
+- 3 Jan Schlüter
+- 2 Julien Miotte
+- 2 `Matthieu Perrot`_
+- 2 Yann Malet
+- 2 `Yaroslav Halchenko`_
+- 1 Amit Aides
+- 1 `Andreas Müller`_
+- 1 Feth Arezki
+- 1 Meng Xinfan
+
+
+.. _changes_0_7:
+
+Version 0.7
+===========
+
+**March 2, 2011**
+
+scikit-learn 0.7 was released in March 2011, roughly three months
+after the 0.6 release. This release is marked by the speed
+improvements in existing algorithms like k-Nearest Neighbors and
+K-Means, and by the inclusion of an efficient algorithm for
+computing the Ridge Generalized Cross Validation solution. Unlike the
+preceding release, no new modules were added to this release.
+
+Changelog
+---------
+
+- Performance improvements for Gaussian Mixture Model sampling [Jan
+  Schlüter].
+
+- Implementation of efficient leave-one-out cross-validated Ridge in
+  :class:`linear_model.RidgeCV` [`Mathieu Blondel`_].
+
+- Better handling of collinearity and early stopping in
+  :func:`linear_model.lars_path` [`Alexandre Gramfort`_ and `Fabian
+  Pedregosa`_].
+
+- Fixes for liblinear ordering of labels and sign of coefficients
+  [Dan Yamins, Paolo Losi, `Mathieu Blondel`_ and `Fabian Pedregosa`_].
+
+- Performance improvements for Nearest Neighbors algorithm in
+  high-dimensional spaces [`Fabian Pedregosa`_].
+
+- Performance improvements for :class:`cluster.KMeans` [`Gael
+  Varoquaux`_ and `James Bergstra`_].
+
+- Sanity checks for SVM-based classes [`Mathieu Blondel`_].
+
+- Refactoring of :class:`neighbors.NeighborsClassifier` and
+  :func:`neighbors.kneighbors_graph`: added different algorithms for
+  the k-Nearest Neighbor Search and implemented a more stable
+  algorithm for finding barycenter weights. Also added some
+  developer documentation for this module, see
+  `notes_neighbors
+  `_ for more information [`Fabian Pedregosa`_].
+
+- Documentation improvements: Added :class:`pca.RandomizedPCA` and
+  :class:`linear_model.LogisticRegression` to the class
+  reference. Also added references of matrices used for clustering
+  and other fixes [`Gael Varoquaux`_, `Fabian Pedregosa`_, `Mathieu
+  Blondel`_, `Olivier Grisel`_, Virgile Fritsch, Emmanuelle
+  Gouillart].
+
+- Bound ``decision_function`` in classes that make use of liblinear_,
+  for dense and sparse variants, like :class:`svm.LinearSVC` or
+  :class:`linear_model.LogisticRegression` [`Fabian Pedregosa`_].
+
+- Performance and API improvements to
+  :func:`metrics.euclidean_distances` and to
+  :class:`pca.RandomizedPCA` [`James Bergstra`_].
+
+- Fix compilation issues under NetBSD [Kamel Ibn Hassen Derouiche].
+
+- Allow input sequences of different lengths in :class:`hmm.GaussianHMM`
+  [`Ron Weiss`_].
+
+- Fix bug in affinity propagation caused by incorrect indexing [Xinfan Meng].
+
+
+People
+------
+
+People that made this release possible, preceded by number of commits:
+
+- 85 `Fabian Pedregosa`_
+- 67 `Mathieu Blondel`_
+- 20 `Alexandre Gramfort`_
+- 19 `James Bergstra`_
+- 14 Dan Yamins
+- 13 `Olivier Grisel`_
+- 12 `Gael Varoquaux`_
+- 4 `Edouard Duchesnay`_
+- 4 `Ron Weiss`_
+- 2 Satrajit Ghosh
+- 2 Vincent Dubourg
+- 1 Emmanuelle Gouillart
+- 1 Kamel Ibn Hassen Derouiche
+- 1 Paolo Losi
+- 1 VirgileFritsch
+- 1 `Yaroslav Halchenko`_
+- 1 Xinfan Meng
+
+
+.. _changes_0_6:
+
+Version 0.6
+===========
+
+**December 21, 2010**
+
+scikit-learn 0.6 was released in December 2010. It is marked by the
+inclusion of several new modules and a general renaming of old
+ones. It is also marked by the inclusion of new examples, including
+applications to real-world datasets.
+
+
+Changelog
+---------
+
+- New `stochastic gradient
+  `_ descent
+  module by Peter Prettenhofer. The module comes with complete
+  documentation and examples.
+
+- Improved svm module: memory consumption has been reduced by 50%,
+  heuristic to automatically set class weights, possibility to
+  assign weights to samples (see
+  :ref:`sphx_glr_auto_examples_svm_plot_weighted_samples.py` for an example).
+
+- New :ref:`gaussian_process` module by Vincent Dubourg. This module
+  also has great documentation and some very neat examples. See
+  example_gaussian_process_plot_gp_regression.py or
+  example_gaussian_process_plot_gp_probabilistic_classification_after_regression.py
+  for a taste of what can be done.
+
+- It is now possible to use liblinear’s Multi-class SVC (option
+  ``multi_class`` in :class:`svm.LinearSVC`).
+
+- New features and performance improvements of text feature
+  extraction.
+
+- Improved sparse matrix support, both in main classes
+  (:class:`grid_search.GridSearchCV`) and in the sklearn.svm.sparse
+  and sklearn.linear_model.sparse modules.
+
+- Lots of cool new examples and a new section that uses real-world
+  datasets were created. These include:
+  :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py`,
+  :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py`,
+  :ref:`sphx_glr_auto_examples_applications_svm_gui.py`,
+  :ref:`sphx_glr_auto_examples_applications_wikipedia_principal_eigenvector.py` and
+  others.
+
+- Faster :ref:`least_angle_regression` algorithm. It is now 2x
+  faster than the R version in the worst case and up to 10x faster
+  in some cases.
+
+- Faster coordinate descent algorithm. In particular, the full path
+  version of lasso (:func:`linear_model.lasso_path`) is more than
+  200x faster than before.
+
+- It is now possible to get probability estimates from a
+  :class:`linear_model.LogisticRegression` model.
+
+- Module renaming: the ``glm`` module has been renamed to ``linear_model``,
+  the ``gmm`` module has been included in the more general ``mixture``
+  module, and the ``sgd`` module has been included in ``linear_model``.
+
+- Lots of bug fixes and documentation improvements.
+
+
+People
+------
+
+People that made this release possible, preceded by number of commits:
+
+ * 207 `Olivier Grisel`_
+ * 167 `Fabian Pedregosa`_
+ * 97 `Peter Prettenhofer`_
+ * 68 `Alexandre Gramfort`_
+ * 59 `Mathieu Blondel`_
+ * 55 `Gael Varoquaux`_
+ * 33 Vincent Dubourg
+ * 21 `Ron Weiss`_
+ * 9 Bertrand Thirion
+ * 3 `Alexandre Passos`_
+ * 3 Anne-Laure Fouque
+ * 2 Ronan Amicel
+ * 1 `Christian Osendorfer`_
+
+
+.. _changes_0_5:
+
+
+Version 0.5
+===========
+
+**October 11, 2010**
+
+Changelog
+---------
+
+New classes
+-----------
+
+- Support for sparse matrices in some classifiers of modules
+  ``svm`` and ``linear_model`` (see :class:`svm.sparse.SVC`,
+  :class:`svm.sparse.SVR`, :class:`svm.sparse.LinearSVC`,
+  :class:`linear_model.sparse.Lasso`, :class:`linear_model.sparse.ElasticNet`).
+
+- New :class:`pipeline.Pipeline` object to compose different estimators.
+
+- Recursive Feature Elimination routines in module
+  :ref:`feature_selection`.
+
+- Addition of various classes capable of cross validation in the
+  ``linear_model`` module (:class:`linear_model.LassoCV`,
+  :class:`linear_model.ElasticNetCV`, etc.).
+
+- New, more efficient LARS algorithm implementation. The Lasso
+  variant of the algorithm is also implemented. See
+  :func:`linear_model.lars_path`, :class:`linear_model.Lars` and
+  :class:`linear_model.LassoLars`.
+
+- New Hidden Markov Models module (see classes
+  :class:`hmm.GaussianHMM`, :class:`hmm.MultinomialHMM`,
+  :class:`hmm.GMMHMM`).
+
+- New module ``feature_extraction`` (see :ref:`class reference
+  `).
+
+- New FastICA algorithm in module ``sklearn.fastica``.
+
+
+Documentation
+-------------
+
+- Improved documentation for many modules, now separating
+  narrative documentation from the class reference. As an example,
+  see `documentation for the SVM module
+  `_ and the
+  complete `class reference
+  `_.
+
+Fixes
+-----
+
+- API changes: variable names now adhere to PEP-8 and are more
+  meaningful.
+
+- Fixes for the svm module to run in a shared memory context
+  (multiprocessing).
+
+- It is again possible to generate LaTeX (and thus PDF) from the
+  Sphinx docs.
+
+Examples
+--------
+
+- New examples using some of the mlcomp datasets:
+  ``sphx_glr_auto_examples_mlcomp_sparse_document_classification.py`` (since removed) and
+  :ref:`sphx_glr_auto_examples_text_document_classification_20newsgroups.py`.
+
+- Many more examples. `See here
+  `_
+  for the full list of examples.
+
+
+External dependencies
+---------------------
+
+- Joblib is now a dependency of this package, although it is
+  shipped with scikit-learn (as ``sklearn.externals.joblib``).
+
+Removed modules
+---------------
+
+- Module ``ann`` (Artificial Neural Networks) has been removed from
+  the distribution. Users wanting this sort of algorithm should
+  take a look at pybrain.
+
+Misc
+----
+
+- New Sphinx theme for the web page.
+
+
+Authors
+-------
+
+The following is a list of authors for this release, preceded by
+number of commits:
+
+ * 262 Fabian Pedregosa
+ * 240 Gael Varoquaux
+ * 149 Alexandre Gramfort
+ * 116 Olivier Grisel
+ * 40 Vincent Michel
+ * 38 Ron Weiss
+ * 23 Matthieu Perrot
+ * 10 Bertrand Thirion
+ * 9 VirgileFritsch
+ * 7 Yaroslav Halchenko
+ * 6 Edouard Duchesnay
+ * 4 Mathieu Blondel
+ * 1 Ariel Rokem
+ * 1 Matthieu Brucher
+
+Version 0.4
+===========
+
+**August 26, 2010**
+
+Changelog
+---------
+
+Major changes in this release include:
+
+- Coordinate Descent algorithm (Lasso, ElasticNet) refactoring &
+  speed improvements (roughly 100x faster).
+
+- Coordinate Descent refactoring (and bug fixing) for consistency
+  with R's GLMNET package.
+
+- New metrics module.
+
+- New GMM module contributed by Ron Weiss.
+
+- Implementation of the LARS algorithm (without the Lasso variant for now).
+
+- ``feature_selection`` module redesign.
+
+- Migration to Git as the version control system.
+
+- Removal of the obsolete ``attrselect`` module.
+
+- Renaming of private compiled extensions (added underscore).
+
+- Removal of legacy unmaintained code.
+ +- Documentation improvements (both docstring and rst). + +- Improvement of the build system to (optionally) link with MKL. + Also, provide a lite BLAS implementation in case no system-wide BLAS is + found. + +- Lots of new examples. + +- Many, many bug fixes ... + + +Authors +------- + +The committer list for this release is the following (preceded by number +of commits): + + * 143 Fabian Pedregosa + * 35 Alexandre Gramfort + * 34 Olivier Grisel + * 11 Gael Varoquaux + * 5 Yaroslav Halchenko + * 2 Vincent Michel + * 1 Chris Filo Gorgolewski + + +Earlier versions +================ + +Earlier versions included contributions by Fred Mailhot, David Cooke, +David Huard, Dave Morrill, Ed Schofield, Travis Oliphant, Pearu Peterson. + +.. _Olivier Grisel: https://twitter.com/ogrisel + +.. _Gael Varoquaux: http://gael-varoquaux.info + +.. _Alexandre Gramfort: http://alexandre.gramfort.net + +.. _Fabian Pedregosa: http://fa.bianp.net + +.. _Mathieu Blondel: http://www.mblondel.org + +.. _James Bergstra: http://www-etud.iro.umontreal.ca/~bergstrj/ + +.. _liblinear: http://www.csie.ntu.edu.tw/~cjlin/liblinear/ + +.. _Yaroslav Halchenko: http://www.onerussian.com/ + +.. _Vlad Niculae: http://vene.ro + +.. _Edouard Duchesnay: https://sites.google.com/site/duchesnay/home + +.. _Peter Prettenhofer: https://sites.google.com/site/peterprettenhofer/ + +.. _Alexandre Passos: http://atpassos.me + +.. _Nicolas Pinto: https://twitter.com/npinto + +.. _Bertrand Thirion: https://team.inria.fr/parietal/bertrand-thirions-page + +.. _Andreas Müller: http://peekaboo-vision.blogspot.com + +.. _Matthieu Perrot: http://brainvisa.info/biblio/lnao/en/Author/PERROT-M.html + +.. _Jake Vanderplas: http://staff.washington.edu/jakevdp/ + +.. _Gilles Louppe: http://www.montefiore.ulg.ac.be/~glouppe/ + +.. _INRIA: http://www.inria.fr + +.. _Parietal Team: http://parietal.saclay.inria.fr/ + +.. _David Warde-Farley: http://www-etud.iro.umontreal.ca/~wardefar/ + +.. _Brian Holt: http://personal.ee.surrey.ac.uk/Personal/B.Holt + +.. _Satrajit Ghosh: http://www.mit.edu/~satra/ + +.. _Robert Layton: https://twitter.com/robertlayton + +.. _Scott White: https://twitter.com/scottblanc + +.. _David Marek: http://www.davidmarek.cz/ + +.. _Christian Osendorfer: https://osdf.github.io + +.. _Arnaud Joly: http://www.ajoly.org + +.. _Rob Zinkov: http://zinkov.com + +.. _Joel Nothman: http://joelnothman.com + +.. _Nicolas Trésegnie : http://nicolastr.com/ + +.. _Kemal Eren: http://www.kemaleren.com + +.. _Yann Dauphin: http://ynd.github.io/ + +.. _Yannick Schwartz: https://team.inria.fr/parietal/schwarty/ + +.. _Kyle Kastner: http://kastnerkyle.github.io + +.. _Daniel Nouri: http://danielnouri.org + +.. _Manoj Kumar: https://manojbits.wordpress.com + +.. _Luis Pedro Coelho: http://luispedro.org + +.. _Fares Hedyati: http://www.eecs.berkeley.edu/~fareshed + +.. _Antony Lee: https://www.ocf.berkeley.edu/~antonyl/ + +.. _Martin Billinger: http://tnsre.embs.org/author/martinbillinger + +.. _Matteo Visconti di Oleggio Castello: http://www.mvdoc.me + +.. _Trevor Stephens: http://trevorstephens.com/ + +.. _Jan Hendrik Metzen: https://jmetzen.github.io/ + +.. _Will Dawson: http://www.dawsonresearch.com + +.. _Andrew Tulloch: http://tullo.ch/ + +.. _Hanna Wallach: http://dirichlet.net/ + +.. _Yan Yi: http://seowyanyi.org + +.. _Hervé Bredin: http://herve.niderb.fr/ + +.. _Eric Martin: http://www.ericmart.in + +.. _Nicolas Goix: https://perso.telecom-paristech.fr/~goix/ + +.. _Sebastian Raschka: http://sebastianraschka.com + +.. 
_Brian McFee: https://bmcfee.github.io
+
+.. _Valentin Stolbunov: http://www.vstolbunov.com
+
+.. _Jaques Grobler: https://github.com/jaquesgrobler
+
+.. _Lars Buitinck: https://github.com/larsmans
+
+.. _Loic Esteve: https://github.com/lesteve
+
+.. _Noel Dawe: https://github.com/ndawe
+
+.. _Raghav RV: https://github.com/raghavrv
+
+.. _Tom Dupre la Tour: https://github.com/TomDLT
+
+.. _Nelle Varoquaux: https://github.com/nellev
+
+.. _Bing Tian Dai: https://github.com/btdai
+
+.. _Dylan Werner-Meier: https://github.com/unautre
+
+.. _Alyssa Batula: https://github.com/abatula
+
+.. _Srivatsan Ramesh: https://github.com/srivatsan-ramesh
+
+.. _Ron Weiss: http://www.ee.columbia.edu/~ronw
+
+.. _Kathleen Chen: https://github.com/kchen17
+
+.. _Vincent Pham: https://github.com/vincentpham1991
+
+.. _Denis Engemann: http://denis-engemann.de
+
+.. _Anish Shah: https://github.com/AnishShah
+
+.. _Neeraj Gangwar: http://neerajgangwar.in
+
+.. _Arthur Mensch: https://amensch.fr
+
+.. _Ivan Nazarov: https://github.com/ivannz

From d86b3fdb53c5168d3fe3ee10cc46c0703bccc93b Mon Sep 17 00:00:00 2001
From: Ivan Nazarov 
Date: Sun, 9 Apr 2017 02:27:26 +0300
Subject: [PATCH 03/41] Update to address #8711

---
 doc/whats_new.rst             |  1 +
 sklearn/svm/_classes.py       | 20 ++++++++++++++++++++
 sklearn/svm/tests/test_svm.py |  3 ++-
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/doc/whats_new.rst b/doc/whats_new.rst
index a2e79cb930838..6baab9d087f36 100644
--- a/doc/whats_new.rst
+++ b/doc/whats_new.rst
@@ -243,6 +243,7 @@ Model selection and evaluation
    cumulative gain (NDCG).
    :issue:`7739` by :user:`David Gasquez `.
    By `Arthur Mensch`_.
+
 - Added the :class:`svm.SVDD` class for novelty detection based on
   soft minimal volume hypersphere around the sample data.
   By `Ivan Nazarov`_.
diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index 918e7f3f8a116..d19ba44dad173 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -1964,3 +1964,23 @@ def decision_function(self, X):
         """
         dec = self._decision_function(X)
         return dec
+
+    def predict(self, X):
+        """
+        Perform classification on samples in X.
+
+        For a one-class model, +1 or -1 is returned.
+
+        Parameters
+        ----------
+        X : {array-like, sparse matrix}, shape (n_samples, n_features)
+            For kernel="precomputed", the expected shape of X is
+            [n_samples_test, n_samples_train]
+
+        Returns
+        -------
+        y_pred : array, shape (n_samples,)
+            Class labels for samples in X.
+ """ + y = super(SVDD, self).predict(X) + return np.asarray(y, dtype=np.intp) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index b3b864826c546..a19285e4b8728 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -368,7 +368,8 @@ def test_svdd(): clf.fit(X) pred = clf.predict(T) - assert_array_almost_equal(pred, [-1, -1, -1]) + assert_array_equal(pred, [-1, -1, -1]) + assert_equal(pred.dtype, np.dtype('intp')) assert_array_almost_equal(clf.intercept_, [0.491], decimal=3) assert_array_almost_equal(clf.dual_coef_, [[0.632, 0.233, 0.633, 0.234, 0.632, 0.633]], From 766a3444aacccd7f2b61e84df15476e78d9bb284 Mon Sep 17 00:00:00 2001 From: Ivan Date: Fri, 9 Jun 2017 01:15:16 +0300 Subject: [PATCH 04/41] docstring fix reflecting #9048 --- sklearn/svm/_classes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index d19ba44dad173..2abefe1dd8d7d 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1951,7 +1951,9 @@ def fit(self, X, y=None, sample_weight=None, **params): return self def decision_function(self, X): - """Distance of the samples X to the separating hyperplane. + """Signed distance to the enveloping hypersphere. + + Signed distance is positive for an inlier and negative for an outlier. Parameters ---------- From 4093763bf972c6ae15d4832626f43a909ed8c283 Mon Sep 17 00:00:00 2001 From: Ivan Date: Fri, 15 Sep 2017 04:16:51 +0200 Subject: [PATCH 05/41] updated what's new according to #9505 --- doc/whats_new.rst | 5783 ------------------------------- doc/whats_new/_contributors.rst | 4 +- doc/whats_new/v0.20.rst | 4 + 3 files changed, 7 insertions(+), 5784 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 6baab9d087f36..3354a6b13f32b 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -28,5786 +28,3 @@ on libraries.io to be notified when new versions are released. Version 0.14 Version 0.13 Older Versions - -Version 0.20 (under development) -================================ - -Changed models --------------- - -The following estimators and functions, when fit with the same data and -parameters, may produce different models from the previous version. This often -occurs due to changes in the modelling logic (bug fixes or enhancements), or in -random sampling procedures. - -- :class:`decomposition.IncrementalPCA` in Python 2 (bug fix) - -Details are listed in the changelog below. - -(While we are trying to better inform users by providing this information, we -cannot assure that this list is complete.) - -Changelog ---------- - -New features -............ - -Classifiers and regressors - -- :class:`ensemble.GradientBoostingClassifier` and - :class:`ensemble.GradientBoostingRegressor` now support early stopping - via ``n_iter_no_change``, ``validation_fraction`` and ``tol``. :issue:`7071` - by `Raghav RV`_ - -- Added :class:`naive_bayes.ComplementNB`, which implements the Complement - Naive Bayes classifier described in Rennie et al. (2003). - By :user:`Michael A. Alcorn `. - -Enhancements -............ - -Model evaluation and meta-estimators - -- A scorer based on :func:`metrics.brier_score_loss` is also available. - :issue:`9521` by :user:`Hanmin Qin `. - -Bug fixes -......... - -Decomposition, manifold learning and clustering - -- Fix for uninformative error in :class:`decomposition.incremental_pca`: - now an error is raised if the number of components is larger than the - chosen batch size. 
The ``n_components=None`` case was adapted accordingly. - :issue:`6452`. By :user:`Wally Gauze `. - -- Fixed a bug where the ``partial_fit`` method of - :class:`decomposition.IncrementalPCA` used integer division instead of float - division on Python 2 versions. :issue:`9492` by - :user:`James Bourbeau `. - -Version 0.19 -============ - -**Release Candidate (0.19b2) July 17, 2017** - -Highlights ----------- - -We are excited to release a number of great new features including -:class:`neighbors.LocalOutlierFactor` for anomaly detection, -:class:`preprocessing.QuantileTransformer` for robust feature transformation, -and the :class:`multioutput.ClassifierChain` meta-estimator to simply account -for dependencies between classes in multilabel problems. We have some new -algorithms in existing estimators, such as multiplicative update in -:class:`decomposition.NMF` and multinomial -:class:`linear_model.LogisticRegression` with L1 loss (use ``solver='saga'``). - -Cross validation is now able to return the results from multiple metric -evaluations. The new :func:`model_selection.cross_validate` can return many -scores on the test data as well as training set performance and timings, and we -have extended the ``scoring`` and ``refit`` parameters for grid/randomized -search :ref:`to handle multiple metrics `. - -You can also learn faster. For instance, the :ref:`new option to cache -transformations ` in :class:`pipeline.Pipeline` makes grid -search over pipelines including slow transformations much more efficient. And -you can predict faster: if you're sure you know what you're doing, you can turn -off validating that the input is finite using :func:`config_context`. - -We've made some important fixes too. We've fixed a longstanding implementation -error in :func:`metrics.average_precision_score`, so please be cautious with -prior results reported from that function. A number of errors in the -:class:`manifold.TSNE` implementation have been fixed, particularly in the -default Barnes-Hut approximation. :class:`semi_supervised.LabelSpreading` and -:class:`semi_supervised.LabelPropagation` have had substantial fixes. -LabelPropagation was previously broken. LabelSpreading should now correctly -respect its alpha parameter. - -Changed models --------------- - -The following estimators and functions, when fit with the same data and -parameters, may produce different models from the previous version. This often -occurs due to changes in the modelling logic (bug fixes or enhancements), or in -random sampling procedures. - -- :class:`cluster.KMeans` with sparse X and initial centroids given (bug fix) -- :class:`cross_decomposition.PLSRegression` - with ``scale=True`` (bug fix) -- :class:`ensemble.GradientBoostingClassifier` and - :class:`ensemble.GradientBoostingRegressor` where ``min_impurity_split`` is used (bug fix) -- gradient boosting ``loss='quantile'`` (bug fix) -- :class:`ensemble.IsolationForest` (bug fix) -- :class:`feature_selection.SelectFdr` (bug fix) -- :class:`linear_model.RANSACRegressor` (bug fix) -- :class:`linear_model.LassoLars` (bug fix) -- :class:`linear_model.LassoLarsIC` (bug fix) -- :class:`manifold.TSNE` (bug fix) -- :class:`neighbors.NearestCentroid` (bug fix) -- :class:`semi_supervised.LabelSpreading` (bug fix) -- :class:`semi_supervised.LabelPropagation` (bug fix) -- tree based models where ``min_weight_fraction_leaf`` is used (enhancement) - -Details are listed in the changelog below. 
-
-(While we are trying to better inform users by providing this information, we
-cannot assure that this list is complete.)
-
-Changelog
----------
-
-New features
-............
-
-Classifiers and regressors
-
-- Added :class:`multioutput.ClassifierChain` for multi-label
-  classification. By `Adam Kleczewski `_.
-
-- Added solver ``'saga'`` that implements the improved version of Stochastic
-  Average Gradient, in :class:`linear_model.LogisticRegression` and
-  :class:`linear_model.Ridge`. It allows the use of L1 penalty with
-  multinomial logistic loss, and behaves marginally better than 'sag'
-  during the first epochs of ridge and logistic regression.
-  :issue:`8446` by `Arthur Mensch`_.
-
-Other estimators
-
-- Added the :class:`neighbors.LocalOutlierFactor` class for anomaly
-  detection based on nearest neighbors.
-  :issue:`5279` by `Nicolas Goix`_ and `Alexandre Gramfort`_.
-
-- Added :class:`preprocessing.QuantileTransformer` class and
-  :func:`preprocessing.quantile_transform` function for feature
-  normalization based on quantiles.
-  :issue:`8363` by :user:`Denis Engemann `,
-  :user:`Guillaume Lemaitre `, `Olivier Grisel`_, `Raghav RV`_,
-  :user:`Thierry Guillemot `, and `Gael Varoquaux`_.
-
-- The new solver ``'mu'`` implements a Multiplicative Update in
-  :class:`decomposition.NMF`, allowing the optimization of all
-  beta-divergences, including the Frobenius norm, the generalized
-  Kullback-Leibler divergence and the Itakura-Saito divergence.
-  :issue:`5295` by `Tom Dupre la Tour`_.
-
-Model selection and evaluation
-
-- :class:`model_selection.GridSearchCV` and
-  :class:`model_selection.RandomizedSearchCV` now support simultaneous
-  evaluation of multiple metrics. Refer to the
-  :ref:`multimetric_grid_search` section of the user guide for more
-  information. :issue:`7388` by `Raghav RV`_.
-
-- Added the :func:`model_selection.cross_validate` which allows evaluation
-  of multiple metrics. This function returns a dict with more useful
-  information from cross-validation such as the train scores, fit times and
-  score times.
-  Refer to the :ref:`multimetric_cross_validation` section of the user guide
-  for more information. :issue:`7388` by `Raghav RV`_.
-
-- Added :func:`metrics.mean_squared_log_error`, which computes
-  the mean square error of the logarithmic transformation of targets,
-  particularly useful for targets with an exponential trend.
-  :issue:`7655` by :user:`Karan Desai `.
-
-- Added :func:`metrics.dcg_score` and :func:`metrics.ndcg_score`, which
-  compute Discounted cumulative gain (DCG) and Normalized discounted
-  cumulative gain (NDCG).
-  :issue:`7739` by :user:`David Gasquez `.
-
-- Added the :class:`svm.SVDD` class for novelty detection based on
-  soft minimal volume hypersphere around the sample data.
-  By `Ivan Nazarov`_.
-
-- Added the :class:`model_selection.RepeatedKFold` and
-  :class:`model_selection.RepeatedStratifiedKFold`.
-  :issue:`8120` by `Neeraj Gangwar`_.
-
-Miscellaneous
-
-- Validation that input data contains no NaN or inf can now be suppressed
-  using :func:`config_context`, at your own risk. This will save on runtime,
-  and may be particularly useful for prediction time. :issue:`7548` by
-  `Joel Nothman`_.
-
-- Added a test to ensure parameter listing in docstrings matches the
-  function/class signature. :issue:`9206` by `Alexandre Gramfort`_ and
-  `Raghav RV`_.
-
-Enhancements
-............
-
-Trees and ensembles
-
-- The ``min_weight_fraction_leaf`` constraint in tree construction is now
-  more efficient, taking a fast path to declare a node a leaf if its weight
-  is less than 2 * the minimum. Note that the constructed tree will be
-  different from previous versions where ``min_weight_fraction_leaf`` is
-  used. :issue:`7441` by :user:`Nelson Liu `.
-
-- :class:`ensemble.GradientBoostingClassifier` and :class:`ensemble.GradientBoostingRegressor`
-  now support sparse input for prediction.
-  :issue:`6101` by :user:`Ibraim Ganiev `.
-
-- :class:`ensemble.VotingClassifier` now allows changing estimators by using
-  :meth:`ensemble.VotingClassifier.set_params`. An estimator can also be
-  removed by setting it to ``None``.
-  :issue:`7674` by :user:`Yichuan Liu `.
-
-- :func:`tree.export_graphviz` now shows configurable number of decimal
-  places. :issue:`8698` by :user:`Guillaume Lemaitre `.
-
-- Added ``flatten_transform`` parameter to :class:`ensemble.VotingClassifier`
-  to change output shape of `transform` method to 2 dimensional.
-  :issue:`7794` by :user:`Ibraim Ganiev ` and
-  :user:`Herilalaina Rakotoarison `.
-
-Linear, kernelized and related models
-
-- :class:`linear_model.SGDClassifier`, :class:`linear_model.SGDRegressor`,
-  :class:`linear_model.PassiveAggressiveClassifier`,
-  :class:`linear_model.PassiveAggressiveRegressor` and
-  :class:`linear_model.Perceptron` now expose ``max_iter`` and
-  ``tol`` parameters, to handle convergence more precisely.
-  ``n_iter`` parameter is deprecated, and the fitted estimator exposes
-  a ``n_iter_`` attribute, with actual number of iterations before
-  convergence. :issue:`5036` by `Tom Dupre la Tour`_.
-
-- Added ``average`` parameter to perform weight averaging in
-  :class:`linear_model.PassiveAggressiveClassifier`. :issue:`4939`
-  by :user:`Andrea Esuli `.
-
-- :class:`linear_model.RANSACRegressor` no longer throws an error
-  when calling ``fit`` if no inliers are found in its first iteration.
-  Furthermore, causes of skipped iterations are tracked in newly added
-  attributes, ``n_skips_*``.
-  :issue:`7914` by :user:`Michael Horrell `.
-
-- In :class:`gaussian_process.GaussianProcessRegressor`, method ``predict``
-  is a lot faster with ``return_std=True``. :issue:`8591` by
-  :user:`Hadrien Bertrand `.
- -- Added ``return_std`` to ``predict`` method of - :class:`linear_model.ARDRegression` and - :class:`linear_model.BayesianRidge`. - :issue:`7838` by :user:`Sergey Feldman `. - -- Memory usage enhancements: Prevent cast from float32 to float64 in: - :class:`linear_model.MultiTaskElasticNet`; - :class:`linear_model.LogisticRegression` when using newton-cg solver; and - :class:`linear_model.Ridge` when using svd, sparse_cg, cholesky or lsqr - solvers. :issue:`8835`, :issue:`8061` by :user:`Joan Massich ` and :user:`Nicolas - Cordier ` and :user:`Thierry Guillemot `. - -Other predictors - -- Custom metrics for the :mod:`neighbors` binary trees now have - fewer constraints: they must take two 1d-arrays and return a float. - :issue:`6288` by `Jake Vanderplas`_. - -- ``algorithm='auto`` in :mod:`neighbors` estimators now chooses the most - appropriate algorithm for all input types and metrics. :issue:`9145` by - :user:`Herilalaina Rakotoarison ` and :user:`Reddy Chinthala - `. - -Decomposition, manifold learning and clustering - -- :class:`cluster.MiniBatchKMeans` and :class:`cluster.KMeans` - now use significantly less memory when assigning data points to their - nearest cluster center. :issue:`7721` by :user:`Jon Crall `. - -- :class:`decomposition.PCA`, :class:`decomposition.IncrementalPCA` and - :class:`decomposition.TruncatedSVD` now expose the singular values - from the underlying SVD. They are stored in the attribute - ``singular_values_``, like in :class:`decomposition.IncrementalPCA`. - :issue:`7685` by :user:`Tommy Löfstedt ` - -- :class:`decomposition.NMF` now faster when ``beta_loss=0``. - :issue:`9277` by :user:`hongkahjun`. - -- Memory improvements for method ``barnes_hut`` in :class:`manifold.TSNE` - :issue:`7089` by :user:`Thomas Moreau ` and `Olivier Grisel`_. - -- Optimization schedule improvements for Barnes-Hut :class:`manifold.TSNE` - so the results are closer to the one from the reference implementation - `lvdmaaten/bhtsne `_ by :user:`Thomas - Moreau ` and `Olivier Grisel`_. - -- Memory usage enhancements: Prevent cast from float32 to float64 in - :class:`decomposition.PCA` and - :func:`decomposition.randomized_svd_low_rank`. - :issue:`9067` by `Raghav RV`_. - -Preprocessing and feature selection - -- Added ``norm_order`` parameter to :class:`feature_selection.SelectFromModel` - to enable selection of the norm order when ``coef_`` is more than 1D. - :issue:`6181` by :user:`Antoine Wendlinger `. - -- Added ability to use sparse matrices in :func:`feature_selection.f_regression` - with ``center=True``. :issue:`8065` by :user:`Daniel LeJeune `. - -- Small performance improvement to n-gram creation in - :mod:`feature_extraction.text` by binding methods for loops and - special-casing unigrams. :issue:`7567` by :user:`Jaye Doepke ` - -- Relax assumption on the data for the - :class:`kernel_approximation.SkewedChi2Sampler`. Since the Skewed-Chi2 - kernel is defined on the open interval :math:`(-skewedness; +\infty)^d`, - the transform function should not check whether ``X < 0`` but whether ``X < - -self.skewedness``. :issue:`7573` by :user:`Romain Brault `. - -- Made default kernel parameters kernel-dependent in - :class:`kernel_approximation.Nystroem`. - :issue:`5229` by :user:`Saurabh Bansod ` and `Andreas Müller`_. - -Model evaluation and meta-estimators - -- :class:`pipeline.Pipeline` is now able to cache transformers - within a pipeline by using the ``memory`` constructor parameter. - :issue:`7990` by :user:`Guillaume Lemaitre `. 
- -- :class:`pipeline.Pipeline` steps can now be accessed as attributes of its - ``named_steps`` attribute. :issue:`8586` by :user:`Herilalaina - Rakotoarison `. - -- Added ``sample_weight`` parameter to :meth:`pipeline.Pipeline.score`. - :issue:`7723` by :user:`Mikhail Korobov `. - -- Added ability to set ``n_jobs`` parameter to :func:`pipeline.make_union`. - A ``TypeError`` will be raised for any other kwargs. :issue:`8028` - by :user:`Alexander Booth `. - -- :class:`model_selection.GridSearchCV`, - :class:`model_selection.RandomizedSearchCV` and - :func:`model_selection.cross_val_score` now allow estimators with callable - kernels which were previously prohibited. - :issue:`8005` by `Andreas Müller`_ . - -- :func:`model_selection.cross_val_predict` now returns output of the - correct shape for all values of the argument ``method``. - :issue:`7863` by :user:`Aman Dalmia `. - -- Added ``shuffle`` and ``random_state`` parameters to shuffle training - data before taking prefixes of it based on training sizes in - :func:`model_selection.learning_curve`. - :issue:`7506` by :user:`Narine Kokhlikyan `. - -- :class:`model_selection.StratifiedShuffleSplit` now works with multioutput - multiclass (or multilabel) data. :issue:`9044` by `Vlad Niculae`_. - -- Speed improvements to :class:`model_selection.StratifiedShuffleSplit`. - :issue:`5991` by :user:`Arthur Mensch ` and `Joel Nothman`_. - -- Add ``shuffle`` parameter to :func:`model_selection.train_test_split`. - :issue:`8845` by :user:`themrmax ` - -- :class:`multioutput.MultiOutputRegressor` and :class:`multioutput.MultiOutputClassifier` - now support online learning using ``partial_fit``. - :issue: `8053` by :user:`Peng Yu `. - -- Add ``max_train_size`` parameter to :class:`model_selection.TimeSeriesSplit` - :issue:`8282` by :user:`Aman Dalmia `. - -- More clustering metrics are now available through :func:`metrics.get_scorer` - and ``scoring`` parameters. :issue:`8117` by `Raghav RV`_. - -- A scorer based on :func:`metrics.explained_variance_score` is also available. - :issue:`9259` by :user:`Hanmin Qin `. - -Metrics - -- :func:`metrics.matthews_corrcoef` now support multiclass classification. - :issue:`8094` by :user:`Jon Crall `. - -- Add ``sample_weight`` parameter to :func:`metrics.cohen_kappa_score`. - :issue:`8335` by :user:`Victor Poughon `. - -Miscellaneous - -- :func:`utils.check_estimator` now attempts to ensure that methods - transform, predict, etc. do not set attributes on the estimator. - :issue:`7533` by :user:`Ekaterina Krivich `. - -- Added type checking to the ``accept_sparse`` parameter in - :mod:`utils.validation` methods. This parameter now accepts only boolean, - string, or list/tuple of strings. ``accept_sparse=None`` is deprecated and - should be replaced by ``accept_sparse=False``. - :issue:`7880` by :user:`Josh Karnofsky `. - -- Make it possible to load a chunk of an svmlight formatted file by - passing a range of bytes to :func:`datasets.load_svmlight_file`. - :issue:`935` by :user:`Olivier Grisel `. - -- :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor` - now accept non-finite features. :issue:`8931` by :user:`Attractadore`. - -Bug fixes -......... - -Trees and ensembles - -- Fixed a memory leak in trees when using trees with ``criterion='mae'``. - :issue:`8002` by `Raghav RV`_. - -- Fixed a bug where :class:`ensemble.IsolationForest` uses an - an incorrect formula for the average path length - :issue:`8549` by `Peter Wang `_. 
- -- Fixed a bug where :class:`ensemble.AdaBoostClassifier` throws - ``ZeroDivisionError`` while fitting data with single class labels. - :issue:`7501` by :user:`Dominik Krzeminski `. - -- Fixed a bug in :class:`ensemble.GradientBoostingClassifier` and - :class:`ensemble.GradientBoostingRegressor` where a float being compared - to ``0.0`` using ``==`` caused a divide by zero error. :issue:`7970` by - :user:`He Chen `. - -- Fix a bug where :class:`ensemble.GradientBoostingClassifier` and - :class:`ensemble.GradientBoostingRegressor` ignored the - ``min_impurity_split`` parameter. - :issue:`8006` by :user:`Sebastian Pölsterl `. - -- Fixed ``oob_score`` in :class:`ensemble.BaggingClassifier`. - :issue:`8936` by :user:`Michael Lewis ` - -- Fixed excessive memory usage in prediction for random forests estimators. - :issue:`8672` by :user:`Mike Benfield `. - -- Fixed a bug where ``sample_weight`` as a list broke random forests in Python 2 - :issue:`8068` by :user:`xor`. - -- Fixed a bug where :class:`ensemble.IsolationForest` fails when - ``max_features`` is less than 1. - :issue:`5732` by :user:`Ishank Gulati `. - -- Fix a bug where gradient boosting with ``loss='quantile'`` computed - negative errors for negative values of ``ytrue - ypred`` leading to wrong - values when calling ``__call__``. - :issue:`8087` by :user:`Alexis Mignon ` - -- Fix a bug where :class:`ensemble.VotingClassifier` raises an error - when a numpy array is passed in for weights. :issue:`7983` by - :user:`Vincent Pham `. - -- Fixed a bug where :func:`tree.export_graphviz` raised an error - when the length of features_names does not match n_features in the decision - tree. :issue:`8512` by :user:`Li Li `. - -Linear, kernelized and related models - -- Fixed a bug where :func:`linear_model.RANSACRegressor.fit` may run until - ``max_iter`` if it finds a large inlier group early. :issue:`8251` by - :user:`aivision2020`. - -- Fixed a bug where :class:`naive_bayes.MultinomialNB` and - :class:`naive_bayes.BernoulliNB` failed when ``alpha=0``. :issue:`5814` by - :user:`Yichuan Liu ` and :user:`Herilalaina Rakotoarison - `. - -- Fixed a bug where :class:`linear_model.LassoLars` does not give - the same result as the LassoLars implementation available - in R (lars library). :issue:`7849` by :user:`Jair Montoya Martinez `. - -- Fixed a bug in :class:`linear_model.RandomizedLasso`, - :class:`linear_model.Lars`, :class:`linear_model.LassoLars`, - :class:`linear_model.LarsCV` and :class:`linear_model.LassoLarsCV`, - where the parameter ``precompute`` was not used consistently across - classes, and some values proposed in the docstring could raise errors. - :issue:`5359` by `Tom Dupre la Tour`_. - -- Fix inconsistent results between :class:`linear_model.RidgeCV` and - :class:`linear_model.Ridge` when using ``normalize=True``. :issue:`9302` - by `Alexandre Gramfort`_. - -- Fix a bug where :func:`linear_model.LassoLars.fit` sometimes - left ``coef_`` as a list, rather than an ndarray. - :issue:`8160` by :user:`CJ Carey `. - -- Fix :func:`linear_model.BayesianRidge.fit` to return - ridge parameter ``alpha_`` and ``lambda_`` consistent with calculated - coefficients ``coef_`` and ``intercept_``. - :issue:`8224` by :user:`Peter Gedeck `. - -- Fixed a bug in :class:`svm.OneClassSVM` where it returned floats instead of - integer classes. :issue:`8676` by :user:`Vathsala Achar `. - -- Fix AIC/BIC criterion computation in :class:`linear_model.LassoLarsIC`. - :issue:`9022` by `Alexandre Gramfort`_ and :user:`Mehmet Basbug `. 
- -- Fixed a memory leak in our LibLinear implementation. :issue:`9024` by - :user:`Sergei Lebedev ` - -- Fix bug where stratified CV splitters did not work with - :class:`linear_model.LassoCV`. :issue:`8973` by - :user:`Paulo Haddad `. - -- Fixed a bug in :class:`gaussian_process.GaussianProcessRegressor` - when the standard deviation and covariance predicted without fit - would fail with a unmeaningful error by default. - :issue:`6573` by :user:`Quazi Marufur Rahman ` and - `Manoj Kumar`_. - -Other predictors - -- Fix :class:`semi_supervised.BaseLabelPropagation` to correctly implement - ``LabelPropagation`` and ``LabelSpreading`` as done in the referenced - papers. :issue:`9239` - by :user:`Andre Ambrosio Boechat `, :user:`Utkarsh Upadhyay - `, and `Joel Nothman`_. - -Decomposition, manifold learning and clustering - -- Fixed the implementation of :class:`manifold.TSNE`: -- ``early_exageration`` parameter had no effect and is now used for the - first 250 optimization iterations. -- Fixed the ``AssertionError: Tree consistency failed`` exception - reported in :issue:`8992`. -- Improve the learning schedule to match the one from the reference - implementation `lvdmaaten/bhtsne `_. - by :user:`Thomas Moreau ` and `Olivier Grisel`_. - -- Fix a bug in :class:`decomposition.LatentDirichletAllocation` - where the ``perplexity`` method was returning incorrect results because - the ``transform`` method returns normalized document topic distributions - as of version 0.18. :issue:`7954` by :user:`Gary Foreman `. - -- Fix output shape and bugs with n_jobs > 1 in - :class:`decomposition.SparseCoder` transform and - :func:`decomposition.sparse_encode` - for one-dimensional data and one component. - This also impacts the output shape of :class:`decomposition.DictionaryLearning`. - :issue:`8086` by `Andreas Müller`_. - -- Fixed the implementation of ``explained_variance_`` - in :class:`decomposition.PCA`, - :class:`decomposition.RandomizedPCA` and - :class:`decomposition.IncrementalPCA`. - :issue:`9105` by `Hanmin Qin `_. - -- Fixed the implementation of noise_variance_ in :class:`decomposition.PCA`. - :issue:`9108` by `Hanmin Qin `_. - -- Fixed a bug where :class:`cluster.DBSCAN` gives incorrect - result when input is a precomputed sparse matrix with initial - rows all zero. :issue:`8306` by :user:`Akshay Gupta ` - -- Fix a bug regarding fitting :class:`cluster.KMeans` with a sparse - array X and initial centroids, where X's means were unnecessarily being - subtracted from the centroids. :issue:`7872` by :user:`Josh Karnofsky `. - -- Fixes to the input validation in :class:`covariance.EllipticEnvelope`. - :issue:`8086` by `Andreas Müller`_. - -- Fixed a bug in :class:`covariance.MinCovDet` where inputting data - that produced a singular covariance matrix would cause the helper method - ``_c_step`` to throw an exception. - :issue:`3367` by :user:`Jeremy Steward ` - -- Fixed a bug in :class:`manifold.TSNE` affecting convergence of the - gradient descent. :issue:`8768` by :user:`David DeTomaso `. - -- Fixed a bug in :class:`manifold.TSNE` where it stored the incorrect - ``kl_divergence_``. :issue:`6507` by :user:`Sebastian Saeger `. - -- Fixed improper scaling in :class:`cross_decomposition.PLSRegression` - with ``scale=True``. :issue:`7819` by :user:`jayzed82 `. - -- :class:`cluster.bicluster.SpectralCoclustering` and - :class:`cluster.bicluster.SpectralBiclustering` ``fit`` method conforms - with API by accepting ``y`` and returning the object. 
:issue:`6126`, - :issue:`7814` by :user:`Laurent Direr ` and :user:`Maniteja - Nandana `. - -- Fix bug where :mod:`mixture` ``sample`` methods did not return as many - samples as requested. :issue:`7702` by :user:`Levi John Wolf `. - -- Fixed the shrinkage implementation in :class:`neighbors.NearestCentroid`. - :issue:`9219` by `Hanmin Qin `_. - -Preprocessing and feature selection - -- For sparse matrices, :func:`preprocessing.normalize` with ``return_norm=True`` - will now raise a ``NotImplementedError`` with 'l1' or 'l2' norm and with - norm 'max' the norms returned will be the same as for dense matrices. - :issue:`7771` by `Ang Lu `_. - -- Fix a bug where :class:`feature_selection.SelectFdr` did not - exactly implement Benjamini-Hochberg procedure. It formerly may have - selected fewer features than it should. - :issue:`7490` by :user:`Peng Meng `. - -- Fixed a bug where :class:`linear_model.RandomizedLasso` and - :class:`linear_model.RandomizedLogisticRegression` breaks for - sparse input. :issue:`8259` by :user:`Aman Dalmia `. - -- Fix a bug where :class:`feature_extraction.FeatureHasher` - mandatorily applied a sparse random projection to the hashed features, - preventing the use of - :class:`feature_extraction.text.HashingVectorizer` in a - pipeline with :class:`feature_extraction.text.TfidfTransformer`. - :issue:`7565` by :user:`Roman Yurchak `. - -- Fix a bug where :class:`feature_selection.mutual_info_regression` did not - correctly use ``n_neighbors``. :issue:`8181` by :user:`Guillaume Lemaitre - `. - -Model evaluation and meta-estimators - -- Fixed a bug where :func:`model_selection.BaseSearchCV.inverse_transform` - returns ``self.best_estimator_.transform()`` instead of - ``self.best_estimator_.inverse_transform()``. - :issue:`8344` by :user:`Akshay Gupta ` and :user:`Rasmus Eriksson `. - -- Added ``classes_`` attribute to :class:`model_selection.GridSearchCV`, - :class:`model_selection.RandomizedSearchCV`, :class:`grid_search.GridSearchCV`, - and :class:`grid_search.RandomizedSearchCV` that matches the ``classes_`` - attribute of ``best_estimator_``. :issue:`7661` and :issue:`8295` - by :user:`Alyssa Batula `, :user:`Dylan Werner-Meier `, - and :user:`Stephen Hoover `. - -- Fixed a bug where :func:`model_selection.validation_curve` - reused the same estimator for each parameter value. - :issue:`7365` by :user:`Aleksandr Sandrovskii `. - -- :func:`model_selection.permutation_test_score` now works with Pandas - types. :issue:`5697` by :user:`Stijn Tonk `. - -- Several fixes to input validation in - :class:`multiclass.OutputCodeClassifier` - :issue:`8086` by `Andreas Müller`_. - -- :class:`multiclass.OneVsOneClassifier`'s ``partial_fit`` now ensures all - classes are provided up-front. :issue:`6250` by - :user:`Asish Panda `. - -- Fix :func:`multioutput.MultiOutputClassifier.predict_proba` to return a - list of 2d arrays, rather than a 3d array. In the case where different - target columns had different numbers of classes, a ``ValueError`` would be - raised on trying to stack matrices with different dimensions. - :issue:`8093` by :user:`Peter Bull `. - -- Cross validation now works with Pandas datatypes that that have a - read-only index. :issue:`9507` by `Loic Esteve`_. - -Metrics - -- :func:`metrics.average_precision_score` no longer linearly - interpolates between operating points, and instead weighs precisions - by the change in recall since the last operating point, as per the - `Wikipedia entry `_. - (`#7356 `_). By - :user:`Nick Dingwall ` and `Gael Varoquaux`_. 

- Fix a bug in :func:`metrics.classification._check_targets` which would return ``'binary'`` if ``y_true`` and ``y_pred`` were both ``'binary'`` but the union of ``y_true`` and ``y_pred`` was ``'multiclass'``. :issue:`8377` by `Loic Esteve`_.

- Fixed an integer overflow bug in :func:`metrics.confusion_matrix` and hence :func:`metrics.cohen_kappa_score`. :issue:`8354`, :issue:`7929` by `Joel Nothman`_ and :user:`Jon Crall `.

- Fixed passing of the ``gamma`` parameter to the ``chi2`` kernel in :func:`metrics.pairwise.pairwise_kernels`. :issue:`5211` by :user:`Nick Rhinehart `, :user:`Saurabh Bansod ` and `Andreas Müller`_.

Miscellaneous

- Fixed a bug where :func:`datasets.make_classification` failed when generating more than 30 features. :issue:`8159` by :user:`Herilalaina Rakotoarison `.

- Fixed a bug where :func:`datasets.make_moons` gave an incorrect result when ``n_samples`` is odd. :issue:`8198` by :user:`Josh Levy `.

- Some ``fetch_`` functions in :mod:`datasets` were ignoring the ``download_if_missing`` keyword. :issue:`7944` by :user:`Ralf Gommers `.

- Fix estimators to accept a ``sample_weight`` parameter of type ``pandas.Series`` in their ``fit`` function. :issue:`7825` by `Kathleen Chen`_.

- Fix a bug in cases where ``numpy.cumsum`` may be numerically unstable, raising an exception if instability is identified. :issue:`7376` and :issue:`7331` by `Joel Nothman`_ and :user:`yangarbiter`.

- Fix a bug where :meth:`base.BaseEstimator.__getstate__` obstructed pickling customizations of child-classes, when used in a multiple inheritance context. :issue:`8316` by :user:`Holger Peters `.

- Update Sphinx-Gallery from 0.1.4 to 0.1.7 for resolving links in documentation builds with Sphinx > 1.5. :issue:`8010`, :issue:`7986` by :user:`Oscar Najera `

- Add a ``data_home`` parameter to :func:`sklearn.datasets.fetch_kddcup99`. :issue:`9289` by `Loic Esteve`_.

- Fix dataset loaders using the Python 3 version of makedirs to also work in Python 2. :issue:`9284` by :user:`Sebastin Santy `.

- Several minor issues were fixed with thanks to the alerts of `lgtm.com <http://lgtm.com>`_. :issue:`9278` by :user:`Jean Helie `, among others.

API changes summary
-------------------

Trees and ensembles

- Gradient boosting base models are no longer estimators. By `Andreas Müller`_.

- All tree based estimators now accept a ``min_impurity_decrease`` parameter in lieu of ``min_impurity_split``, which is now deprecated. With ``min_impurity_decrease``, a node is split only if the split induces a weighted decrease in impurity of at least ``min_impurity_decrease``. :issue:`8449` by `Raghav RV`_.

Linear, kernelized and related models

- The ``n_iter`` parameter is deprecated in :class:`linear_model.SGDClassifier`, :class:`linear_model.SGDRegressor`, :class:`linear_model.PassiveAggressiveClassifier`, :class:`linear_model.PassiveAggressiveRegressor` and :class:`linear_model.Perceptron`. By `Tom Dupre la Tour`_.

Other predictors

- :class:`neighbors.LSHForest` has been deprecated and will be removed in 0.21 due to poor performance. :issue:`9078` by :user:`Laurent Direr `.

- :class:`neighbors.NearestCentroid` no longer purports to support ``metric='precomputed'``, which now raises an error. :issue:`8515` by :user:`Sergul Aydore `.

- The ``alpha`` parameter of :class:`semi_supervised.LabelPropagation` now has no effect and is deprecated, to be removed in 0.21. :issue:`9239` by :user:`Andre Ambrosio Boechat `, :user:`Utkarsh Upadhyay `, and `Joel Nothman`_.

Decomposition, manifold learning and clustering

- Deprecate the ``doc_topic_distr`` argument of the ``perplexity`` method in :class:`decomposition.LatentDirichletAllocation` because the user no longer has access to the unnormalized document topic distribution needed for the perplexity calculation. :issue:`7954` by :user:`Gary Foreman `.

- The ``n_topics`` parameter of :class:`decomposition.LatentDirichletAllocation` has been renamed to ``n_components`` and will be removed in version 0.21. :issue:`8922` by :user:`Attractadore`.

- :meth:`decomposition.SparsePCA.transform`'s ``ridge_alpha`` parameter is deprecated in preference for the class parameter. :issue:`8137` by :user:`Naoya Kanai `.

- :class:`cluster.DBSCAN` now has a ``metric_params`` parameter. :issue:`8139` by :user:`Naoya Kanai `.

Preprocessing and feature selection

- :class:`feature_selection.SelectFromModel` now has a ``partial_fit`` method only if the underlying estimator does. By `Andreas Müller`_.

- :class:`feature_selection.SelectFromModel` now validates the ``threshold`` parameter and sets the ``threshold_`` attribute during the call to ``fit``, and no longer during the call to ``transform``. By `Andreas Müller`_.

- The ``non_negative`` parameter in :class:`feature_extraction.FeatureHasher` has been deprecated, and replaced with a more principled alternative, ``alternate_sign``. :issue:`7565` by :user:`Roman Yurchak `.

- :class:`linear_model.RandomizedLogisticRegression` and :class:`linear_model.RandomizedLasso` have been deprecated and will be removed in version 0.21. :issue:`8995` by :user:`Ramana.S `.

Model evaluation and meta-estimators

- Deprecate the ``fit_params`` constructor input to :class:`model_selection.GridSearchCV` and :class:`model_selection.RandomizedSearchCV` in favor of passing keyword parameters to the ``fit`` methods of those classes. Data-dependent parameters needed for model training should be passed as keyword arguments to ``fit``, and conforming to this convention will allow the hyperparameter selection classes to be used with tools such as :func:`model_selection.cross_val_predict`. :issue:`2879` by :user:`Stephen Hoover `.

- In version 0.21, the default behavior of splitters that use the ``test_size`` and ``train_size`` parameters will change, such that specifying ``train_size`` alone will cause ``test_size`` to be the remainder. :issue:`7459` by :user:`Nelson Liu `.

- :class:`multiclass.OneVsRestClassifier` now has ``partial_fit``, ``decision_function`` and ``predict_proba`` methods only when the underlying estimator does. :issue:`7812` by `Andreas Müller`_ and :user:`Mikhail Korobov `.

- :class:`multiclass.OneVsRestClassifier` now has a ``partial_fit`` method only if the underlying estimator does. By `Andreas Müller`_.

- The ``decision_function`` output shape for binary classification in :class:`multiclass.OneVsRestClassifier` and :class:`multiclass.OneVsOneClassifier` is now ``(n_samples,)`` to conform to scikit-learn conventions. :issue:`9100` by `Andreas Müller`_.

- The :func:`multioutput.MultiOutputClassifier.predict_proba` function used to return a 3d array (``n_samples``, ``n_classes``, ``n_outputs``). In the case where different target columns had different numbers of classes, a ``ValueError`` would be raised on trying to stack matrices with different dimensions. This function now returns a list of arrays where the length of the list is ``n_outputs``, and each array is (``n_samples``, ``n_classes``) for that particular output. :issue:`8093` by :user:`Peter Bull `.

- The ``named_steps`` attribute of :class:`pipeline.Pipeline` was changed from a ``dict`` to a :class:`utils.Bunch` to enable tab completion in interactive environments. In the case of a conflict between a step name and an existing ``dict`` attribute, ``dict`` behavior is prioritized. :issue:`8481` by :user:`Herilalaina Rakotoarison `.

Miscellaneous

- Deprecate the ``y`` parameter in ``transform`` and ``inverse_transform``. These methods should not accept a ``y`` parameter, as they are used at prediction time. :issue:`8174` by :user:`Tahar Zanouda `, `Alexandre Gramfort`_ and `Raghav RV`_.

- SciPy >= 0.13.3 and NumPy >= 1.8.2 are now the minimum supported versions for scikit-learn. The following backported functions in :mod:`utils` have been removed or deprecated accordingly. :issue:`8854` and :issue:`8874` by :user:`Naoya Kanai `

- The ``store_covariances`` and ``covariances_`` parameters of :class:`discriminant_analysis.QuadraticDiscriminantAnalysis` have been renamed to ``store_covariance`` and ``covariance_`` to be consistent with the corresponding parameter names of :class:`discriminant_analysis.LinearDiscriminantAnalysis`. They will be removed in version 0.21. :issue:`7998` by :user:`Jiacheng `

  Removed in 0.19:

  - ``utils.fixes.argpartition``
  - ``utils.fixes.array_equal``
  - ``utils.fixes.astype``
  - ``utils.fixes.bincount``
  - ``utils.fixes.expit``
  - ``utils.fixes.frombuffer_empty``
  - ``utils.fixes.in1d``
  - ``utils.fixes.norm``
  - ``utils.fixes.rankdata``
  - ``utils.fixes.safe_copy``

  Deprecated in 0.19, to be removed in 0.21:

  - ``utils.arpack.eigs``
  - ``utils.arpack.eigsh``
  - ``utils.arpack.svds``
  - ``utils.extmath.fast_dot``
  - ``utils.extmath.logsumexp``
  - ``utils.extmath.norm``
  - ``utils.extmath.pinvh``
  - ``utils.graph.graph_laplacian``
  - ``utils.random.choice``
  - ``utils.sparsetools.connected_components``
  - ``utils.stats.rankdata``

- Estimators with both methods ``decision_function`` and ``predict_proba`` are now required to have a monotonic relation between them. The method ``check_decision_proba_consistency`` has been added in ``utils.estimator_checks`` to check their consistency. :issue:`7578` by :user:`Shubham Bhardwaj `

- All checks in ``utils.estimator_checks``, in particular :func:`utils.estimator_checks.check_estimator`, now accept estimator instances. Most other checks do not accept estimator classes any more. :issue:`9019` by `Andreas Müller`_.

- Ensure that estimators' attributes ending with ``_`` are not set in the constructor but only in the ``fit`` method. Most notably, ensemble estimators (deriving from :class:`ensemble.BaseEnsemble`) now only have ``self.estimators_`` available after ``fit``. :issue:`7464` by `Lars Buitinck`_ and `Loic Esteve`_.
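
As a small illustration of the last convention (the estimator chosen here is arbitrary), attributes ending with ``_``, such as ``estimators_``, exist only once ``fit`` has been called::

    from sklearn.datasets import make_classification
    from sklearn.ensemble import BaggingClassifier

    X, y = make_classification(n_samples=20, random_state=0)
    clf = BaggingClassifier(n_estimators=3, random_state=0)

    print(hasattr(clf, "estimators_"))  # False: nothing is fitted in __init__
    clf.fit(X, y)
    print(hasattr(clf, "estimators_"))  # True: set during fit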


Code and Documentation Contributors
-----------------------------------

Thanks to everyone who has contributed to the maintenance and improvement of the project since version 0.18, including:

Joel Nothman, Loic Esteve, Andreas Mueller, Guillaume Lemaitre, Olivier Grisel, Hanmin Qin, Raghav RV, Alexandre Gramfort, themrmax, Aman Dalmia, Gael Varoquaux, Naoya Kanai, Tom Dupré la Tour, Rishikesh, Nelson Liu, Taehoon Lee, Nelle Varoquaux, Aashil, Mikhail Korobov, Sebastin Santy, Joan Massich, Roman Yurchak, RAKOTOARISON Herilalaina, Thierry Guillemot, Alexandre Abadie, Carol Willing, Balakumaran Manoharan, Josh Karnofsky, Vlad Niculae, Utkarsh Upadhyay, Dmitry Petrov, Minghui Liu, Srivatsan, Vincent Pham, Albert Thomas, Jake VanderPlas, Attractadore, JC Liu, alexandercbooth, chkoar, Óscar Nájera, Aarshay Jain, Kyle Gilliam, Ramana Subramanyam, CJ Carey, Clement Joudet, David Robles, He Chen, Joris Van den Bossche, Karan Desai, Katie Luangkote, Leland McInnes, Maniteja Nandana, Michele Lacchia, Sergei Lebedev, Shubham Bhardwaj, akshay0724, omtcyfz, rickiepark, waterponey, Vathsala Achar, jbDelafosse, Ralf Gommers, Ekaterina Krivich, Vivek Kumar, Ishank Gulati, Dave Elliott, ldirer, Reiichiro Nakano, Levi John Wolf, Mathieu Blondel, Sid Kapur, Dougal J. Sutherland, midinas, mikebenfield, Sourav Singh, Aseem Bansal, Ibraim Ganiev, Stephen Hoover, AishwaryaRK, Steven C. Howell, Gary Foreman, Neeraj Gangwar, Tahar, Jon Crall, dokato, Kathy Chen, ferria, Thomas Moreau, Charlie Brummitt, Nicolas Goix, Adam Kleczewski, Sam Shleifer, Nikita Singh, Basil Beirouti, Giorgio Patrini, Manoj Kumar, Rafael Possas, James Bourbeau, James A. Bednar, Janine Harper, Jaye, Jean Helie, Jeremy Steward, Artsiom, John Wei, Jonathan LIgo, Jonathan Rahn, seanpwilliams, Arthur Mensch, Josh Levy, Julian Kuhlmann, Julien Aubert, Jörn Hees, Kai, shivamgargsya, Kat Hempstalk, Kaushik Lakshmikanth, Kennedy, Kenneth Lyons, Kenneth Myers, Kevin Yap, Kirill Bobyrev, Konstantin Podshumok, Arthur Imbert, Lee Murray, toastedcornflakes, Lera, Li Li, Arthur Douillard, Mainak Jas, tobycheese, Manraj Singh, Manvendra Singh, Marc Meketon, MarcoFalke, Matthew Brett, Matthias Gilch, Mehul Ahuja, Melanie Goetz, Meng, Peng, Michael Dezube, Michal Baumgartner, vibrantabhi19, Artem Golubin, Milen Paskov, Antonin Carette, Morikko, MrMjauh, NALEPA Emmanuel, Namiya, Antoine Wendlinger, Narine Kokhlikyan, NarineK, Nate Guerin, Angus Williams, Ang Lu, Nicole Vavrova, Nitish Pandey, Okhlopkov Daniil Olegovich, Andy Craze, Om Prakash, Parminder Singh, Patrick Carlson, Patrick Pei, Paul Ganssle, Paulo Haddad, Paweł Lorek, Peng Yu, Pete Bachant, Peter Bull, Peter Csizsek, Peter Wang, Pieter Arthur de Jong, Ping-Yao, Chang, Preston Parry, Puneet Mathur, Quentin Hibon, Andrew Smith, Andrew Jackson, 1kastner, Rameshwar Bhaskaran, Rebecca Bilbro, Remi Rampin, Andrea Esuli, Rob Hall, Robert Bradshaw, Romain Brault, Aman Pratik, Ruifeng Zheng, Russell Smith, Sachin Agarwal, Sailesh Choyal, Samson Tan, Samuël Weber, Sarah Brown, Sebastian Pölsterl, Sebastian Raschka, Sebastian Saeger, Alyssa Batula, Abhyuday Pratap Singh, Sergey Feldman, Sergul Aydore, Sharan Yalburgi, willduan, Siddharth Gupta, Sri Krishna, Almer, Stijn Tonk, Allen Riddell, Theofilos Papapanagiotou, Alison, Alexis Mignon, Tommy Boucher, Tommy Löfstedt, Toshihiro Kamishima, Tyler Folkman, Tyler Lanigan, Alexander Junge, Varun Shenoy, Victor Poughon, Vilhelm von Ehrenheim, Aleksandr Sandrovskii, Alan Yee, Vlasios Vasileiou, Warut Vijitbenjaronk, Yang Zhang, Yaroslav Halchenko, Yichuan Liu, Yuichi Fujikawa, affanv14, aivision2020, xor, andreh7, brady salz, campustrampus, Agamemnon Krasoulis, ditenberg, elena-sharova, filipj8, fukatani, gedeck, guiniol, guoci, hakaa1, hongkahjun, i-am-xhy, jakirkham, jaroslaw-weber, jayzed82, jeroko, jmontoyam, jonathan.striebel, josephsalmon, jschendel, leereeves, martin-hahn, mathurinm, mehak-sachdeva, mlewis1729, mlliou112, mthorrell, ndingwall, nuffe, yangarbiter, plagree, pldtc325, Breno Freitas, Brett Olsen, Brian A. Alfano, Brian Burns, polmauri, Brandon Carter, Charlton Austin, Chayant T15h, Chinmaya Pancholi, Christian Danielsen, Chung Yen, Chyi-Kwei Yau, pravarmahajan, DOHMATOB Elvis, Daniel LeJeune, Daniel Hnyk, Darius Morawiec, David DeTomaso, David Gasquez, David Haberthür, David Heryanto, David Kirkby, David Nicholson, rashchedrin, Deborah Gertrude Digges, Denis Engemann, Devansh D, Dickson, Bob Baxley, Don86, E. Lynch-Klarup, Ed Rogers, Elizabeth Ferriss, Ellen-Co2, Fabian Egli, Fang-Chieh Chou, Bing Tian Dai, Greg Stupp, Grzegorz Szpak, Bertrand Thirion, Hadrien Bertrand, Harizo Rajaona, zxcvbnius, Henry Lin, Holger Peters, Icyblade Dai, Igor Andriushchenko, Ilya, Isaac Laughlin, Iván Vallés, Aurélien Bellet, JPFrancoia, Jacob Schreiber, Asish Mahapatra

.. _changes_0_18_2:

Version 0.18.2
==============

**June 20, 2017**

.. topic:: Last release with Python 2.6 support

    Scikit-learn 0.18 is the last major release of scikit-learn to support Python 2.6. Later versions of scikit-learn will require Python 2.7 or above.

Changelog
---------

- Fixes for compatibility with NumPy 1.13.0: :issue:`7946` :issue:`8355` by `Loic Esteve`_.

- Minor compatibility changes in the examples :issue:`9010` :issue:`8040` :issue:`9149`.

Code Contributors
-----------------
Aman Dalmia, Loic Esteve, Nate Guerin, Sergei Lebedev

.. _changes_0_18_1:

Version 0.18.1
==============

**November 11, 2016**

Changelog
---------

Enhancements
............

- Improved ``sample_without_replacement`` speed by utilizing numpy.random.permutation for most cases. As a result, samples may differ in this release for a fixed random state. Affected estimators:

  - :class:`ensemble.BaggingClassifier`
  - :class:`ensemble.BaggingRegressor`
  - :class:`linear_model.RANSACRegressor`
  - :class:`model_selection.RandomizedSearchCV`
  - :class:`random_projection.SparseRandomProjection`

  This also affects the :meth:`datasets.make_classification` method.

Bug fixes
.........

- Fix issue where the ``min_grad_norm`` and ``n_iter_without_progress`` parameters were not being utilised by :class:`manifold.TSNE`. :issue:`6497` by :user:`Sebastian Säger `

- Fix a bug in :class:`svm.SVC`'s decision values when ``decision_function_shape`` is ``ovr``. :class:`svm.SVC`'s ``decision_function`` was incorrect from versions 0.17.0 through 0.18.0. :issue:`7724` by `Bing Tian Dai`_

- The ``explained_variance_ratio`` attribute of :class:`discriminant_analysis.LinearDiscriminantAnalysis` calculated with the SVD and Eigen solvers is now of the same length. :issue:`7632` by :user:`JPFrancoia `

- Fixes an issue in :ref:`univariate_feature_selection` where score functions were not accepting multi-label targets. :issue:`7676` by :user:`Mohammed Affan `

- Fixed setting parameters when calling ``fit`` multiple times on :class:`feature_selection.SelectFromModel`. :issue:`7756` by `Andreas Müller`_

- Fixes an issue in the ``partial_fit`` method of :class:`multiclass.OneVsRestClassifier` when the number of classes used in ``partial_fit`` was less than the total number of classes in the data. :issue:`7786` by `Srivatsan Ramesh`_

- Fixes an issue in :class:`calibration.CalibratedClassifierCV` where the probabilities of each class for a sample did not sum to 1; ``CalibratedClassifierCV`` now also handles the case where the training set has fewer classes than the total data. :issue:`7799` by `Srivatsan Ramesh`_

- Fix a bug where :class:`sklearn.feature_selection.SelectFdr` did not exactly implement the Benjamini-Hochberg procedure. It formerly may have selected fewer features than it should. :issue:`7490` by :user:`Peng Meng `.

- :class:`sklearn.manifold.LocallyLinearEmbedding` now correctly handles integer inputs. :issue:`6282` by `Jake Vanderplas`_.

- The ``min_weight_fraction_leaf`` parameter of tree-based classifiers and regressors now assumes uniform sample weights by default if the ``sample_weight`` argument is not passed to the ``fit`` function. Previously, the parameter was silently ignored. :issue:`7301` by :user:`Nelson Liu `.

- Fixed a numerical issue with :class:`linear_model.RidgeCV` on centered data when ``n_features > n_samples``. :issue:`6178` by `Bertrand Thirion`_

- Tree splitting criterion classes' cloning/pickling is now memory safe. :issue:`7680` by :user:`Ibraim Ganiev `.

- Fixed a bug where :class:`decomposition.NMF` sets its ``n_iters_`` attribute in ``transform()``. :issue:`7553` by :user:`Ekaterina Krivich `.

- :class:`sklearn.linear_model.LogisticRegressionCV` now correctly handles string labels. :issue:`5874` by `Raghav RV`_.

- Fixed a bug where :func:`sklearn.model_selection.train_test_split` raised an error when ``stratify`` is a list of string labels. :issue:`7593` by `Raghav RV`_.

- Fixed a bug where :class:`sklearn.model_selection.GridSearchCV` and :class:`sklearn.model_selection.RandomizedSearchCV` were not pickleable because of a pickling bug in ``np.ma.MaskedArray``. :issue:`7594` by `Raghav RV`_.

- All cross-validation utilities in :mod:`sklearn.model_selection` now permit one-time cross-validation splitters for the ``cv`` parameter. Non-deterministic cross-validation splitters (where multiple calls to ``split`` produce dissimilar splits) can also be used as the ``cv`` parameter. :class:`sklearn.model_selection.GridSearchCV` will cross-validate each parameter setting on the splits produced by the first ``split`` call to the cross-validation splitter (see the short sketch after this list). :issue:`7660` by `Raghav RV`_.

- Fix bug where :meth:`preprocessing.MultiLabelBinarizer.fit_transform` returned an invalid CSR matrix. :issue:`7750` by :user:`CJ Carey `.

- Fixed a bug where :func:`metrics.pairwise.cosine_distances` could return a small negative distance. :issue:`7732` by :user:`Artsion `.

API changes summary
-------------------

Trees and forests

- The ``min_weight_fraction_leaf`` parameter of tree-based classifiers and regressors now assumes uniform sample weights by default if the ``sample_weight`` argument is not passed to the ``fit`` function. Previously, the parameter was silently ignored. :issue:`7301` by :user:`Nelson Liu `.

- Tree splitting criterion classes' cloning/pickling is now memory safe. :issue:`7680` by :user:`Ibraim Ganiev `.
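
A short sketch of passing a one-time (generator) splitter as ``cv``, as permitted by :issue:`7660` above; the dataset and parameter grid are illustrative only::

    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV, KFold
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=60, random_state=0)

    # ``split`` returns a one-time generator; GridSearchCV evaluates every
    # parameter setting on the splits produced by this single ``split`` call.
    cv_gen = KFold(n_splits=3).split(X, y)

    search = GridSearchCV(SVC(), {"C": [0.1, 1.0]}, cv=cv_gen)
    search.fit(X, y)
    print(search.best_params_)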

Linear, kernelized and related models

- The length of the ``explained_variance_ratio`` attribute of :class:`discriminant_analysis.LinearDiscriminantAnalysis` changed for both the Eigen and SVD solvers. The attribute now has a length of ``min(n_components, n_classes - 1)``. :issue:`7632` by :user:`JPFrancoia `

- Fixed a numerical issue with :class:`linear_model.RidgeCV` on centered data when ``n_features > n_samples``. :issue:`6178` by `Bertrand Thirion`_

.. _changes_0_18:

Version 0.18
============

**September 28, 2016**

.. topic:: Last release with Python 2.6 support

    Scikit-learn 0.18 will be the last version of scikit-learn to support Python 2.6. Later versions of scikit-learn will require Python 2.7 or above.

.. _model_selection_changes:

Model Selection Enhancements and API Changes
--------------------------------------------

- **The model_selection module**

  The new module :mod:`sklearn.model_selection`, which groups together the functionalities of the former :mod:`sklearn.cross_validation`, :mod:`sklearn.grid_search` and :mod:`sklearn.learning_curve`, introduces new possibilities such as nested cross-validation and better manipulation of parameter searches with Pandas.

  Many things will stay the same but there are some key differences. Read below to know more about the changes.

- **Data-independent CV splitters enabling nested cross-validation**

  The new cross-validation splitters, defined in :mod:`sklearn.model_selection`, are no longer initialized with any data-dependent parameters such as ``y``. Instead they expose a :func:`split` method that takes in the data and yields a generator for the different splits.

  This change makes it possible to use the cross-validation splitters to perform nested cross-validation, facilitated by the :class:`model_selection.GridSearchCV` and :class:`model_selection.RandomizedSearchCV` utilities.

- **The enhanced cv_results_ attribute**

  The new ``cv_results_`` attribute (of :class:`model_selection.GridSearchCV` and :class:`model_selection.RandomizedSearchCV`), introduced in lieu of the ``grid_scores_`` attribute, is a dict of 1D arrays with elements in each array corresponding to the parameter settings (i.e. search candidates).

  The ``cv_results_`` dict can be easily imported into ``pandas`` as a ``DataFrame`` for exploring the search results.

  The ``cv_results_`` arrays include scores for each cross-validation split (with keys such as ``'split0_test_score'``), as well as their mean (``'mean_test_score'``) and standard deviation (``'std_test_score'``).

  The ranks for the search candidates (based on their mean cross-validation score) are available at ``cv_results_['rank_test_score']``.

  The values for each parameter are stored separately as numpy masked object arrays, where the value for a given search candidate is masked if the corresponding parameter is not applicable. Additionally, a list of all the parameter dicts is stored at ``cv_results_['params']``.

- **Parameters n_folds and n_iter renamed to n_splits**

  Some parameter names have changed: the ``n_folds`` parameter in the new :class:`model_selection.KFold`, :class:`model_selection.GroupKFold` (see below for the name change), and :class:`model_selection.StratifiedKFold` is now renamed to ``n_splits``. The ``n_iter`` parameter in :class:`model_selection.ShuffleSplit`, the new class :class:`model_selection.GroupShuffleSplit` and :class:`model_selection.StratifiedShuffleSplit` is now renamed to ``n_splits``.

- **Rename of splitter classes which accept group labels along with data**

  The cross-validation splitters ``LabelKFold``, ``LabelShuffleSplit``, ``LeaveOneLabelOut`` and ``LeavePLabelOut`` have been renamed to :class:`model_selection.GroupKFold`, :class:`model_selection.GroupShuffleSplit`, :class:`model_selection.LeaveOneGroupOut` and :class:`model_selection.LeavePGroupsOut` respectively.

  Note the change from singular to plural form in :class:`model_selection.LeavePGroupsOut`.

- **Fit parameter labels renamed to groups**

  The ``labels`` parameter in the :func:`split` method of the newly renamed splitters :class:`model_selection.GroupKFold`, :class:`model_selection.LeaveOneGroupOut`, :class:`model_selection.LeavePGroupsOut` and :class:`model_selection.GroupShuffleSplit` is renamed to ``groups``, following the new nomenclature of their class names.

- **Parameter n_labels renamed to n_groups**

  The parameter ``n_labels`` in the newly renamed :class:`model_selection.LeavePGroupsOut` is changed to ``n_groups``.

- **Training scores and timing information**

  ``cv_results_`` also includes the training scores for each cross-validation split (with keys such as ``'split0_train_score'``), as well as their mean (``'mean_train_score'``) and standard deviation (``'std_train_score'``). To avoid the cost of evaluating training scores, set ``return_train_score=False``.

  Additionally, the mean and standard deviation of the times taken to split, train and score the model across all the cross-validation splits are available at the keys ``'mean_time'`` and ``'std_time'`` respectively.

Changelog
---------

New features
............

Classifiers and Regressors

- The Gaussian Process module has been reimplemented and now offers classification and regression estimators through :class:`gaussian_process.GaussianProcessClassifier` and :class:`gaussian_process.GaussianProcessRegressor`. Among other things, the new implementation supports kernel engineering, gradient-based hyperparameter optimization and sampling of functions from the GP prior and GP posterior. Extensive documentation and examples are provided. By `Jan Hendrik Metzen`_.

- Added a new supervised learning algorithm: :ref:`Multi-layer Perceptron `. :issue:`3204` by :user:`Issam H. Laradji `

- Added :class:`linear_model.HuberRegressor`, a linear model robust to outliers. :issue:`5291` by `Manoj Kumar`_.

- Added the :class:`multioutput.MultiOutputRegressor` meta-estimator. It converts single output regressors to multi-output regressors by fitting one regressor per output. By :user:`Tim Head `.

Other estimators

- The new :class:`mixture.GaussianMixture` and :class:`mixture.BayesianGaussianMixture` replace the former mixture models, employing faster inference for sounder results. :issue:`7295` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.

- Class :class:`decomposition.RandomizedPCA` is now factored into :class:`decomposition.PCA` and is available by calling it with the parameter ``svd_solver='randomized'``. The default number of ``n_iter`` for ``'randomized'`` has changed to 4. The old behavior of PCA is recovered by ``svd_solver='full'``. An additional solver calls ``arpack`` and performs a truncated (non-randomized) SVD.
  By default, the best solver is selected depending on the size of the input and the number of components requested. :issue:`5299` by :user:`Giorgio Patrini `.

- Added two functions for mutual information estimation: :func:`feature_selection.mutual_info_classif` and :func:`feature_selection.mutual_info_regression`. These functions can be used in :class:`feature_selection.SelectKBest` and :class:`feature_selection.SelectPercentile` as score functions. By :user:`Andrea Bravi ` and :user:`Nikolay Mayorov `.

- Added the :class:`ensemble.IsolationForest` class for anomaly detection based on random forests. By `Nicolas Goix`_.

- Added ``algorithm="elkan"`` to :class:`cluster.KMeans`, implementing Elkan's fast K-Means algorithm. By `Andreas Müller`_.

Model selection and evaluation

- Added :func:`metrics.cluster.fowlkes_mallows_score`, the Fowlkes-Mallows Index, which measures the similarity of two clusterings of a set of points. By :user:`Arnaud Fouchet ` and :user:`Thierry Guillemot `.

- Added :func:`metrics.calinski_harabaz_score`, which computes the Calinski and Harabaz score to evaluate the resulting clustering of a set of points. By :user:`Arnaud Fouchet ` and :user:`Thierry Guillemot `.

- Added a new cross-validation splitter :class:`model_selection.TimeSeriesSplit` to handle time series data. :issue:`6586` by :user:`YenChen Lin `

- The cross-validation iterators are replaced by cross-validation splitters available from :mod:`sklearn.model_selection`, allowing for nested cross-validation. See :ref:`model_selection_changes` for more information. :issue:`4294` by `Raghav RV`_.

Enhancements
............

Trees and ensembles

- Added a new splitting criterion for :class:`tree.DecisionTreeRegressor`, the mean absolute error. This criterion can also be used in :class:`ensemble.ExtraTreesRegressor`, :class:`ensemble.RandomForestRegressor`, and the gradient boosting estimators. :issue:`6667` by :user:`Nelson Liu `.

- Added a weighted impurity-based early stopping criterion for decision tree growth. :issue:`6954` by :user:`Nelson Liu `

- The random forest, extra tree and decision tree estimators now have a method ``decision_path`` which returns the decision path of samples in the tree. By `Arnaud Joly`_.

- A new example has been added unveiling the decision tree structure. By `Arnaud Joly`_.

- Random forest, extra trees, decision trees and gradient boosting estimators now accept the parameters ``min_samples_split`` and ``min_samples_leaf`` provided as a percentage of the training samples. By :user:`yelite ` and `Arnaud Joly`_.

- Gradient boosting estimators accept the parameter ``criterion`` to specify the splitting criterion used when building decision trees. :issue:`6667` by :user:`Nelson Liu `.

- The memory footprint is reduced (sometimes greatly) for :class:`ensemble.bagging.BaseBagging` and classes that inherit from it, i.e., :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, and :class:`ensemble.IsolationForest`, by dynamically generating the attribute ``estimators_samples_`` only when it is needed. By :user:`David Staub `.

- Added ``n_jobs`` and ``sample_weight`` parameters for :class:`ensemble.VotingClassifier` to fit underlying estimators in parallel. :issue:`5805` by :user:`Ibraim Ganiev `.

Linear, kernelized and related models

- In :class:`linear_model.LogisticRegression`, the SAG solver is now available in the multinomial case. :issue:`5251` by `Tom Dupre la Tour`_.

- :class:`linear_model.RANSACRegressor`, :class:`svm.LinearSVC` and :class:`svm.LinearSVR` now support ``sample_weight``. By :user:`Imaculate `.

- Added the parameter ``loss`` to :class:`linear_model.RANSACRegressor` to measure the error on the samples for every trial. By `Manoj Kumar`_.

- Prediction of out-of-sample events with Isotonic Regression (:class:`isotonic.IsotonicRegression`) is now much faster (over 1000x in tests with synthetic data). By :user:`Jonathan Arfa `.

- Isotonic regression (:class:`isotonic.IsotonicRegression`) now uses a better algorithm to avoid `O(n^2)` behavior in pathological cases, and is also generally faster (:issue:`6691`). By `Antony Lee`_.

- :class:`naive_bayes.GaussianNB` now accepts data-independent class-priors through the parameter ``priors``. By :user:`Guillaume Lemaitre `.

- :class:`linear_model.ElasticNet` and :class:`linear_model.Lasso` now work with ``np.float32`` input data without converting it into ``np.float64``. This reduces memory consumption. :issue:`6913` by :user:`YenChen Lin `.

- :class:`semi_supervised.LabelPropagation` and :class:`semi_supervised.LabelSpreading` now accept arbitrary kernel functions in addition to the strings ``knn`` and ``rbf``. :issue:`5762` by :user:`Utkarsh Upadhyay `.

Decomposition, manifold learning and clustering

- Added an ``inverse_transform`` function to :class:`decomposition.NMF` to compute the data matrix of the original shape. By :user:`Anish Shah `.

- :class:`cluster.KMeans` and :class:`cluster.MiniBatchKMeans` now work with ``np.float32`` and ``np.float64`` input data without converting it. This allows reducing memory consumption by using ``np.float32``. :issue:`6846` by :user:`Sebastian Säger ` and :user:`YenChen Lin `.

Preprocessing and feature selection

- :class:`preprocessing.RobustScaler` now accepts a ``quantile_range`` parameter. :issue:`5929` by :user:`Konstantin Podshumok `.

- :class:`feature_extraction.FeatureHasher` now accepts string values. :issue:`6173` by :user:`Ryad Zenine ` and :user:`Devashish Deshpande `.

- Keyword arguments can now be supplied to ``func`` in :class:`preprocessing.FunctionTransformer` by means of the ``kw_args`` parameter. By `Brian McFee`_.

- :class:`feature_selection.SelectKBest` and :class:`feature_selection.SelectPercentile` now accept score functions that take X, y as input and return only the scores. By :user:`Nikolay Mayorov `.

Model evaluation and meta-estimators

- :class:`multiclass.OneVsOneClassifier` and :class:`multiclass.OneVsRestClassifier` now support ``partial_fit``. By :user:`Asish Panda ` and :user:`Philipp Dowling `.

- Added support for substituting or disabling :class:`pipeline.Pipeline` and :class:`pipeline.FeatureUnion` components using the ``set_params`` interface that powers :mod:`sklearn.grid_search`. See :ref:`sphx_glr_auto_examples_plot_compare_reduction.py`. By `Joel Nothman`_ and :user:`Robert McGibbon `.

- The new ``cv_results_`` attribute of :class:`model_selection.GridSearchCV` (and :class:`model_selection.RandomizedSearchCV`) can be easily imported into pandas as a ``DataFrame``. Ref :ref:`model_selection_changes` for more information. :issue:`6697` by `Raghav RV`_.

- Generalization of :func:`model_selection.cross_val_predict`: one can pass method names such as `predict_proba` to be used in the cross validation framework instead of the default `predict`.
  By :user:`Ori Ziv ` and :user:`Sears Merritt `.

- The training scores and the time taken for training followed by scoring for each search candidate are now available in the ``cv_results_`` dict. See :ref:`model_selection_changes` for more information. :issue:`7325` by :user:`Eugene Chen ` and `Raghav RV`_.

Metrics

- Added a ``labels`` flag to :class:`metrics.log_loss` to explicitly provide the labels when the number of classes in ``y_true`` and ``y_pred`` differ. :issue:`7239` by :user:`Hong Guangguo ` with help from :user:`Mads Jensen ` and :user:`Nelson Liu `.

- Support sparse contingency matrices in cluster evaluation (:mod:`metrics.cluster.supervised`) to scale to a large number of clusters. :issue:`7419` by :user:`Gregory Stupp ` and `Joel Nothman`_.

- Added a ``sample_weight`` parameter to :func:`metrics.matthews_corrcoef`. By :user:`Jatin Shah ` and `Raghav RV`_.

- Speed up :func:`metrics.silhouette_score` by using vectorized operations. By `Manoj Kumar`_.

- Added a ``sample_weight`` parameter to :func:`metrics.confusion_matrix`. By :user:`Bernardo Stein `.

Miscellaneous

- Added an ``n_jobs`` parameter to :class:`feature_selection.RFECV` to compute the score on the test folds in parallel. By `Manoj Kumar`_

- The codebase does not contain C/C++ cython generated files: they are generated during build. Distribution packages will still contain generated C/C++ files. By :user:`Arthur Mensch `.

- Reduced the memory usage for 32-bit float input arrays of :func:`utils.sparse_func.mean_variance_axis` and :func:`utils.sparse_func.incr_mean_variance_axis` by supporting cython fused types. By :user:`YenChen Lin `.

- :func:`ignore_warnings` now accepts a category argument to ignore only the warnings of a specified type. By :user:`Thierry Guillemot `.

- Added the parameter ``return_X_y`` and the return type ``(data, target) : tuple`` option to :func:`load_iris` (:issue:`7049`), :func:`load_breast_cancer` (:issue:`7152`), :func:`load_digits`, :func:`load_diabetes`, :func:`load_linnerud` and :func:`load_boston` (:issue:`7154`) by :user:`Manvendra Singh`.

- Simplification of the ``clone`` function; deprecate support for estimators that modify parameters in ``__init__``. :issue:`5540` by `Andreas Müller`_.

- When unpickling a scikit-learn estimator in a different version than the one the estimator was trained with, a ``UserWarning`` is raised; see :ref:`the documentation on model persistence ` for more details. (:issue:`7248`) By `Andreas Müller`_.

Bug fixes
.........

Trees and ensembles

- Random forest, extra trees, decision trees and gradient boosting no longer accept ``min_samples_split=1``, as at least 2 samples are required to split a decision tree node. By `Arnaud Joly`_

- :class:`ensemble.VotingClassifier` now raises ``NotFittedError`` if ``predict``, ``transform`` or ``predict_proba`` are called on the non-fitted estimator. By `Sebastian Raschka`_.

- Fix bug where :class:`ensemble.AdaBoostClassifier` and :class:`ensemble.AdaBoostRegressor` would perform poorly if the ``random_state`` was fixed (:issue:`7411`). By `Joel Nothman`_.

- Fix bug in ensembles with randomization where the ensemble would not set ``random_state`` on base estimators in a pipeline or similar nesting (:issue:`7411`). Note that results for :class:`ensemble.BaggingClassifier`, :class:`ensemble.BaggingRegressor`, :class:`ensemble.AdaBoostClassifier` and :class:`ensemble.AdaBoostRegressor` will now differ from previous versions. By `Joel Nothman`_.

Linear, kernelized and related models

- Fixed incorrect gradient computation for ``loss='squared_epsilon_insensitive'`` in :class:`linear_model.SGDClassifier` and :class:`linear_model.SGDRegressor` (:issue:`6764`). By :user:`Wenhua Yang `.

- Fix bug in :class:`linear_model.LogisticRegressionCV` where ``solver='liblinear'`` did not accept ``class_weight='balanced'`` (:issue:`6817`). By `Tom Dupre la Tour`_.

- Fix bug in :class:`neighbors.RadiusNeighborsClassifier` where an error occurred when there were outliers being labelled and a weight function specified (:issue:`6902`). By `LeonieBorne `_.

- Fix :class:`linear_model.ElasticNet`'s sparse decision function to match the dense output in the multioutput case.

Decomposition, manifold learning and clustering

- :class:`decomposition.RandomizedPCA`'s default number of `iterated_power` is 4 instead of 3. :issue:`5141` by :user:`Giorgio Patrini `.

- :func:`utils.extmath.randomized_svd` performs 4 power iterations by default, instead of 0. In practice this is enough for obtaining a good approximation of the true eigenvalues/vectors in the presence of noise. When `n_components` is small (``< .1 * min(X.shape)``), `n_iter` is set to 7, unless the user specifies a higher number. This improves precision with few components. :issue:`5299` by :user:`Giorgio Patrini`.

- The whiten/non-whiten inconsistency between components of :class:`decomposition.PCA` and :class:`decomposition.RandomizedPCA` (now factored into PCA, see the New features) is fixed. `components_` are stored with no whitening. :issue:`5299` by :user:`Giorgio Patrini `.

- Fixed bug in :func:`manifold.spectral_embedding` where the diagonal of the unnormalized Laplacian matrix was incorrectly set to 1. :issue:`4995` by :user:`Peter Fischer `.

- Fixed incorrect initialization of :func:`utils.arpack.eigsh` on all occurrences. Affects :class:`cluster.bicluster.SpectralBiclustering`, :class:`decomposition.KernelPCA`, :class:`manifold.LocallyLinearEmbedding`, and :class:`manifold.SpectralEmbedding` (:issue:`5012`). By :user:`Peter Fischer `.

- The ``explained_variance_ratio_`` attribute calculated with the SVD solver of :class:`discriminant_analysis.LinearDiscriminantAnalysis` now returns correct results. By :user:`JPFrancoia `

Preprocessing and feature selection

- :func:`preprocessing.data._transform_selected` now always passes a copy of ``X`` to the transform function when ``copy=True`` (:issue:`7194`). By `Caio Oliveira `_.

Model evaluation and meta-estimators

- :class:`model_selection.StratifiedKFold` now raises an error if the number of labels for any individual class is less than ``n_folds``. :issue:`6182` by :user:`Devashish Deshpande `.

- Fixed bug in :class:`model_selection.StratifiedShuffleSplit` where train and test samples could overlap in some edge cases; see :issue:`6121` for more details. By `Loic Esteve`_.

- Fix in :class:`sklearn.model_selection.StratifiedShuffleSplit` to return splits of size ``train_size`` and ``test_size`` in all cases (:issue:`6472`). By `Andreas Müller`_.

- Cross-validation of :class:`OneVsOneClassifier` and :class:`OneVsRestClassifier` now works with precomputed kernels (a short sketch follows). :issue:`7350` by :user:`Russell Smith `.
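
A minimal sketch of the now-supported pattern (the data and estimator choice are illustrative): with a precomputed kernel, the ``X`` passed to the cross-validation utilities is the square kernel matrix, which is sliced consistently for each fold::

    from sklearn.datasets import make_classification
    from sklearn.metrics.pairwise import pairwise_kernels
    from sklearn.model_selection import cross_val_score
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=60, n_classes=3, n_informative=6,
                               random_state=0)
    K = pairwise_kernels(X, metric="linear")  # (n_samples, n_samples) matrix

    clf = OneVsRestClassifier(SVC(kernel="precomputed"))
    print(cross_val_score(clf, K, y, cv=3))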

- Fix incomplete ``predict_proba`` method delegation from :class:`model_selection.GridSearchCV` to :class:`linear_model.SGDClassifier` (:issue:`7159`) by `Yichuan Liu `_.

Metrics

- Fix bug in :func:`metrics.silhouette_score` in which clusters of size 1 were incorrectly scored. They should get a score of 0. By `Joel Nothman`_.

- Fix bug in :func:`metrics.silhouette_samples` so that it now works with arbitrary labels, not just those ranging from 0 to n_clusters - 1.

- Fix bug where expected and adjusted mutual information were incorrect if cluster contingency cells exceeded ``2**16``. By `Joel Nothman`_.

- :func:`metrics.pairwise.pairwise_distances` now converts arrays to boolean arrays when required in ``scipy.spatial.distance``. :issue:`5460` by `Tom Dupre la Tour`_.

- Fix sparse input support in :func:`metrics.silhouette_score` as well as in the example examples/text/document_clustering.py. By :user:`YenChen Lin `.

- :func:`metrics.roc_curve` and :func:`metrics.precision_recall_curve` no longer round ``y_score`` values when creating ROC curves; this was causing problems for users with very small differences in scores (:issue:`7353`).

Miscellaneous

- :func:`model_selection.tests._search._check_param_grid` now works correctly with all types that extend/implement `Sequence` (except string), including range (Python 3.x) and xrange (Python 2.x). :issue:`7323` by Viacheslav Kovalevskyi.

- :func:`utils.extmath.randomized_range_finder` is more numerically stable when many power iterations are requested, since it applies LU normalization by default. If ``n_iter < 2``, numerical issues are unlikely, so no normalization is applied. Other normalization options are available: ``'none'``, ``'LU'`` and ``'QR'``. :issue:`5141` by :user:`Giorgio Patrini `.

- Fix a bug where some formats of ``scipy.sparse`` matrix, and estimators with them as parameters, could not be passed to :func:`base.clone`. By `Loic Esteve`_.

- :func:`datasets.load_svmlight_file` is now able to read long int QID values. :issue:`7101` by :user:`Ibraim Ganiev `.

API changes summary
-------------------

Linear, kernelized and related models

- ``residual_metric`` has been deprecated in :class:`linear_model.RANSACRegressor`. Use ``loss`` instead. By `Manoj Kumar`_.

- Access to the public attributes ``.X_`` and ``.y_`` has been deprecated in :class:`isotonic.IsotonicRegression`. By :user:`Jonathan Arfa `.

Decomposition, manifold learning and clustering

- The old :class:`mixture.DPGMM` is deprecated in favor of the new :class:`mixture.BayesianGaussianMixture` (with the parameter ``weight_concentration_prior_type='dirichlet_process'``). The new class solves the computational problems of the old class and computes the Gaussian mixture with a Dirichlet process prior faster than before. :issue:`7295` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.

- The old :class:`mixture.VBGMM` is deprecated in favor of the new :class:`mixture.BayesianGaussianMixture` (with the parameter ``weight_concentration_prior_type='dirichlet_distribution'``). The new class solves the computational problems of the old class and computes the Variational Bayesian Gaussian mixture faster than before. :issue:`6651` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.

- The old :class:`mixture.GMM` is deprecated in favor of the new :class:`mixture.GaussianMixture`. The new class computes the Gaussian mixture faster than before, and some of the computational problems have been solved. :issue:`6666` by :user:`Wei Xue ` and :user:`Thierry Guillemot `.

Model evaluation and meta-estimators

- The :mod:`sklearn.cross_validation`, :mod:`sklearn.grid_search` and :mod:`sklearn.learning_curve` modules have been deprecated and the classes and functions have been reorganized into the :mod:`sklearn.model_selection` module. Ref :ref:`model_selection_changes` for more information. :issue:`4294` by `Raghav RV`_.

- The ``grid_scores_`` attribute of :class:`model_selection.GridSearchCV` and :class:`model_selection.RandomizedSearchCV` is deprecated in favor of the attribute ``cv_results_``. Ref :ref:`model_selection_changes` for more information. :issue:`6697` by `Raghav RV`_.

- The parameters ``n_iter`` or ``n_folds`` in old CV splitters are replaced by the new parameter ``n_splits``, since it provides a consistent and unambiguous interface to represent the number of train-test splits. :issue:`7187` by :user:`YenChen Lin `.

- The ``classes`` parameter was renamed to ``labels`` in :func:`metrics.hamming_loss`. :issue:`7260` by :user:`Sebastián Vanrell `.

- The splitter classes ``LabelKFold``, ``LabelShuffleSplit``, ``LeaveOneLabelOut`` and ``LeavePLabelsOut`` are renamed to :class:`model_selection.GroupKFold`, :class:`model_selection.GroupShuffleSplit`, :class:`model_selection.LeaveOneGroupOut` and :class:`model_selection.LeavePGroupsOut` respectively. Also, the parameter ``labels`` in the :func:`split` method of the newly renamed splitters :class:`model_selection.LeaveOneGroupOut` and :class:`model_selection.LeavePGroupsOut` is renamed to ``groups``. Additionally, in :class:`model_selection.LeavePGroupsOut` the parameter ``n_labels`` is renamed to ``n_groups``. :issue:`6660` by `Raghav RV`_.

- Error and loss names for ``scoring`` parameters are now prefixed by ``'neg_'``, such as ``neg_mean_squared_error``. The unprefixed versions are deprecated and will be removed in version 0.20 (a short sketch follows). :issue:`7261` by :user:`Tim Head `.
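
For instance, with the new naming an error-based scorer returns negated values, so that greater is always better (the data and model below are illustrative)::

    from sklearn.datasets import make_regression
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import cross_val_score

    X, y = make_regression(n_samples=50, random_state=0)

    # Scores are negated MSE: larger (closer to zero) means a better fit.
    scores = cross_val_score(Ridge(), X, y,
                             scoring="neg_mean_squared_error", cv=3)
    print(scores)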

Code Contributors
-----------------
Aditya Joshi, Alejandro, Alexander Fabisch, Alexander Loginov, Alexander Minyushkin, Alexander Rudy, Alexandre Abadie, Alexandre Abraham, Alexandre Gramfort, Alexandre Saint, alexfields, Alvaro Ulloa, alyssaq, Amlan Kar, Andreas Mueller, andrew giessel, Andrew Jackson, Andrew McCulloh, Andrew Murray, Anish Shah, Arafat, Archit Sharma, Ariel Rokem, Arnaud Joly, Arnaud Rachez, Arthur Mensch, Ash Hoover, asnt, b0noI, Behzad Tabibian, Bernardo, Bernhard Kratzwald, Bhargav Mangipudi, blakeflei, Boyuan Deng, Brandon Carter, Brett Naul, Brian McFee, Caio Oliveira, Camilo Lamus, Carol Willing, Cass, CeShine Lee, Charles Truong, Chyi-Kwei Yau, CJ Carey, codevig, Colin Ni, Dan Shiebler, Daniel, Daniel Hnyk, David Ellis, David Nicholson, David Staub, David Thaler, David Warshaw, Davide Lasagna, Deborah, definitelyuncertain, Didi Bar-Zev, djipey, dsquareindia, edwinENSAE, Elias Kuthe, Elvis DOHMATOB, Ethan White, Fabian Pedregosa, Fabio Ticconi, fisache, Florian Wilhelm, Francis, Francis O'Donovan, Gael Varoquaux, Ganiev Ibraim, ghg, Gilles Louppe, Giorgio Patrini, Giovanni Cherubin, Giovanni Lanzani, Glenn Qian, Gordon Mohr, govin-vatsan, Graham Clenaghan, Greg Reda, Greg Stupp, Guillaume Lemaitre, Gustav Mörtberg, halwai, Harizo Rajaona, Harry Mavroforakis, hashcode55, hdmetor, Henry Lin, Hobson Lane, Hugo Bowne-Anderson, Igor Andriushchenko, Imaculate, Inki Hwang, Isaac Sijaranamual, Ishank Gulati, Issam Laradji, Iver Jordal, jackmartin, Jacob Schreiber, Jake Vanderplas, James Fiedler, James Routley, Jan Zikes, Janna Brettingen, jarfa, Jason Laska, jblackburne, jeff levesque, Jeffrey Blackburne, Jeffrey04, Jeremy Hintz, jeremynixon, Jeroen, Jessica Yung, Jill-Jênn Vie, Jimmy Jia, Jiyuan Qian, Joel Nothman, johannah, John, John Boersma, John Kirkham, John Moeller, jonathan.striebel, joncrall, Jordi, Joseph Munoz, Joshua Cook, JPFrancoia, jrfiedler, JulianKahnert, juliathebrave, kaichogami, KamalakerDadi, Kenneth Lyons, Kevin Wang, kingjr, kjell, Konstantin Podshumok, Kornel Kielczewski, Krishna Kalyan, krishnakalyan3, Kvle Putnam, Kyle Jackson, Lars Buitinck, ldavid, LeiG, LeightonZhang, Leland McInnes, Liang-Chi Hsieh, Lilian Besson, lizsz, Loic Esteve, Louis Tiao, Léonie Borne, Mads Jensen, Maniteja Nandana, Manoj Kumar, Manvendra Singh, Marco, Mario Krell, Mark Bao, Mark Szepieniec, Martin Madsen, MartinBpr, MaryanMorel, Massil, Matheus, Mathieu Blondel, Mathieu Dubois, Matteo, Matthias Ekman, Max Moroz, Michael Scherer, michiaki ariga, Mikhail Korobov, Moussa Taifi, mrandrewandrade, Mridul Seth, nadya-p, Naoya Kanai, Nate George, Nelle Varoquaux, Nelson Liu, Nick James, NickleDave, Nico, Nicolas Goix, Nikolay Mayorov, ningchi, nlathia, okbalefthanded, Okhlopkov, Olivier Grisel, Panos Louridas, Paul Strickland, Perrine Letellier, pestrickland, Peter Fischer, Pieter, Ping-Yao, Chang, practicalswift, Preston Parry, Qimu Zheng, Rachit Kansal, Raghav RV, Ralf Gommers, Ramana.S, Rammig, Randy Olson, Rob Alexander, Robert Lutz, Robin Schucker, Rohan Jain, Ruifeng Zheng, Ryan Yu, Rémy Léone, saihttam, Saiwing Yeung, Sam Shleifer, Samuel St-Jean, Sartaj Singh, Sasank Chilamkurthy, saurabh.bansod, Scott Andrews, Scott Lowe, seales, Sebastian Raschka, Sebastian Saeger, Sebastián Vanrell, Sergei Lebedev, shagun Sodhani, shanmuga cv, Shashank Shekhar, shawpan, shengxiduan, Shota, shuckle16, Skipper Seabold, sklearn-ci, SmedbergM, srvanrell, Sébastien Lerique, Taranjeet, themrmax, Thierry, Thierry Guillemot, Thomas, Thomas
Hallock, Thomas Moreau, Tim Head, tKammy, toastedcornflakes, Tom, TomDLT, Toshihiro Kamishima, tracer0tong, Trent Hauck, trevorstephens, Tue Vo, Varun, Varun Jewalikar, Viacheslav, Vighnesh Birodkar, Vikram, Villu Ruusmann, Vinayak Mehta, walter, waterponey, Wenhua Yang, Wenjian Huang, Will Welch, wyseguy7, xyguo, yanlend, Yaroslav Halchenko, yelite, Yen, YenChenLin, Yichuan Liu, Yoav Ram, Yoshiki, Zheng RuiFeng, zivori, Óscar Nájera

.. currentmodule:: sklearn

.. _changes_0_17_1:

Version 0.17.1
==============

**February 18, 2016**

Changelog
---------

Bug fixes
.........

- Upgrade vendored joblib to version 0.9.4, which fixes an important bug in ``joblib.Parallel`` that could silently yield wrong results when working on datasets larger than 1MB: https://github.com/joblib/joblib/blob/0.9.4/CHANGES.rst

- Fixed reading of Bunch pickles generated with scikit-learn version <= 0.16. This can affect users who have already downloaded a dataset with scikit-learn 0.16 and are loading it with scikit-learn 0.17. See :issue:`6196` for how this affected :func:`datasets.fetch_20newsgroups`. By `Loic Esteve`_.

- Fixed a bug that prevented using the ROC AUC score to perform grid search on several CPUs / cores on large arrays. See :issue:`6147`. By `Olivier Grisel`_.

- Fixed a bug that prevented properly setting the ``presort`` parameter in :class:`ensemble.GradientBoostingRegressor`. See :issue:`5857`. By Andrew McCulloh.

- Fixed a joblib error when evaluating the perplexity of a :class:`decomposition.LatentDirichletAllocation` model. See :issue:`6258`. By Chyi-Kwei Yau.

.. _changes_0_17:

Version 0.17
============

**November 5, 2015**

Changelog
---------

New features
............

- All the Scaler classes except :class:`preprocessing.RobustScaler` can be fitted online by calling `partial_fit`. By :user:`Giorgio Patrini `.

- The new class :class:`ensemble.VotingClassifier` implements a "majority rule" / "soft voting" ensemble classifier to combine estimators for classification. By `Sebastian Raschka`_.

- The new class :class:`preprocessing.RobustScaler` provides an alternative to :class:`preprocessing.StandardScaler` for feature-wise centering and range normalization that is robust to outliers. By :user:`Thomas Unterthiner `.

- The new class :class:`preprocessing.MaxAbsScaler` provides an alternative to :class:`preprocessing.MinMaxScaler` for feature-wise range normalization when the data is already centered or sparse. By :user:`Thomas Unterthiner `.

- The new class :class:`preprocessing.FunctionTransformer` turns a Python function into a ``Pipeline``-compatible transformer object. By Joe Jevnik.

- The new classes :class:`cross_validation.LabelKFold` and :class:`cross_validation.LabelShuffleSplit` generate train-test folds, respectively similar to :class:`cross_validation.KFold` and :class:`cross_validation.ShuffleSplit`, except that the folds are conditioned on a label array. By `Brian McFee`_, :user:`Jean Kossaifi ` and `Gilles Louppe`_.

- :class:`decomposition.LatentDirichletAllocation` implements the Latent Dirichlet Allocation topic model with online variational inference. By :user:`Chyi-Kwei Yau `, with code based on an implementation by Matt Hoffman. (:issue:`3659`)

- The new solver ``sag`` implements a Stochastic Average Gradient descent and is available in both :class:`linear_model.LogisticRegression` and :class:`linear_model.Ridge`. This solver is very efficient for large datasets. By :user:`Danny Sullivan ` and `Tom Dupre la Tour`_. (:issue:`4738`)

- The new solver ``cd`` implements Coordinate Descent in :class:`decomposition.NMF`. The previous solver, based on Projected Gradient, is still available by setting the new parameter ``solver`` to ``pg``, but is deprecated and will be removed in 0.19, along with :class:`decomposition.ProjectedGradientNMF` and the parameters ``sparseness``, ``eta``, ``beta`` and ``nls_max_iter``. New parameters ``alpha`` and ``l1_ratio`` control L1 and L2 regularization, and ``shuffle`` adds a shuffling step in the ``cd`` solver. By `Tom Dupre la Tour`_ and `Mathieu Blondel`_.

Enhancements
............

- :class:`manifold.TSNE` now supports approximate optimization via the Barnes-Hut method, leading to much faster fitting. By Christopher Erick Moody. (:issue:`4025`)

- :class:`cluster.mean_shift_.MeanShift` now supports parallel execution, as implemented in the ``mean_shift`` function. By :user:`Martino Sorbaro `.

- :class:`naive_bayes.GaussianNB` now supports fitting with ``sample_weight``. By `Jan Hendrik Metzen`_.

- :class:`dummy.DummyClassifier` now supports a prior fitting strategy. By `Arnaud Joly`_.

- Added a ``fit_predict`` method for :class:`mixture.GMM` and subclasses. By :user:`Cory Lorenz `.

- Added the :func:`metrics.label_ranking_loss` metric. By `Arnaud Joly`_.

- Added the :func:`metrics.cohen_kappa_score` metric.

- Added a ``warm_start`` constructor parameter to the bagging ensemble models to increase the size of the ensemble. By :user:`Tim Head `.

- Added the option to use multi-output regression metrics without averaging. By Konstantin Shmelkov and :user:`Michael Eickenberg`.

- Added the ``stratify`` option to :func:`cross_validation.train_test_split` for stratified splitting. By Miroslav Batchkarov.

- The :func:`tree.export_graphviz` function now supports aesthetic improvements for :class:`tree.DecisionTreeClassifier` and :class:`tree.DecisionTreeRegressor`, including options for coloring nodes by their majority class or impurity, showing variable names, and using node proportions instead of raw sample counts. By `Trevor Stephens`_.

- Improved the speed of the ``newton-cg`` solver in :class:`linear_model.LogisticRegression` by avoiding loss computation. By `Mathieu Blondel`_ and `Tom Dupre la Tour`_.

- The ``class_weight="auto"`` heuristic in classifiers supporting ``class_weight`` was deprecated and replaced by the ``class_weight="balanced"`` option, which has a simpler formula and interpretation. By `Hanna Wallach`_ and `Andreas Müller`_.

- Added a ``class_weight`` parameter to automatically weight samples by class frequency for :class:`linear_model.PassiveAggressiveClassifier`. By `Trevor Stephens`_.

- Added backlinks from the API reference pages to the user guide. By `Andreas Müller`_.

- The ``labels`` parameter to :func:`sklearn.metrics.f1_score`, :func:`sklearn.metrics.fbeta_score`, :func:`sklearn.metrics.recall_score` and :func:`sklearn.metrics.precision_score` has been extended. It is now possible to ignore one or more labels, such as where a multiclass problem has a majority class to ignore. By `Joel Nothman`_.

- Added ``sample_weight`` support to :class:`linear_model.RidgeClassifier`. By `Trevor Stephens`_.

- Provide an option for sparse output from :func:`sklearn.metrics.pairwise.cosine_similarity`. By :user:`Jaidev Deshpande `.
- -- Add :func:`minmax_scale` to provide a function interface for - :class:`MinMaxScaler`. By :user:`Thomas Unterthiner `. - -- ``dump_svmlight_file`` now handles multi-label datasets. - By Chih-Wei Chang. - -- RCV1 dataset loader (:func:`sklearn.datasets.fetch_rcv1`). - By `Tom Dupre la Tour`_. - -- The "Wisconsin Breast Cancer" classical two-class classification dataset - is now included in scikit-learn, available with - :func:`sklearn.datasets.load_breast_cancer`. - -- Upgraded to joblib 0.9.3 to benefit from the new automatic batching of - short tasks. This makes it possible for scikit-learn to benefit from - parallelism when many very short tasks are executed in parallel, for - instance by the :class:`grid_search.GridSearchCV` meta-estimator - with ``n_jobs > 1`` used with a large grid of parameters on a small - dataset. By `Vlad Niculae`_, `Olivier Grisel`_ and `Loic Esteve`_. - -- For more details about changes in joblib 0.9.3 see the release notes: - https://github.com/joblib/joblib/blob/master/CHANGES.rst#release-093 - -- Improved speed (3 times per iteration) of - :class:`decomposition.DictLearning` with the coordinate descent method - from :class:`linear_model.Lasso`. By :user:`Arthur Mensch `. - -- Parallel processing (threaded) for queries of nearest neighbors - (using the ball-tree) by Nikolay Mayorov. - -- Allow :func:`datasets.make_multilabel_classification` to output - a sparse ``y``. By Kashif Rasul. - -- :class:`cluster.DBSCAN` now accepts a sparse matrix of precomputed - distances, allowing memory-efficient distance precomputation. By - `Joel Nothman`_. - -- :class:`tree.DecisionTreeClassifier` now exposes an ``apply`` method - for retrieving the leaf indices that samples are predicted as. By - :user:`Daniel Galvez ` and `Gilles Louppe`_. - -- Speed up decision tree regressors, random forest regressors, extra trees - regressors and gradient boosting estimators by computing a proxy - of the impurity improvement during the tree growth. The proxy quantity is - such that the split that maximizes this value also maximizes the impurity - improvement. By `Arnaud Joly`_, :user:`Jacob Schreiber ` - and `Gilles Louppe`_. - -- Speed up tree based methods by reducing the number of computations needed - when computing the impurity measure, taking into account the linear - relationship of the computed statistics. The effect is particularly - visible with extra trees and on datasets with categorical or sparse - features. By `Arnaud Joly`_. - -- :class:`ensemble.GradientBoostingRegressor` and - :class:`ensemble.GradientBoostingClassifier` now expose an ``apply`` - method for retrieving the leaf indices each sample ends up in under - each tree. By :user:`Jacob Schreiber `. - -- Add ``sample_weight`` support to :class:`linear_model.LinearRegression`. - By Sonny Hu. (:issue:`4881`) - -- Add ``n_iter_without_progress`` to :class:`manifold.TSNE` to control - the stopping criterion. By Santi Villalba. (:issue:`5186`) - -- Added optional parameter ``random_state`` in :class:`linear_model.Ridge`, to - set the seed of the pseudo random generator used in the ``sag`` solver. By `Tom Dupre la Tour`_. - -- Added optional parameter ``warm_start`` in - :class:`linear_model.LogisticRegression`. If set to True, the solvers - ``lbfgs``, ``newton-cg`` and ``sag`` will be initialized with the - coefficients computed in the previous fit. By `Tom Dupre la Tour`_. - -- Added ``sample_weight`` support to :class:`linear_model.LogisticRegression` for - the ``lbfgs``, ``newton-cg``, and ``sag`` solvers. By `Valentin Stolbunov`_.
- Support added to the ``liblinear`` solver. By `Manoj Kumar`_. - -- Added optional parameter ``presort`` to :class:`ensemble.GradientBoostingRegressor` - and :class:`ensemble.GradientBoostingClassifier`, keeping default behavior - the same. This allows gradient boosters to turn off presorting when building - deep trees or using sparse data. By :user:`Jacob Schreiber `. - -- Altered :func:`metrics.roc_curve` to drop unnecessary thresholds by - default. By :user:`Graham Clenaghan `. - -- Added :class:`feature_selection.SelectFromModel` meta-transformer which can - be used along with estimators that have `coef_` or `feature_importances_` - attributes to select important features of the input data. By - :user:`Maheshakya Wijewardena `, `Joel Nothman`_ and `Manoj Kumar`_. - -- Added :func:`metrics.pairwise.laplacian_kernel`. By `Clyde Fare `_. - -- :class:`covariance.GraphLasso` allows separate control of the convergence criterion - for the Elastic-Net subproblem via the ``enet_tol`` parameter. - -- Improved verbosity in :class:`decomposition.DictionaryLearning`. - -- :class:`ensemble.RandomForestClassifier` and - :class:`ensemble.RandomForestRegressor` no longer explicitly store the - samples used in bagging, resulting in a much reduced memory footprint for - storing random forest models. - -- Added ``positive`` option to :class:`linear_model.Lars` and - :func:`linear_model.lars_path` to force coefficients to be positive. - (:issue:`5131`) - -- Added the ``X_norm_squared`` parameter to :func:`metrics.pairwise.euclidean_distances` - to provide precomputed squared norms for ``X``. - -- Added the ``fit_predict`` method to :class:`pipeline.Pipeline`. - -Bug fixes -......... - -- Fixed non-determinism in :class:`dummy.DummyClassifier` with sparse - multi-label output. By `Andreas Müller`_. - -- Fixed the output shape of :class:`linear_model.RANSACRegressor` to - ``(n_samples, )``. By `Andreas Müller`_. - -- Fixed bug in :class:`decomposition.DictLearning` when ``n_jobs < 0``. By - `Andreas Müller`_. - -- Fixed bug where :class:`grid_search.RandomizedSearchCV` could consume a - lot of memory for large discrete grids. By `Joel Nothman`_. - -- Fixed bug in :class:`linear_model.LogisticRegressionCV` where `penalty` was ignored - in the final fit. By `Manoj Kumar`_. - -- Fixed bug in :class:`ensemble.forest.ForestClassifier` when computing - ``oob_score`` and ``X`` is a ``sparse.csc_matrix``. By :user:`Ankur Ankan `. - -- All regressors now consistently handle and warn when given ``y`` that is of - shape ``(n_samples, 1)``. By `Andreas Müller`_ and Henry Lin. - (:issue:`5431`) - -- Fix in :class:`cluster.KMeans` cluster reassignment for sparse input by - `Lars Buitinck`_. - -- Fixed a bug in :class:`lda.LDA` that could cause asymmetric covariance - matrices when using shrinkage. By `Martin Billinger`_. - -- Fixed :func:`cross_validation.cross_val_predict` for estimators with - sparse predictions. By Buddha Prakash. - -- Fixed the ``predict_proba`` method of :class:`linear_model.LogisticRegression` - to use soft-max instead of one-vs-rest normalization. By `Manoj Kumar`_. - (:issue:`5182`) - -- Fixed the :func:`partial_fit` method of :class:`linear_model.SGDClassifier` - when called with ``average=True``. By :user:`Andrew Lamb `. - (:issue:`5282`) - -- Dataset fetchers use different filenames under Python 2 and Python 3 to - avoid pickling compatibility issues. By `Olivier Grisel`_.
- (:issue:`5355`) - -- Fixed a bug in :class:`naive_bayes.GaussianNB` which caused classification - results to depend on scale. By `Jake Vanderplas`_. - -- Temporarily fixed :class:`linear_model.Ridge`, which was incorrect - when fitting the intercept in the case of sparse data. The fix - automatically changes the solver to 'sag' in this case. - :issue:`5360` by `Tom Dupre la Tour`_. - -- Fixed a performance bug in :class:`decomposition.RandomizedPCA` on data - with a large number of features and fewer samples. (:issue:`4478`) - By `Andreas Müller`_, `Loic Esteve`_ and :user:`Giorgio Patrini `. - -- Fixed bug in :class:`cross_decomposition.PLS` that yielded unstable and - platform dependent output, and failed on `fit_transform`. - By :user:`Arthur Mensch `. - -- Fixes to the ``Bunch`` class used to store datasets. - -- Fixed :func:`ensemble.plot_partial_dependence` ignoring the - ``percentiles`` parameter. - -- Providing a ``set`` as vocabulary in ``CountVectorizer`` no longer - leads to inconsistent results when pickling. - -- Fixed the conditions on when a precomputed Gram matrix needs to - be recomputed in :class:`linear_model.LinearRegression`, - :class:`linear_model.OrthogonalMatchingPursuit`, - :class:`linear_model.Lasso` and :class:`linear_model.ElasticNet`. - -- Fixed inconsistent memory layout in the coordinate descent solver - that affected :class:`linear_model.DictionaryLearning` and - :class:`covariance.GraphLasso`. (:issue:`5337`) - By `Olivier Grisel`_. - -- :class:`manifold.LocallyLinearEmbedding` no longer ignores the ``reg`` - parameter. - -- Nearest Neighbor estimators with custom distance metrics can now be pickled. - (:issue:`4362`) - -- Fixed a bug in :class:`pipeline.FeatureUnion` where ``transformer_weights`` - were not properly handled when performing grid-searches. - -- Fixed a bug in :class:`linear_model.LogisticRegression` and - :class:`linear_model.LogisticRegressionCV` when using - ``class_weight='balanced'`` or ``class_weight='auto'``. - By `Tom Dupre la Tour`_. - -- Fixed bug :issue:`5495` when - doing OVR(SVC(decision_function_shape="ovr")). Fixed by - :user:`Elvis Dohmatob `. - - -API changes summary ------------------- -- Attributes `data_min`, `data_max` and `data_range` in - :class:`preprocessing.MinMaxScaler` are deprecated and won't be available - from 0.19. Instead, the class now exposes `data_min_`, `data_max_` - and `data_range_`. By :user:`Giorgio Patrini `. - -- All Scaler classes now have a `scale_` attribute, the feature-wise - rescaling applied by their `transform` methods. The old attribute `std_` - in :class:`preprocessing.StandardScaler` is deprecated and superseded - by `scale_`; it won't be available in 0.19. By :user:`Giorgio Patrini `. - -- :class:`svm.SVC` and :class:`svm.NuSVC` now have a ``decision_function_shape`` - parameter to make their decision function of shape ``(n_samples, n_classes)`` - by setting ``decision_function_shape='ovr'``. This will be the default behavior - starting in 0.19. By `Andreas Müller`_. - -- Passing 1D data arrays as input to estimators is now deprecated as it - caused confusion in how the array elements should be interpreted - as features or as samples. All data arrays are now expected - to be explicitly shaped ``(n_samples, n_features)``. - By :user:`Vighnesh Birodkar `. - -- :class:`lda.LDA` and :class:`qda.QDA` have been moved to - :class:`discriminant_analysis.LinearDiscriminantAnalysis` and - :class:`discriminant_analysis.QuadraticDiscriminantAnalysis`.
- -- The ``store_covariance`` and ``tol`` parameters have been moved from - the fit method to the constructor in - :class:`discriminant_analysis.LinearDiscriminantAnalysis` and the - ``store_covariances`` and ``tol`` parameters have been moved from the - fit method to the constructor in - :class:`discriminant_analysis.QuadraticDiscriminantAnalysis`. - -- Models inheriting from ``_LearntSelectorMixin`` will no longer support the - transform methods (i.e. RandomForests, GradientBoosting, LogisticRegression, - DecisionTrees, SVMs and SGD-related models). Instead, wrap these models with the - meta-transformer :class:`feature_selection.SelectFromModel` to remove - features (according to `coefs_` or `feature_importances_`) - which are below a certain threshold value. - -- :class:`cluster.KMeans` re-runs cluster-assignments in case of non-convergence, - to ensure consistency of ``predict(X)`` and ``labels_``. By - :user:`Vighnesh Birodkar `. - -- Classifier and Regressor models are now tagged as such using the - ``_estimator_type`` attribute. - -- Cross-validation iterators always provide indices into training and test set, - not boolean masks. - -- The ``decision_function`` on all regressors was deprecated and will be - removed in 0.19. Use ``predict`` instead. - -- :func:`datasets.load_lfw_pairs` is deprecated and will be removed in 0.19. - Use :func:`datasets.fetch_lfw_pairs` instead. - -- The deprecated ``hmm`` module was removed. - -- The deprecated ``Bootstrap`` cross-validation iterator was removed. - -- The deprecated ``Ward`` and ``WardAgglomerative`` classes have been removed. - Use :class:`cluster.AgglomerativeClustering` instead. - -- :func:`cross_validation.check_cv` is now a public function. - -- The property ``residues_`` of :class:`linear_model.LinearRegression` is deprecated - and will be removed in 0.19. - -- The deprecated ``n_jobs`` parameter of :class:`linear_model.LinearRegression` has been moved - to the constructor. - -- Removed deprecated ``class_weight`` parameter from :class:`linear_model.SGDClassifier`'s ``fit`` - method. Use the constructor parameter instead. - -- The deprecated support for the sequence of sequences (or list of lists) multilabel - format was removed. To convert to and from the supported binary - indicator matrix format, use - :class:`MultiLabelBinarizer `. - -- The behavior of calling the ``inverse_transform`` method of ``pipeline.Pipeline`` will - change in 0.19. It will no longer reshape one-dimensional input to two-dimensional input. - -- The deprecated attributes ``indicator_matrix_``, ``multilabel_`` and ``classes_`` of - :class:`preprocessing.LabelBinarizer` were removed. - -- Using ``gamma=0`` in :class:`svm.SVC` and :class:`svm.SVR` to automatically set the - gamma to ``1. / n_features`` is deprecated and will be removed in 0.19. - Use ``gamma="auto"`` instead.
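- -For illustration only (an editor's sketch, not part of the original
- changelog entries): code relying on the deprecated ``gamma=0`` spelling
- can switch to the explicit ``gamma="auto"`` spelling with no change in
- behavior, since both mean ``1. / n_features``::
-
-    from sklearn.svm import SVC
-
-    # Deprecated spelling: gamma=0 implicitly meant "use 1. / n_features".
-    # clf = SVC(kernel='rbf', gamma=0)
-
-    # Explicit, forward-compatible equivalent:
-    clf = SVC(kernel='rbf', gamma='auto')
-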
- -Code Contributors ------------------ -Aaron Schumacher, Adithya Ganesh, akitty, Alexandre Gramfort, Alexey Grigorev, -Ali Baharev, Allen Riddell, Ando Saabas, Andreas Mueller, Andrew Lamb, Anish -Shah, Ankur Ankan, Anthony Erlinger, Ari Rouvinen, Arnaud Joly, Arnaud Rachez, -Arthur Mensch, banilo, Barmaley.exe, benjaminirving, Boyuan Deng, Brett Naul, -Brian McFee, Buddha Prakash, Chi Zhang, Chih-Wei Chang, Christof Angermueller, -Christoph Gohlke, Christophe Bourguignat, Christopher Erick Moody, Chyi-Kwei -Yau, Cindy Sridharan, CJ Carey, Clyde-fare, Cory Lorenz, Dan Blanchard, Daniel -Galvez, Daniel Kronovet, Danny Sullivan, Data1010, David, David D Lowe, David -Dotson, djipey, Dmitry Spikhalskiy, Donne Martin, Dougal J. Sutherland, Dougal -Sutherland, edson duarte, Eduardo Caro, Eric Larson, Eric Martin, Erich -Schubert, Fernando Carrillo, Frank C. Eckert, Frank Zalkow, Gael Varoquaux, -Ganiev Ibraim, Gilles Louppe, Giorgio Patrini, giorgiop, Graham Clenaghan, -Gryllos Prokopis, gwulfs, Henry Lin, Hsuan-Tien Lin, Immanuel Bayer, Ishank -Gulati, Jack Martin, Jacob Schreiber, Jaidev Deshpande, Jake Vanderplas, Jan -Hendrik Metzen, Jean Kossaifi, Jeffrey04, Jeremy, jfraj, Jiali Mei, -Joe Jevnik, Joel Nothman, John Kirkham, John Wittenauer, Joseph, Joshua Loyal, -Jungkook Park, KamalakerDadi, Kashif Rasul, Keith Goodman, Kian Ho, Konstantin -Shmelkov, Kyler Brown, Lars Buitinck, Lilian Besson, Loic Esteve, Louis Tiao, -maheshakya, Maheshakya Wijewardena, Manoj Kumar, MarkTab marktab.net, Martin -Ku, Martin Spacek, MartinBpr, martinosorb, MaryanMorel, Masafumi Oyamada, -Mathieu Blondel, Matt Krump, Matti Lyra, Maxim Kolganov, mbillinger, mhg, -Michael Heilman, Michael Patterson, Miroslav Batchkarov, Nelle Varoquaux, -Nicolas, Nikolay Mayorov, Olivier Grisel, Omer Katz, Óscar Nájera, Pauli -Virtanen, Peter Fischer, Peter Prettenhofer, Phil Roth, pianomania, Preston -Parry, Raghav RV, Rob Zinkov, Robert Layton, Rohan Ramanath, Saket Choudhary, -Sam Zhang, santi, saurabh.bansod, scls19fr, Sebastian Raschka, Sebastian -Saeger, Shivan Sornarajah, SimonPL, sinhrks, Skipper Seabold, Sonny Hu, sseg, -Stephen Hoover, Steven De Gryze, Steven Seguin, Theodore Vasiloudis, Thomas -Unterthiner, Tiago Freitas Pereira, Tian Wang, Tim Head, Timothy Hopper, -tokoroten, Tom Dupré la Tour, Trevor Stephens, Valentin Stolbunov, Vighnesh -Birodkar, Vinayak Mehta, Vincent, Vincent Michel, vstolbunov, wangz10, Wei Xue, -Yucheng Low, Yury Zhauniarovich, Zac Stewart, zhai_pro, Zichen Wang - -.. _changes_0_1_16: - -Version 0.16.1 -=============== - -**April 14, 2015** - -Changelog --------- - -Bug fixes -......... - -- Allow input data larger than ``block_size`` in - :class:`covariance.LedoitWolf` by `Andreas Müller`_. - -- Fix a bug in :class:`isotonic.IsotonicRegression` deduplication that - caused unstable results in :class:`calibration.CalibratedClassifierCV` by - `Jan Hendrik Metzen`_. - -- Fix sorting of labels in :func:`preprocessing.label_binarize` by Michael Heilman. - -- Fix several stability and convergence issues in - :class:`cross_decomposition.CCA` and - :class:`cross_decomposition.PLSCanonical` by `Andreas Müller`_. - -- Fix a bug in :class:`cluster.KMeans` when ``precompute_distances=False`` - on fortran-ordered data. - -- Fix a speed regression in :class:`ensemble.RandomForestClassifier`'s ``predict`` - and ``predict_proba`` by `Andreas Müller`_. - -- Fix a regression where ``utils.shuffle`` converted lists and dataframes to arrays, by `Olivier Grisel`_. - -..
_changes_0_16: - -Version 0.16 -============ - -**March 26, 2015** - -Highlights ----------- - -- Speed improvements (notably in :class:`cluster.DBSCAN`), reduced memory - requirements, bug-fixes and better default settings. - -- Multinomial Logistic regression and a path algorithm in - :class:`linear_model.LogisticRegressionCV`. - -- Out-of-core learning of PCA via :class:`decomposition.IncrementalPCA`. - -- Probability calibration of classifiers using - :class:`calibration.CalibratedClassifierCV`. - -- :class:`cluster.Birch` clustering method for large-scale datasets. - -- Scalable approximate nearest neighbors search with locality-sensitive - hashing forests in :class:`neighbors.LSHForest`. - -- Improved error messages and better validation when using malformed input data. - -- More robust integration with pandas dataframes. - -Changelog --------- - -New features -............ - -- The new :class:`neighbors.LSHForest` implements locality-sensitive hashing - for approximate nearest neighbors search. By :user:`Maheshakya Wijewardena`. - -- Added :class:`svm.LinearSVR`. This class uses the liblinear implementation - of Support Vector Regression which is much faster for large - sample sizes than :class:`svm.SVR` with a linear kernel. By - `Fabian Pedregosa`_ and Qiang Luo. - -- Incremental fit for :class:`GaussianNB `. - -- Added ``sample_weight`` support to :class:`dummy.DummyClassifier` and - :class:`dummy.DummyRegressor`. By `Arnaud Joly`_. - -- Added the :func:`metrics.label_ranking_average_precision_score` metric. - By `Arnaud Joly`_. - -- Add the :func:`metrics.coverage_error` metric. By `Arnaud Joly`_. - -- Added :class:`linear_model.LogisticRegressionCV`. By - `Manoj Kumar`_, `Fabian Pedregosa`_, `Gael Varoquaux`_ - and `Alexandre Gramfort`_. - -- Added ``warm_start`` constructor parameter to make it possible for any - trained forest model to grow additional trees incrementally. By - :user:`Laurent Direr`. - -- Added ``sample_weight`` support to :class:`ensemble.GradientBoostingClassifier` and - :class:`ensemble.GradientBoostingRegressor`. By `Peter Prettenhofer`_. - -- Added :class:`decomposition.IncrementalPCA`, an implementation of the PCA - algorithm that supports out-of-core learning with a ``partial_fit`` - method. By `Kyle Kastner`_. - -- Averaged SGD for :class:`SGDClassifier ` - and :class:`SGDRegressor `. By - :user:`Danny Sullivan `. - -- Added :func:`cross_val_predict ` - function which computes cross-validated estimates. By `Luis Pedro Coelho`_. - -- Added :class:`linear_model.TheilSenRegressor`, a robust - generalized-median-based estimator. By :user:`Florian Wilhelm `. - -- Added :func:`metrics.median_absolute_error`, a robust metric. - By `Gael Varoquaux`_ and :user:`Florian Wilhelm `. - -- Add :class:`cluster.Birch`, an online clustering algorithm. By - `Manoj Kumar`_, `Alexandre Gramfort`_ and `Joel Nothman`_. - -- Added shrinkage support to :class:`discriminant_analysis.LinearDiscriminantAnalysis` - using two new solvers. By :user:`Clemens Brunner ` and `Martin Billinger`_. - -- Added :class:`kernel_ridge.KernelRidge`, an implementation of - kernelized ridge regression. - By `Mathieu Blondel`_ and `Jan Hendrik Metzen`_. - -- All solvers in :class:`linear_model.Ridge` now support `sample_weight`. - By `Mathieu Blondel`_. - -- Added :class:`cross_validation.PredefinedSplit` cross-validation - for fixed user-provided cross-validation folds. - By :user:`Thomas Unterthiner `.
- -- Added :class:`calibration.CalibratedClassifierCV`, an approach for - calibrating the predicted probabilities of a classifier. - By `Alexandre Gramfort`_, `Jan Hendrik Metzen`_, `Mathieu Blondel`_ - and :user:`Balazs Kegl `. - - -Enhancements -............ - -- Add option ``return_distance`` in :func:`hierarchical.ward_tree` - to return distances between nodes for both structured and unstructured - versions of the algorithm. By `Matteo Visconti di Oleggio Castello`_. - The same option was added in :func:`hierarchical.linkage_tree`. - By `Manoj Kumar`_. - -- Add support for sample weights in scorer objects. Metrics with sample - weight support will automatically benefit from it. By `Noel Dawe`_ and - `Vlad Niculae`_. - -- Added ``newton-cg`` and ``lbfgs`` solver support in - :class:`linear_model.LogisticRegression`. By `Manoj Kumar`_. - -- Add ``selection="random"`` parameter to implement stochastic coordinate - descent for :class:`linear_model.Lasso`, :class:`linear_model.ElasticNet` - and related. By `Manoj Kumar`_. - -- Add ``sample_weight`` parameter to - :func:`metrics.jaccard_similarity_score` and :func:`metrics.log_loss`. - By :user:`Jatin Shah `. - -- Support sparse multilabel indicator representation in - :class:`preprocessing.LabelBinarizer` and - :class:`multiclass.OneVsRestClassifier` (by :user:`Hamzeh Alsalhi ` with thanks - to Rohit Sivaprasad), as well as evaluation metrics (by - `Joel Nothman`_). - -- Add support for multiclass in `metrics.hinge_loss`. Added ``labels=None`` - as optional parameter. By `Saurabh Jha`. - -- Add ``sample_weight`` parameter to `metrics.hinge_loss`. - By `Saurabh Jha`. - -- Add ``multi_class="multinomial"`` option in - :class:`linear_model.LogisticRegression` to implement a Logistic - Regression solver that minimizes the cross-entropy or multinomial loss - instead of the default One-vs-Rest setting. Supports `lbfgs` and - `newton-cg` solvers. By `Lars Buitinck`_ and `Manoj Kumar`_. Solver option - `newton-cg` by Simon Wu. - -- ``DictVectorizer`` can now perform ``fit_transform`` on an iterable in a - single pass, when giving the option ``sort=False``. By :user:`Dan - Blanchard `. - -- :class:`GridSearchCV` and :class:`RandomizedSearchCV` can now be - configured to work with estimators that may fail and raise errors on - individual folds. This option is controlled by the `error_score` - parameter. This does not affect errors raised on re-fit. By - :user:`Michal Romaniuk `. - -- Add ``digits`` parameter to `metrics.classification_report` to allow - the report to show different precision of floating point numbers. By - :user:`Ian Gilmore `. - -- Add a quantile prediction strategy to the :class:`dummy.DummyRegressor`. - By :user:`Aaron Staple `. - -- Add ``handle_unknown`` option to :class:`preprocessing.OneHotEncoder` to - handle unknown categorical features more gracefully during transform. - By `Manoj Kumar`_. - -- Added support for sparse input data to decision trees and their ensembles. - By `Fares Hedyati`_ and `Arnaud Joly`_. - -- Optimized :class:`cluster.AffinityPropagation` by reducing the number of - memory allocations of large temporary data-structures. By `Antony Lee`_. - -- Parallelization of the computation of feature importances in random forests. - By `Olivier Grisel`_ and `Arnaud Joly`_. - -- Add ``n_iter_`` attribute to estimators that accept a ``max_iter`` attribute - in their constructor. By `Manoj Kumar`_.
- -- Added decision function for :class:`multiclass.OneVsOneClassifier`. - By `Raghav RV`_ and :user:`Kyle Beauchamp `. - -- :func:`neighbors.kneighbors_graph` and :func:`radius_neighbors_graph` - support non-Euclidean metrics. By `Manoj Kumar`_. - -- Parameter ``connectivity`` in :class:`cluster.AgglomerativeClustering` - and family now accepts callables that return a connectivity matrix. - By `Manoj Kumar`_. - -- Sparse support for :func:`paired_distances`. By `Joel Nothman`_. - -- :class:`cluster.DBSCAN` now supports sparse input and sample weights and - has been optimized: the inner loop has been rewritten in Cython and - radius neighbors queries are now computed in batch. By `Joel Nothman`_ - and `Lars Buitinck`_. - -- Add ``class_weight`` parameter to automatically weight samples by class - frequency for :class:`ensemble.RandomForestClassifier`, - :class:`tree.DecisionTreeClassifier`, :class:`ensemble.ExtraTreesClassifier` - and :class:`tree.ExtraTreeClassifier`. By `Trevor Stephens`_. - -- :class:`grid_search.RandomizedSearchCV` now does sampling without - replacement if all parameters are given as lists. By `Andreas Müller`_. - -- Parallelized calculation of :func:`pairwise_distances` is now supported - for scipy metrics and custom callables. By `Joel Nothman`_. - -- Allow the fitting and scoring of all clustering algorithms in - :class:`pipeline.Pipeline`. By `Andreas Müller`_. - -- More robust seeding and improved error messages in :class:`cluster.MeanShift` - by `Andreas Müller`_. - -- Make the stopping criterion for :class:`mixture.GMM`, - :class:`mixture.DPGMM` and :class:`mixture.VBGMM` less dependent on the - number of samples by thresholding the average log-likelihood change - instead of its sum over all samples. By `Hervé Bredin`_. - -- The outcome of :func:`manifold.spectral_embedding` was made deterministic - by flipping the sign of eigenvectors. By :user:`Hasil Sharma `. - -- Significant performance and memory usage improvements in - :class:`preprocessing.PolynomialFeatures`. By `Eric Martin`_. - -- Numerical stability improvements for :class:`preprocessing.StandardScaler` - and :func:`preprocessing.scale`. By `Nicolas Goix`_. - -- :class:`svm.SVC` fitted on sparse input now implements ``decision_function``. - By `Rob Zinkov`_ and `Andreas Müller`_. - -- :func:`cross_validation.train_test_split` now preserves the input type, - instead of converting to numpy arrays. - - -Documentation improvements -.......................... - -- Added example of using :class:`FeatureUnion` for heterogeneous input. - By :user:`Matt Terry `. - -- Documentation on scorers was improved, to highlight the handling of loss - functions. By :user:`Matt Pico `. - -- A discrepancy between liblinear output and scikit-learn's wrappers - is now noted. By `Manoj Kumar`_. - -- Improved documentation generation: examples referring to a class or - function are now shown in a gallery on the class/function's API reference - page. By `Joel Nothman`_. - -- More explicit documentation of sample generators and of data - transformation. By `Joel Nothman`_. - -- :class:`sklearn.neighbors.BallTree` and :class:`sklearn.neighbors.KDTree` - used to point to empty pages stating that they are aliases of BinaryTree. - This has been fixed to show the correct class docs. By `Manoj Kumar`_. - -- Added silhouette plots for analysis of KMeans clustering using - :func:`metrics.silhouette_samples` and :func:`metrics.silhouette_score`.
- See :ref:`sphx_glr_auto_examples_cluster_plot_kmeans_silhouette_analysis.py` - -Bug fixes -......... -- Metaestimators now support ducktyping for the presence of ``decision_function``, - ``predict_proba`` and other methods. This fixes behavior of - :class:`grid_search.GridSearchCV`, - :class:`grid_search.RandomizedSearchCV`, :class:`pipeline.Pipeline`, - :class:`feature_selection.RFE`, :class:`feature_selection.RFECV` when nested. - By `Joel Nothman`_. - -- The ``scoring`` attribute of grid-search and cross-validation methods is no longer - ignored when a :class:`grid_search.GridSearchCV` is given as a base estimator or - the base estimator doesn't have a ``predict`` method. - -- The function :func:`hierarchical.ward_tree` now returns the children in - the same order for both the structured and unstructured versions. By - `Matteo Visconti di Oleggio Castello`_. - -- :class:`feature_selection.RFECV` now correctly handles cases when - ``step`` is not equal to 1. By :user:`Nikolay Mayorov `. - -- The :class:`decomposition.PCA` now undoes whitening in its - ``inverse_transform``. Also, its ``components_`` now always have unit - length. By :user:`Michael Eickenberg `. - -- Fix incomplete download of the dataset when - :func:`datasets.download_20newsgroups` is called. By `Manoj Kumar`_. - -- Various fixes to the Gaussian processes subpackage by Vincent Dubourg - and Jan Hendrik Metzen. - -- Calling ``partial_fit`` with ``class_weight='auto'`` throws an - appropriate error message and suggests a workaround. - By :user:`Danny Sullivan `. - -- :class:`RBFSampler ` with ``gamma=g`` - formerly approximated :func:`rbf_kernel ` - with ``gamma=g/2.``; the definition of ``gamma`` is now consistent, - which may substantially change your results if you use a fixed value. - (If you cross-validated over ``gamma``, it probably doesn't matter - too much.) By :user:`Dougal Sutherland `. - -- Pipeline objects delegate the ``classes_`` attribute to the underlying - estimator. This allows, for instance, bagging of a pipeline object. - By `Arnaud Joly`_. - -- :class:`neighbors.NearestCentroid` now uses the median as the centroid - when metric is set to ``manhattan``. It was using the mean before. - By `Manoj Kumar`_. - -- Fix numerical stability issues in :class:`linear_model.SGDClassifier` - and :class:`linear_model.SGDRegressor` by clipping large gradients and - ensuring that weight decay rescaling is always positive (for large - l2 regularization and large learning rate values). - By `Olivier Grisel`_. - -- When `compute_full_tree` is set to "auto", the full tree is - built when n_clusters is high and is early stopped when n_clusters is - low, while the behavior should be vice-versa in - :class:`cluster.AgglomerativeClustering` (and friends). - This has been fixed by `Manoj Kumar`_. - -- Fix lazy centering of data in :func:`linear_model.enet_path` and - :func:`linear_model.lasso_path`. It was centered around one. It has - been changed to be centered around the origin. By `Manoj Kumar`_. - -- Fix handling of precomputed affinity matrices in - :class:`cluster.AgglomerativeClustering` when using connectivity - constraints. By :user:`Cathy Deng `. - -- Correct ``partial_fit`` handling of ``class_prior`` for - :class:`sklearn.naive_bayes.MultinomialNB` and - :class:`sklearn.naive_bayes.BernoulliNB`. By `Trevor Stephens`_. - -- Fixed a crash in :func:`metrics.precision_recall_fscore_support` - when using unsorted ``labels`` in the multi-label setting. - By `Andreas Müller`_.
- -- Avoid skipping the first nearest neighbor in the methods ``radius_neighbors``, - ``kneighbors``, ``kneighbors_graph`` and ``radius_neighbors_graph`` in - :class:`sklearn.neighbors.NearestNeighbors` and family, when the query - data is not the same as fit data. By `Manoj Kumar`_. - -- Fix log-density calculation in the :class:`mixture.GMM` with - tied covariance. By `Will Dawson`_. - -- Fixed a scaling error in :class:`feature_selection.SelectFdr` - where a factor ``n_features`` was missing. By `Andrew Tulloch`_. - -- Fix zero division in :class:`neighbors.KNeighborsRegressor` and related - classes when using distance weighting and having identical data points. - By `Garret-R `_. - -- Fixed round-off errors with non-positive-definite covariance matrices - in GMM. By :user:`Alexis Mignon `. - -- Fixed an error in the computation of conditional probabilities in - :class:`naive_bayes.BernoulliNB`. By `Hanna Wallach`_. - -- Make the method ``radius_neighbors`` of - :class:`neighbors.NearestNeighbors` return the samples lying on the - boundary for ``algorithm='brute'``. By `Yan Yi`_. - -- Flip sign of ``dual_coef_`` of :class:`svm.SVC` - to make it consistent with the documentation and - ``decision_function``. By Artem Sobolev. - -- Fixed handling of ties in :class:`isotonic.IsotonicRegression`. - We now use the weighted average of targets (secondary method). By - `Andreas Müller`_ and `Michael Bommarito `_. - -API changes summary -------------------- - -- :class:`GridSearchCV ` and - :func:`cross_val_score ` and other - meta-estimators don't convert pandas DataFrames into arrays any more, - allowing DataFrame specific operations in custom estimators. - -- :func:`multiclass.fit_ovr`, :func:`multiclass.predict_ovr`, - :func:`predict_proba_ovr`, - :func:`multiclass.fit_ovo`, :func:`multiclass.predict_ovo`, - :func:`multiclass.fit_ecoc` and :func:`multiclass.predict_ecoc` - are deprecated. Use the underlying estimators instead. - -- Nearest neighbors estimators used to take arbitrary keyword arguments - and pass these to their distance metric. This will no longer be supported - in scikit-learn 0.18; use the ``metric_params`` argument instead. - -- The `n_jobs` parameter of the fit method was moved to the constructor of the - LinearRegression class. - -- The ``predict_proba`` method of :class:`multiclass.OneVsRestClassifier` - now returns two probabilities per sample in the multiclass case; this - is consistent with other estimators and with the method's documentation, - but previous versions accidentally returned only the positive - probability. Fixed by Will Lamond and `Lars Buitinck`_. - -- Change default value of precompute in :class:`ElasticNet` and :class:`Lasso` - to False. Setting precompute to "auto" was found to be slower when - n_samples > n_features since the computation of the Gram matrix is - computationally expensive and outweighs the benefit of fitting the Gram - for just one alpha. - ``precompute="auto"`` is now deprecated and will be removed in 0.18. - By `Manoj Kumar`_. - -- Expose ``positive`` option in :func:`linear_model.enet_path` and - :func:`linear_model.lasso_path` which constrains coefficients to be - positive. By `Manoj Kumar`_. - -- Users should now supply an explicit ``average`` parameter to - :func:`sklearn.metrics.f1_score`, :func:`sklearn.metrics.fbeta_score`, - :func:`sklearn.metrics.recall_score` and - :func:`sklearn.metrics.precision_score` when performing multiclass - or multilabel (i.e. not binary) classification. By `Joel Nothman`_.
- -- `scoring` parameter for cross validation now accepts `'f1_micro'`, - `'f1_macro'` or `'f1_weighted'`. `'f1'` is now for binary classification - only. Similar changes apply to `'precision'` and `'recall'`. - By `Joel Nothman`_. - -- The ``fit_intercept``, ``normalize`` and ``return_models`` parameters in - :func:`linear_model.enet_path` and :func:`linear_model.lasso_path` have - been removed. They had been deprecated since 0.14. - -- From now onwards, all estimators will uniformly raise ``NotFittedError`` - (:class:`utils.validation.NotFittedError`), when any of the ``predict``-like - methods are called before the model is fit. By `Raghav RV`_. - -- Input data validation was refactored for more consistent input - validation. The ``check_arrays`` function was replaced by ``check_array`` - and ``check_X_y``. By `Andreas Müller`_. - -- Allow ``X=None`` in the methods ``radius_neighbors``, ``kneighbors``, - ``kneighbors_graph`` and ``radius_neighbors_graph`` in - :class:`sklearn.neighbors.NearestNeighbors` and family. If set to None, - then for every sample this avoids setting the sample itself as the - first nearest neighbor. By `Manoj Kumar`_. - -- Add parameter ``include_self`` in :func:`neighbors.kneighbors_graph` - and :func:`neighbors.radius_neighbors_graph` which has to be explicitly - set by the user. If set to True, then the sample itself is considered - as the first nearest neighbor. - -- `thresh` parameter is deprecated in favor of new `tol` parameter in - :class:`GMM`, :class:`DPGMM` and :class:`VBGMM`. See `Enhancements` - section for details. By `Hervé Bredin`_. - -- Estimators will treat input with dtype object as numeric when possible. - By `Andreas Müller`_. - -- Estimators now raise `ValueError` consistently when fitted on empty - data (less than 1 sample or less than 1 feature for 2D input). - By `Olivier Grisel`_. - - -- The ``shuffle`` option of :class:`linear_model.SGDClassifier`, - :class:`linear_model.SGDRegressor`, :class:`linear_model.Perceptron`, - :class:`linear_model.PassiveAggressiveClassifier` and - :class:`linear_model.PassiveAggressiveRegressor` now defaults to ``True``. - -- :class:`cluster.DBSCAN` now uses a deterministic initialization. The - `random_state` parameter is deprecated. By :user:`Erich Schubert `. - -Code Contributors ------------------ -A. Flaxman, Aaron Schumacher, Aaron Staple, abhishek thakur, Akshay, akshayah3, -Aldrian Obaja, Alexander Fabisch, Alexandre Gramfort, Alexis Mignon, Anders -Aagaard, Andreas Mueller, Andreas van Cranenburgh, Andrew Tulloch, Andrew -Walker, Antony Lee, Arnaud Joly, banilo, Barmaley.exe, Ben Davies, Benedikt -Koehler, bhsu, Boris Feld, Borja Ayerdi, Boyuan Deng, Brent Pedersen, Brian -Wignall, Brooke Osborn, Calvin Giles, Cathy Deng, Celeo, cgohlke, chebee7i, -Christian Stade-Schuldt, Christof Angermueller, Chyi-Kwei Yau, CJ Carey, -Clemens Brunner, Daiki Aminaka, Dan Blanchard, danfrankj, Danny Sullivan, David -Fletcher, Dmitrijs Milajevs, Dougal J.
Sutherland, Erich Schubert, Fabian -Pedregosa, Florian Wilhelm, floydsoft, Félix-Antoine Fortin, Gael Varoquaux, -Garrett-R, Gilles Louppe, gpassino, gwulfs, Hampus Bengtsson, Hamzeh Alsalhi, -Hanna Wallach, Harry Mavroforakis, Hasil Sharma, Helder, Herve Bredin, -Hsiang-Fu Yu, Hugues SALAMIN, Ian Gilmore, Ilambharathi Kanniah, Imran Haque, -isms, Jake VanderPlas, Jan Dlabal, Jan Hendrik Metzen, Jatin Shah, Javier López -Peña, jdcaballero, Jean Kossaifi, Jeff Hammerbacher, Joel Nothman, Jonathan -Helmus, Joseph, Kaicheng Zhang, Kevin Markham, Kyle Beauchamp, Kyle Kastner, -Lagacherie Matthieu, Lars Buitinck, Laurent Direr, leepei, Loic Esteve, Luis -Pedro Coelho, Lukas Michelbacher, maheshakya, Manoj Kumar, Manuel, Mario -Michael Krell, Martin, Martin Billinger, Martin Ku, Mateusz Susik, Mathieu -Blondel, Matt Pico, Matt Terry, Matteo Visconti dOC, Matti Lyra, Max Linke, -Mehdi Cherti, Michael Bommarito, Michael Eickenberg, Michal Romaniuk, MLG, -mr.Shu, Nelle Varoquaux, Nicola Montecchio, Nicolas, Nikolay Mayorov, Noel -Dawe, Okal Billy, Olivier Grisel, Óscar Nájera, Paolo Puggioni, Peter -Prettenhofer, Pratap Vardhan, pvnguyen, queqichao, Rafael Carrascosa, Raghav R -V, Rahiel Kasim, Randall Mason, Rob Zinkov, Robert Bradshaw, Saket Choudhary, -Sam Nicholls, Samuel Charron, Saurabh Jha, sethdandridge, sinhrks, snuderl, -Stefan Otte, Stefan van der Walt, Steve Tjoa, swu, Sylvain Zimmer, tejesh95, -terrycojones, Thomas Delteil, Thomas Unterthiner, Tomas Kazmar, trevorstephens, -tttthomasssss, Tzu-Ming Kuo, ugurcaliskan, ugurthemaster, Vinayak Mehta, -Vincent Dubourg, Vjacheslav Murashkin, Vlad Niculae, wadawson, Wei Xue, Will -Lamond, Wu Jiang, x0l, Xinfan Meng, Yan Yi, Yu-Chin - -.. _changes_0_15_2: - -Version 0.15.2 -============== - -**September 4, 2014** - -Bug fixes --------- - -- Fixed handling of the ``p`` parameter of the Minkowski distance that was - previously ignored in nearest neighbors models. By :user:`Nikolay - Mayorov `. - -- Fixed duplicated alphas in :class:`linear_model.LassoLars` with early - stopping on 32-bit Python. By `Olivier Grisel`_ and `Fabian Pedregosa`_. - -- Fixed the build under Windows when scikit-learn is built with MSVC while - NumPy is built with MinGW. By `Olivier Grisel`_ and :user:`Federico - Vaggi `. - -- Fixed an array index overflow bug in the coordinate descent solver. By - `Gael Varoquaux`_. - -- Better handling of numpy 1.9 deprecation warnings. By `Gael Varoquaux`_. - -- Removed unnecessary data copy in :class:`cluster.KMeans`. - By `Gael Varoquaux`_. - -- Explicitly close open files to avoid ``ResourceWarnings`` under Python 3. - By Calvin Giles. - -- The ``transform`` of :class:`discriminant_analysis.LinearDiscriminantAnalysis` - now projects the input on the most discriminant directions. By Martin Billinger. - -- Fixed potential overflow in ``_tree.safe_realloc`` by `Lars Buitinck`_. - -- Performance optimization in :class:`isotonic.IsotonicRegression`. - By Robert Bradshaw. - -- ``nose`` is no longer a runtime dependency for importing ``sklearn``; it is - only needed for running the tests. By `Joel Nothman`_. - -- Many documentation and website fixes by `Joel Nothman`_, `Lars Buitinck`_, - :user:`Matt Pico `, and others. - -.. _changes_0_15_1: - -Version 0.15.1 -============== - -**August 1, 2014** - -Bug fixes --------- - -- Made :func:`cross_validation.cross_val_score` use - :class:`cross_validation.KFold` instead of - :class:`cross_validation.StratifiedKFold` on multi-output classification - problems. By :user:`Nikolay Mayorov `.
- -- Support unseen labels in :class:`preprocessing.LabelBinarizer` to restore - the default behavior of 0.14.1 for backward compatibility. By - :user:`Hamzeh Alsalhi `. - -- Fixed the :class:`cluster.KMeans` stopping criterion that prevented early - convergence detection. By Edward Raff and `Gael Varoquaux`_. - -- Fixed the behavior of :class:`multiclass.OneVsOneClassifier` - in case of ties at the per-class vote level by computing the correct - per-class sum of prediction scores. By `Andreas Müller`_. - -- Made :func:`cross_validation.cross_val_score` and - :class:`grid_search.GridSearchCV` accept Python lists as input data. - This is especially useful for cross-validation and model selection of - text processing pipelines. By `Andreas Müller`_. - -- Fixed data input checks of most estimators to accept input data that - implements the NumPy ``__array__`` protocol. This is the case - for ``pandas.Series`` and ``pandas.DataFrame`` in recent versions of - pandas. By `Gael Varoquaux`_. - -- Fixed a regression for :class:`linear_model.SGDClassifier` with - ``class_weight="auto"`` on data with non-contiguous labels. By - `Olivier Grisel`_. - - -.. _changes_0_15: - -Version 0.15 -============ - -**July 15, 2014** - -Highlights ----------- - -- Many speed and memory improvements all across the code. - -- Huge speed and memory improvements to random forests (and extra - trees) that also benefit better from parallel computing. - -- Incremental fit to :class:`BernoulliRBM `. - -- Added :class:`cluster.AgglomerativeClustering` for hierarchical - agglomerative clustering with average linkage, complete linkage and - ward strategies. - -- Added :class:`linear_model.RANSACRegressor` for robust regression - models. - -- Added dimensionality reduction with :class:`manifold.TSNE` which can be - used to visualize high-dimensional data. - - -Changelog --------- - -New features -............ - -- Added :class:`ensemble.BaggingClassifier` and - :class:`ensemble.BaggingRegressor` meta-estimators for ensembling - any kind of base estimator. See the :ref:`Bagging ` section of - the user guide for details and examples. By `Gilles Louppe`_. - -- New unsupervised feature selection algorithm - :class:`feature_selection.VarianceThreshold`, by `Lars Buitinck`_. - -- Added :class:`linear_model.RANSACRegressor` meta-estimator for the robust - fitting of regression models. By :user:`Johannes Schönberger `. - -- Added :class:`cluster.AgglomerativeClustering` for hierarchical - agglomerative clustering with average linkage, complete linkage and - ward strategies, by `Nelle Varoquaux`_ and `Gael Varoquaux`_. - -- Shorthand constructors :func:`pipeline.make_pipeline` and - :func:`pipeline.make_union` were added by `Lars Buitinck`_. - -- Shuffle option for :class:`cross_validation.StratifiedKFold`. - By :user:`Jeffrey Blackburne `. - -- Incremental learning (``partial_fit``) for Gaussian Naive Bayes by - Imran Haque. - -- Added ``partial_fit`` to :class:`BernoulliRBM - `. - By :user:`Danny Sullivan `. - -- Added :func:`learning_curve ` utility to - chart performance with respect to training size. See - :ref:`sphx_glr_auto_examples_model_selection_plot_learning_curve.py`. By Alexander Fabisch. - -- Add positive option in :class:`LassoCV ` and - :class:`ElasticNetCV `. - By Brian Wignall and `Alexandre Gramfort`_. - -- Added :class:`linear_model.MultiTaskElasticNetCV` and - :class:`linear_model.MultiTaskLassoCV`. By `Manoj Kumar`_. - -- Added :class:`manifold.TSNE`. By Alexander Fabisch. - -Enhancements -............
- -- Add sparse input support to :class:`ensemble.AdaBoostClassifier` and - :class:`ensemble.AdaBoostRegressor` meta-estimators. - By :user:`Hamzeh Alsalhi `. - -- Memory improvements of decision trees, by `Arnaud Joly`_. - -- Decision trees can now be built in a best-first manner by using ``max_leaf_nodes`` - as the stopping criterion. Refactored the tree code to use either a - stack or a priority queue for tree building. - By `Peter Prettenhofer`_ and `Gilles Louppe`_. - -- Decision trees can now be fitted on fortran- and c-style arrays, and - non-continuous arrays without the need to make a copy. - If the input array has a different dtype than ``np.float32``, a fortran-style - copy will be made since fortran-style memory layout has speed - advantages. By `Peter Prettenhofer`_ and `Gilles Louppe`_. - -- Speed improvement of regression trees by optimizing the computation - of the mean square error criterion. This led - to speed improvements of the tree, forest and gradient boosting tree - modules. By `Arnaud Joly`_. - -- The ``img_to_graph`` and ``grid_to_graph`` functions in - :mod:`sklearn.feature_extraction.image` now return ``np.ndarray`` - instead of ``np.matrix`` when ``return_as=np.ndarray``. See the - Notes section for more information on compatibility. - -- Changed the internal storage of decision trees to use a struct array. - This fixed some small bugs, while improving code and providing a small - speed gain. By `Joel Nothman`_. - -- Reduce memory usage and overhead when fitting and predicting with forests - of randomized trees in parallel with ``n_jobs != 1`` by leveraging the new - threading backend of joblib 0.8 and releasing the GIL in the tree fitting - Cython code. By `Olivier Grisel`_ and `Gilles Louppe`_. - -- Speed improvement of the :mod:`sklearn.ensemble.gradient_boosting` module. - By `Gilles Louppe`_ and `Peter Prettenhofer`_. - -- Various enhancements to the :mod:`sklearn.ensemble.gradient_boosting` - module: a ``warm_start`` argument to fit additional trees, - a ``max_leaf_nodes`` argument to fit GBM style trees, - a ``monitor`` fit argument to inspect the estimator during training, and - refactoring of the verbose code. By `Peter Prettenhofer`_. - -- Faster :class:`sklearn.ensemble.ExtraTrees` by caching feature values. - By `Arnaud Joly`_. - -- Faster depth-based tree building algorithms such as decision tree, - random forest, extra trees or gradient tree boosting (with depth based - growing strategy) by avoiding trying to split on found constant features - in the sample subset. By `Arnaud Joly`_. - -- Add ``min_weight_fraction_leaf`` pre-pruning parameter to tree-based - methods: the minimum weighted fraction of the input samples required to be - at a leaf node. By `Noel Dawe`_. - -- Added :func:`metrics.pairwise_distances_argmin_min`, by Philippe Gervais. - -- Added predict method to :class:`cluster.AffinityPropagation` and - :class:`cluster.MeanShift`, by `Mathieu Blondel`_. - -- Vector and matrix multiplications have been optimised throughout the - library by `Denis Engemann`_, and `Alexandre Gramfort`_. - In particular, they should take less memory with older NumPy versions - (prior to 1.7.2). - -- Precision-recall and ROC examples now use train_test_split, and have more - explanation of why these metrics are useful. By `Kyle Kastner`_. - -- The training algorithm for :class:`decomposition.NMF` is faster for - sparse matrices and has much lower memory complexity, meaning it will - scale up gracefully to large datasets. By `Lars Buitinck`_.
- -- Added the ``svd_method`` option, with default value ``"randomized"``, to - :class:`decomposition.FactorAnalysis` to save memory and - significantly speed up computation, by `Denis Engemann`_, and - `Alexandre Gramfort`_. - -- Changed :class:`cross_validation.StratifiedKFold` to try to - preserve as much of the original ordering of samples as possible so as - not to hide overfitting on datasets with a non-negligible level of - sample dependency. - By `Daniel Nouri`_ and `Olivier Grisel`_. - -- Add multi-output support to :class:`gaussian_process.GaussianProcess` - by John Novak. - -- Support for precomputed distance matrices in nearest neighbor estimators - by `Robert Layton`_ and `Joel Nothman`_. - -- Norm computations optimized for NumPy 1.6 and later versions by - `Lars Buitinck`_. In particular, the k-means algorithm no longer - needs a temporary data structure the size of its input. - -- :class:`dummy.DummyClassifier` can now be used to predict a constant - output value. By `Manoj Kumar`_. - -- :class:`dummy.DummyRegressor` now has a ``strategy`` parameter which allows - predicting the mean, the median of the training set, or a constant - output value. By :user:`Maheshakya Wijewardena `. - -- Multi-label classification output in multilabel indicator format - is now supported by :func:`metrics.roc_auc_score` and - :func:`metrics.average_precision_score` by `Arnaud Joly`_. - -- Significant performance improvements (more than 100x speedup for - large problems) in :class:`isotonic.IsotonicRegression` by - `Andrew Tulloch`_. - -- Speed and memory usage improvements to the SGD algorithm for linear - models: it now uses threads, not separate processes, when ``n_jobs>1``. - By `Lars Buitinck`_. - -- Grid search and cross validation allow NaNs in the input arrays so that - preprocessors such as :class:`preprocessing.Imputer - ` can be trained within the cross validation loop, - avoiding potentially skewed results. - -- Ridge regression can now deal with sample weights in feature space - (previously only in sample space). By :user:`Michael Eickenberg `. - Both solutions are provided by the Cholesky solver. - -- Several classification and regression metrics now support weighted - samples with the new ``sample_weight`` argument: - :func:`metrics.accuracy_score`, - :func:`metrics.zero_one_loss`, - :func:`metrics.precision_score`, - :func:`metrics.average_precision_score`, - :func:`metrics.f1_score`, - :func:`metrics.fbeta_score`, - :func:`metrics.recall_score`, - :func:`metrics.roc_auc_score`, - :func:`metrics.explained_variance_score`, - :func:`metrics.mean_squared_error`, - :func:`metrics.mean_absolute_error`, - :func:`metrics.r2_score`. - By `Noel Dawe`_. (See the illustrative sketch just before the - bug-fixes list below.) - -- Speed up of the sample generator - :func:`datasets.make_multilabel_classification`. By `Joel Nothman`_. - -Documentation improvements -........................... - -- The :ref:`Working With Text Data ` tutorial - has now been worked into the main documentation's tutorial section. - Includes exercises and skeletons for tutorial presentation. - Original tutorial created by several authors including - `Olivier Grisel`_, Lars Buitinck and many others. - Tutorial integration into the scikit-learn documentation - by `Jaques Grobler`_. - -- Added :ref:`Computational Performance ` - documentation. Discussion and examples of prediction latency / throughput - and different factors that influence speed. Additional tips for - building faster models and choosing a relevant compromise between speed - and predictive power. - By :user:`Eustache Diemert `.
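- -The following is an editor's sketch (not part of the original changelog
- entries) of the new ``sample_weight`` argument of the metrics listed in the
- enhancements above; the labels and weights below are made up purely for
- illustration::
-
-    import numpy as np
-    from sklearn.metrics import accuracy_score
-
-    y_true = np.array([0, 1, 1, 0])
-    y_pred = np.array([0, 1, 0, 0])
-    weights = np.array([1.0, 1.0, 3.0, 1.0])  # up-weight the misclassified sample
-
-    accuracy_score(y_true, y_pred)                         # 0.75 (unweighted)
-    accuracy_score(y_true, y_pred, sample_weight=weights)  # 0.5  (weighted)
-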
- -Bug fixes -......... - -- Fixed bug in :class:`decomposition.MiniBatchDictionaryLearning`: - ``partial_fit`` was not working properly. - -- Fixed bug in :class:`linear_model.stochastic_gradient`: - ``l1_ratio`` was used as ``(1.0 - l1_ratio)``. - -- Fixed bug in :class:`multiclass.OneVsOneClassifier` with string - labels. - -- Fixed a bug in :class:`LassoCV ` and - :class:`ElasticNetCV `: they would not - pre-compute the Gram matrix with ``precompute=True`` or - ``precompute="auto"`` and ``n_samples > n_features``. By `Manoj Kumar`_. - -- Fixed incorrect estimation of the degrees of freedom in - :func:`feature_selection.f_regression` when variates are not centered. - By :user:`Virgile Fritsch `. - -- Fixed a race condition in parallel processing with - ``pre_dispatch != "all"`` (for instance, in ``cross_val_score``). - By `Olivier Grisel`_. - -- Raise error in :class:`cluster.FeatureAgglomeration` and - :class:`cluster.WardAgglomeration` when no samples are given, - rather than returning a meaningless clustering. - -- Fixed bug in :class:`gradient_boosting.GradientBoostingRegressor` with - ``loss='huber'``: ``gamma`` might not have been initialized. - -- Fixed feature importances as computed with a forest of randomized trees - when fit with ``sample_weight != None`` and/or with ``bootstrap=True``. - By `Gilles Louppe`_. - -API changes summary -------------------- - -- :mod:`sklearn.hmm` is deprecated. Its removal is planned - for the 0.17 release. - -- Use of :class:`covariance.EllipticEnvelop` has now been removed after - deprecation. - Please use :class:`covariance.EllipticEnvelope` instead. - -- :class:`cluster.Ward` is deprecated. Use - :class:`cluster.AgglomerativeClustering` instead. - -- :class:`cluster.WardClustering` is deprecated. Use - :class:`cluster.AgglomerativeClustering` instead. - -- :class:`cross_validation.Bootstrap` is deprecated. - :class:`cross_validation.KFold` or - :class:`cross_validation.ShuffleSplit` are recommended instead. - -- Direct support for the sequence of sequences (or list of lists) multilabel - format is deprecated. To convert to and from the supported binary - indicator matrix format, use - :class:`MultiLabelBinarizer `. - By `Joel Nothman`_. - -- Add score method to :class:`PCA ` following the model of - probabilistic PCA and deprecate - :class:`ProbabilisticPCA ` model whose - score implementation is not correct. The computation now also exploits the - matrix inversion lemma for faster computation. By `Alexandre Gramfort`_. - -- The score method of :class:`FactorAnalysis ` - now returns the average log-likelihood of the samples. Use score_samples - to get the log-likelihood of each sample. By `Alexandre Gramfort`_. - -- Generating boolean masks (the setting ``indices=False``) - from cross-validation generators is deprecated. - Support for masks will be removed in 0.17. - The generators have produced arrays of indices by default since 0.10. - By `Joel Nothman`_. - -- 1-d arrays containing strings with ``dtype=object`` (as used in Pandas) - are now considered valid classification targets. This fixes a regression - from version 0.13 in some classifiers. By `Joel Nothman`_. - -- Fix wrong ``explained_variance_ratio_`` attribute in - :class:`RandomizedPCA `. - By `Alexandre Gramfort`_. - -- Fit alphas for each ``l1_ratio`` instead of ``mean_l1_ratio`` in - :class:`linear_model.ElasticNetCV` and :class:`linear_model.LassoCV`.
- This changes the shape of ``alphas_`` from ``(n_alphas,)`` to - ``(n_l1_ratio, n_alphas)`` if the ``l1_ratio`` provided is a 1-D array-like - object of length greater than one. - By `Manoj Kumar`_. - -- Fix :class:`linear_model.ElasticNetCV` and :class:`linear_model.LassoCV` - when fitting an intercept and the input data is sparse. The automatic grid - of alphas was not computed correctly and the scaling with normalize - was wrong. By `Manoj Kumar`_. - -- Fix wrong maximal number of features drawn (``max_features``) at each split - for decision trees, random forests and gradient tree boosting. - Previously, the count for the number of drawn features started only after - one non-constant feature was found in the split. This bug fix will affect - computational and generalization performance of those algorithms in the - presence of constant features. To get back previous generalization - performance, you should modify the value of ``max_features``. - By `Arnaud Joly`_. - -- Fix wrong maximal number of features drawn (``max_features``) at each split - for :class:`ensemble.ExtraTreesClassifier` and - :class:`ensemble.ExtraTreesRegressor`. Previously, only non-constant - features in the split were counted as drawn. Now constant features are - counted as drawn. Furthermore, at least one feature must be non-constant - in order to make a valid split. This bug fix will affect - computational and generalization performance of extra trees in the - presence of constant features. To get back previous generalization - performance, you should modify the value of ``max_features``. - By `Arnaud Joly`_. - -- Fix :func:`utils.compute_class_weight` when ``class_weight=="auto"``. - Previously it was broken for input of non-integer ``dtype`` and the - weighted array that was returned was wrong. By `Manoj Kumar`_. - -- Fix :class:`cross_validation.Bootstrap` to raise a ``ValueError`` - when ``n_train + n_test > n``. By :user:`Ronald Phlypo `. - - -People ------- - -List of contributors for release 0.15 by number of commits.
- -* 312 Olivier Grisel -* 275 Lars Buitinck -* 221 Gael Varoquaux -* 148 Arnaud Joly -* 134 Johannes Schönberger -* 119 Gilles Louppe -* 113 Joel Nothman -* 111 Alexandre Gramfort -* 95 Jaques Grobler -* 89 Denis Engemann -* 83 Peter Prettenhofer -* 83 Alexander Fabisch -* 62 Mathieu Blondel -* 60 Eustache Diemert -* 60 Nelle Varoquaux -* 49 Michael Bommarito -* 45 Manoj-Kumar-S -* 28 Kyle Kastner -* 26 Andreas Mueller -* 22 Noel Dawe -* 21 Maheshakya Wijewardena -* 21 Brooke Osborn -* 21 Hamzeh Alsalhi -* 21 Jake VanderPlas -* 21 Philippe Gervais -* 19 Bala Subrahmanyam Varanasi -* 12 Ronald Phlypo -* 10 Mikhail Korobov -* 8 Thomas Unterthiner -* 8 Jeffrey Blackburne -* 8 eltermann -* 8 bwignall -* 7 Ankit Agrawal -* 7 CJ Carey -* 6 Daniel Nouri -* 6 Chen Liu -* 6 Michael Eickenberg -* 6 ugurthemaster -* 5 Aaron Schumacher -* 5 Baptiste Lagarde -* 5 Rajat Khanduja -* 5 Robert McGibbon -* 5 Sergio Pascual -* 4 Alexis Metaireau -* 4 Ignacio Rossi -* 4 Virgile Fritsch -* 4 Sebastian Säger -* 4 Ilambharathi Kanniah -* 4 sdenton4 -* 4 Robert Layton -* 4 Alyssa -* 4 Amos Waterland -* 3 Andrew Tulloch -* 3 murad -* 3 Steven Maude -* 3 Karol Pysniak -* 3 Jacques Kvam -* 3 cgohlke -* 3 cjlin -* 3 Michael Becker -* 3 hamzeh -* 3 Eric Jacobsen -* 3 john collins -* 3 kaushik94 -* 3 Erwin Marsi -* 2 csytracy -* 2 LK -* 2 Vlad Niculae -* 2 Laurent Direr -* 2 Erik Shilts -* 2 Raul Garreta -* 2 Yoshiki Vázquez Baeza -* 2 Yung Siang Liau -* 2 abhishek thakur -* 2 James Yu -* 2 Rohit Sivaprasad -* 2 Roland Szabo -* 2 amormachine -* 2 Alexis Mignon -* 2 Oscar Carlsson -* 2 Nantas Nardelli -* 2 jess010 -* 2 kowalski87 -* 2 Andrew Clegg -* 2 Federico Vaggi -* 2 Simon Frid -* 2 Félix-Antoine Fortin -* 1 Ralf Gommers -* 1 t-aft -* 1 Ronan Amicel -* 1 Rupesh Kumar Srivastava -* 1 Ryan Wang -* 1 Samuel Charron -* 1 Samuel St-Jean -* 1 Fabian Pedregosa -* 1 Skipper Seabold -* 1 Stefan Walk -* 1 Stefan van der Walt -* 1 Stephan Hoyer -* 1 Allen Riddell -* 1 Valentin Haenel -* 1 Vijay Ramesh -* 1 Will Myers -* 1 Yaroslav Halchenko -* 1 Yoni Ben-Meshulam -* 1 Yury V. Zaytsev -* 1 adrinjalali -* 1 ai8rahim -* 1 alemagnani -* 1 alex -* 1 benjamin wilson -* 1 chalmerlowe -* 1 dzikie drożdże -* 1 jamestwebber -* 1 matrixorz -* 1 popo -* 1 samuela -* 1 François Boulogne -* 1 Alexander Measure -* 1 Ethan White -* 1 Guilherme Trein -* 1 Hendrik Heuer -* 1 IvicaJovic -* 1 Jan Hendrik Metzen -* 1 Jean Michel Rouly -* 1 Eduardo Ariño de la Rubia -* 1 Jelle Zijlstra -* 1 Eddy L O Jansson -* 1 Denis -* 1 John -* 1 John Schmidt -* 1 Jorge Cañardo Alastuey -* 1 Joseph Perla -* 1 Joshua Vredevoogd -* 1 José Ricardo -* 1 Julien Miotte -* 1 Kemal Eren -* 1 Kenta Sato -* 1 David Cournapeau -* 1 Kyle Kelley -* 1 Daniele Medri -* 1 Laurent Luce -* 1 Laurent Pierron -* 1 Luis Pedro Coelho -* 1 DanielWeitzenfeld -* 1 Craig Thompson -* 1 Chyi-Kwei Yau -* 1 Matthew Brett -* 1 Matthias Feurer -* 1 Max Linke -* 1 Chris Filo Gorgolewski -* 1 Charles Earl -* 1 Michael Hanke -* 1 Michele Orrù -* 1 Bryan Lunt -* 1 Brian Kearns -* 1 Paul Butler -* 1 Paweł Mandera -* 1 Peter -* 1 Andrew Ash -* 1 Pietro Zambelli -* 1 staubda - - -.. _changes_0_14: - -Version 0.14 -=============== - -**August 7, 2013** - -Changelog ---------- - -- Missing values with sparse and dense matrices can be imputed with the - transformer :class:`preprocessing.Imputer` by `Nicolas Trésegnie`_. - -- The core implementation of decisions trees has been rewritten from - scratch, allowing for faster tree induction and lower memory - consumption in all tree-based estimators. 
By `Gilles Louppe`_. - -- Added :class:`ensemble.AdaBoostClassifier` and - :class:`ensemble.AdaBoostRegressor`, by `Noel Dawe`_ and - `Gilles Louppe`_. See the :ref:`AdaBoost ` section of the user - guide for details and examples. - -- Added :class:`grid_search.RandomizedSearchCV` and - :class:`grid_search.ParameterSampler` for randomized hyperparameter - optimization. By `Andreas Müller`_. - -- Added :ref:`biclustering ` algorithms - (:class:`sklearn.cluster.bicluster.SpectralCoclustering` and - :class:`sklearn.cluster.bicluster.SpectralBiclustering`), data - generation methods (:func:`sklearn.datasets.make_biclusters` and - :func:`sklearn.datasets.make_checkerboard`), and scoring metrics - (:func:`sklearn.metrics.consensus_score`). By `Kemal Eren`_. - -- Added :ref:`Restricted Boltzmann Machines` - (:class:`neural_network.BernoulliRBM`). By `Yann Dauphin`_. - -- Python 3 support by :user:`Justin Vincent `, `Lars Buitinck`_, - :user:`Subhodeep Moitra ` and `Olivier Grisel`_. All tests now pass under - Python 3.3. - -- Ability to pass one penalty (alpha value) per target in - :class:`linear_model.Ridge`, by @eickenberg and `Mathieu Blondel`_. - -- Fixed :mod:`sklearn.linear_model.stochastic_gradient.py` L2 regularization - issue (minor practical significance). - By :user:`Norbert Crombach ` and `Mathieu Blondel`_ . - -- Added an interactive version of `Andreas Müller`_'s - `Machine Learning Cheat Sheet (for scikit-learn) - `_ - to the documentation. See :ref:`Choosing the right estimator `. - By `Jaques Grobler`_. - -- :class:`grid_search.GridSearchCV` and - :func:`cross_validation.cross_val_score` now support the use of advanced - scoring function such as area under the ROC curve and f-beta scores. - See :ref:`scoring_parameter` for details. By `Andreas Müller`_ - and `Lars Buitinck`_. - Passing a function from :mod:`sklearn.metrics` as ``score_func`` is - deprecated. - -- Multi-label classification output is now supported by - :func:`metrics.accuracy_score`, :func:`metrics.zero_one_loss`, - :func:`metrics.f1_score`, :func:`metrics.fbeta_score`, - :func:`metrics.classification_report`, - :func:`metrics.precision_score` and :func:`metrics.recall_score` - by `Arnaud Joly`_. - -- Two new metrics :func:`metrics.hamming_loss` and - :func:`metrics.jaccard_similarity_score` - are added with multi-label support by `Arnaud Joly`_. - -- Speed and memory usage improvements in - :class:`feature_extraction.text.CountVectorizer` and - :class:`feature_extraction.text.TfidfVectorizer`, - by Jochen Wersdörfer and Roman Sinayev. - -- The ``min_df`` parameter in - :class:`feature_extraction.text.CountVectorizer` and - :class:`feature_extraction.text.TfidfVectorizer`, which used to be 2, - has been reset to 1 to avoid unpleasant surprises (empty vocabularies) - for novice users who try it out on tiny document collections. - A value of at least 2 is still recommended for practical use. - -- :class:`svm.LinearSVC`, :class:`linear_model.SGDClassifier` and - :class:`linear_model.SGDRegressor` now have a ``sparsify`` method that - converts their ``coef_`` into a sparse matrix, meaning stored models - trained using these estimators can be made much more compact. - -- :class:`linear_model.SGDClassifier` now produces multiclass probability - estimates when trained under log loss or modified Huber loss. - -- Hyperlinks to documentation in example code on the website by - :user:`Martin Luessi `. 
- -- Fixed bug in :class:`preprocessing.MinMaxScaler` causing incorrect scaling - of the features for non-default ``feature_range`` settings. By `Andreas - Müller`_. - -- ``max_features`` in :class:`tree.DecisionTreeClassifier`, - :class:`tree.DecisionTreeRegressor` and all derived ensemble estimators - now supports percentage values. By `Gilles Louppe`_. - -- Performance improvements in :class:`isotonic.IsotonicRegression` by - `Nelle Varoquaux`_. - -- :func:`metrics.accuracy_score` has an option normalize to return - the fraction or the number of correctly classified sample - by `Arnaud Joly`_. - -- Added :func:`metrics.log_loss` that computes log loss, aka cross-entropy - loss. By Jochen Wersdörfer and `Lars Buitinck`_. - -- A bug that caused :class:`ensemble.AdaBoostClassifier`'s to output - incorrect probabilities has been fixed. - -- Feature selectors now share a mixin providing consistent ``transform``, - ``inverse_transform`` and ``get_support`` methods. By `Joel Nothman`_. - -- A fitted :class:`grid_search.GridSearchCV` or - :class:`grid_search.RandomizedSearchCV` can now generally be pickled. - By `Joel Nothman`_. - -- Refactored and vectorized implementation of :func:`metrics.roc_curve` - and :func:`metrics.precision_recall_curve`. By `Joel Nothman`_. - -- The new estimator :class:`sklearn.decomposition.TruncatedSVD` - performs dimensionality reduction using SVD on sparse matrices, - and can be used for latent semantic analysis (LSA). - By `Lars Buitinck`_. - -- Added self-contained example of out-of-core learning on text data - :ref:`sphx_glr_auto_examples_applications_plot_out_of_core_classification.py`. - By :user:`Eustache Diemert `. - -- The default number of components for - :class:`sklearn.decomposition.RandomizedPCA` is now correctly documented - to be ``n_features``. This was the default behavior, so programs using it - will continue to work as they did. - -- :class:`sklearn.cluster.KMeans` now fits several orders of magnitude - faster on sparse data (the speedup depends on the sparsity). By - `Lars Buitinck`_. - -- Reduce memory footprint of FastICA by `Denis Engemann`_ and - `Alexandre Gramfort`_. - -- Verbose output in :mod:`sklearn.ensemble.gradient_boosting` now uses - a column format and prints progress in decreasing frequency. - It also shows the remaining time. By `Peter Prettenhofer`_. - -- :mod:`sklearn.ensemble.gradient_boosting` provides out-of-bag improvement - :attr:`~sklearn.ensemble.GradientBoostingRegressor.oob_improvement_` - rather than the OOB score for model selection. An example that shows - how to use OOB estimates to select the number of trees was added. - By `Peter Prettenhofer`_. - -- Most metrics now support string labels for multiclass classification - by `Arnaud Joly`_ and `Lars Buitinck`_. - -- New OrthogonalMatchingPursuitCV class by `Alexandre Gramfort`_ - and `Vlad Niculae`_. - -- Fixed a bug in :class:`sklearn.covariance.GraphLassoCV`: the - 'alphas' parameter now works as expected when given a list of - values. By Philippe Gervais. - -- Fixed an important bug in :class:`sklearn.covariance.GraphLassoCV` - that prevented all folds provided by a CV object to be used (only - the first 3 were used). When providing a CV object, execution - time may thus increase significantly compared to the previous - version (bug results are correct now). By Philippe Gervais. - -- :class:`cross_validation.cross_val_score` and the :mod:`grid_search` - module is now tested with multi-output data by `Arnaud Joly`_. 
- -- :func:`datasets.make_multilabel_classification` can now return - the output in label indicator multilabel format by `Arnaud Joly`_. - -- K-nearest neighbors, :class:`neighbors.KNeighborsClassifier` and - :class:`neighbors.KNeighborsRegressor`, - and radius neighbors, :class:`neighbors.RadiusNeighborsClassifier` and - :class:`neighbors.RadiusNeighborsRegressor`, support multioutput data - by `Arnaud Joly`_. - -- Random state in LibSVM-based estimators (:class:`svm.SVC`, :class:`NuSVC`, - :class:`OneClassSVM`, :class:`svm.SVR`, :class:`svm.NuSVR`) can now be - controlled. This is useful to ensure consistency in the probability - estimates for the classifiers trained with ``probability=True``. By - `Vlad Niculae`_. - -- Out-of-core learning support for the discrete naive Bayes classifiers - :class:`sklearn.naive_bayes.MultinomialNB` and - :class:`sklearn.naive_bayes.BernoulliNB` by adding the ``partial_fit`` - method, by `Olivier Grisel`_. - -- New website design and navigation by `Gilles Louppe`_, `Nelle Varoquaux`_, - Vincent Michel and `Andreas Müller`_. - -- Improved documentation on :ref:`multi-class, multi-label and multi-output - classification <multiclass>` by `Yannick Schwartz`_ and `Arnaud Joly`_. - -- Better input and error handling in the :mod:`metrics` module by - `Arnaud Joly`_ and `Joel Nothman`_. - -- Speed optimization of the :mod:`hmm` module by :user:`Mikhail Korobov`. - -- Significant speed improvements for :class:`sklearn.cluster.DBSCAN` - by `cleverless`_. - - -API changes summary ------------------- - -- The :func:`auc_score` function was renamed to :func:`roc_auc_score`. - -- Testing scikit-learn with ``sklearn.test()`` is deprecated. Use - ``nosetests sklearn`` from the command line. - -- Feature importances in :class:`tree.DecisionTreeClassifier`, - :class:`tree.DecisionTreeRegressor` and all derived ensemble estimators - are now computed on the fly when accessing the ``feature_importances_`` - attribute. Setting ``compute_importances=True`` is no longer required. - By `Gilles Louppe`_. - -- :class:`linear_model.lasso_path` and - :class:`linear_model.enet_path` can return their results in the same - format as that of :class:`linear_model.lars_path`. This is done by - setting the ``return_models`` parameter to ``False``. By - `Jaques Grobler`_ and `Alexandre Gramfort`_. - -- :class:`grid_search.IterGrid` was renamed to - :class:`grid_search.ParameterGrid`. - -- Fixed bug in :class:`KFold` causing imperfect class balance in some - cases. By `Alexandre Gramfort`_ and Tadej Janež. - -- :class:`sklearn.neighbors.BallTree` has been refactored, and a - :class:`sklearn.neighbors.KDTree` has been - added which shares the same interface. The Ball Tree now works with - a wide variety of distance metrics. Both classes have many new - methods, including single-tree and dual-tree queries, breadth-first - and depth-first searching, and more advanced queries such as - kernel density estimation and 2-point correlation functions. - By `Jake Vanderplas`_. - -- Support for ``scipy.spatial.cKDTree`` within neighbors queries has been - removed, and the functionality replaced with the new :class:`KDTree` - class. - -- :class:`sklearn.neighbors.KernelDensity` has been added, which performs - efficient kernel density estimation with a variety of kernels. - -- :class:`sklearn.decomposition.KernelPCA` now always returns output with - ``n_components`` components, unless the new parameter ``remove_zero_eig`` - is set to ``True``.
This new behavior is consistent with the way - kernel PCA was always documented; previously, the removal of components - with zero eigenvalues was tacitly performed on all data. - -- ``gcv_mode="auto"`` no longer tries to perform SVD on a densified - sparse matrix in :class:`sklearn.linear_model.RidgeCV`. - -- Sparse matrix support in :class:`sklearn.decomposition.RandomizedPCA` - is now deprecated in favor of the new ``TruncatedSVD``. - -- :class:`cross_validation.KFold` and - :class:`cross_validation.StratifiedKFold` now enforce `n_folds >= 2` - otherwise a ``ValueError`` is raised. By `Olivier Grisel`_. - -- :func:`datasets.load_files`'s ``charset`` and ``charset_errors`` - parameters were renamed ``encoding`` and ``decode_errors``. - -- Attribute ``oob_score_`` in :class:`sklearn.ensemble.GradientBoostingRegressor` - and :class:`sklearn.ensemble.GradientBoostingClassifier` - is deprecated and has been replaced by ``oob_improvement_`` . - -- Attributes in OrthogonalMatchingPursuit have been deprecated - (copy_X, Gram, ...) and precompute_gram renamed precompute - for consistency. See #2224. - -- :class:`sklearn.preprocessing.StandardScaler` now converts integer input - to float, and raises a warning. Previously it rounded for dense integer - input. - -- :class:`sklearn.multiclass.OneVsRestClassifier` now has a - ``decision_function`` method. This will return the distance of each - sample from the decision boundary for each class, as long as the - underlying estimators implement the ``decision_function`` method. - By `Kyle Kastner`_. - -- Better input validation, warning on unexpected shapes for y. - -People ------- -List of contributors for release 0.14 by number of commits. - - * 277 Gilles Louppe - * 245 Lars Buitinck - * 187 Andreas Mueller - * 124 Arnaud Joly - * 112 Jaques Grobler - * 109 Gael Varoquaux - * 107 Olivier Grisel - * 102 Noel Dawe - * 99 Kemal Eren - * 79 Joel Nothman - * 75 Jake VanderPlas - * 73 Nelle Varoquaux - * 71 Vlad Niculae - * 65 Peter Prettenhofer - * 64 Alexandre Gramfort - * 54 Mathieu Blondel - * 38 Nicolas Trésegnie - * 35 eustache - * 27 Denis Engemann - * 25 Yann N. Dauphin - * 19 Justin Vincent - * 17 Robert Layton - * 15 Doug Coleman - * 14 Michael Eickenberg - * 13 Robert Marchman - * 11 Fabian Pedregosa - * 11 Philippe Gervais - * 10 Jim Holmström - * 10 Tadej Janež - * 10 syhw - * 9 Mikhail Korobov - * 9 Steven De Gryze - * 8 sergeyf - * 7 Ben Root - * 7 Hrishikesh Huilgolkar - * 6 Kyle Kastner - * 6 Martin Luessi - * 6 Rob Speer - * 5 Federico Vaggi - * 5 Raul Garreta - * 5 Rob Zinkov - * 4 Ken Geis - * 3 A. 
Flaxman - * 3 Denton Cockburn - * 3 Dougal Sutherland - * 3 Ian Ozsvald - * 3 Johannes Schönberger - * 3 Robert McGibbon - * 3 Roman Sinayev - * 3 Szabo Roland - * 2 Diego Molla - * 2 Imran Haque - * 2 Jochen Wersdörfer - * 2 Sergey Karayev - * 2 Yannick Schwartz - * 2 jamestwebber - * 1 Abhijeet Kolhe - * 1 Alexander Fabisch - * 1 Bastiaan van den Berg - * 1 Benjamin Peterson - * 1 Daniel Velkov - * 1 Fazlul Shahriar - * 1 Felix Brockherde - * 1 Félix-Antoine Fortin - * 1 Harikrishnan S - * 1 Jack Hale - * 1 JakeMick - * 1 James McDermott - * 1 John Benediktsson - * 1 John Zwinck - * 1 Joshua Vredevoogd - * 1 Justin Pati - * 1 Kevin Hughes - * 1 Kyle Kelley - * 1 Matthias Ekman - * 1 Miroslav Shubernetskiy - * 1 Naoki Orii - * 1 Norbert Crombach - * 1 Rafael Cunha de Almeida - * 1 Rolando Espinoza La fuente - * 1 Seamus Abshere - * 1 Sergey Feldman - * 1 Sergio Medina - * 1 Stefano Lattarini - * 1 Steve Koch - * 1 Sturla Molden - * 1 Thomas Jarosch - * 1 Yaroslav Halchenko - -.. _changes_0_13_1: - -Version 0.13.1 -============== - -**February 23, 2013** - -The 0.13.1 release only fixes some bugs and does not add any new functionality. - -Changelog ---------- - -- Fixed a testing error caused by the function :func:`cross_validation.train_test_split` being - interpreted as a test by `Yaroslav Halchenko`_. - -- Fixed a bug in the reassignment of small clusters in the :class:`cluster.MiniBatchKMeans` - by `Gael Varoquaux`_. - -- Fixed default value of ``gamma`` in :class:`decomposition.KernelPCA` by `Lars Buitinck`_. - -- Updated joblib to ``0.7.0d`` by `Gael Varoquaux`_. - -- Fixed scaling of the deviance in :class:`ensemble.GradientBoostingClassifier` by `Peter Prettenhofer`_. - -- Better tie-breaking in :class:`multiclass.OneVsOneClassifier` by `Andreas Müller`_. - -- Other small improvements to tests and documentation. - -People ------- -List of contributors for release 0.13.1 by number of commits. - * 16 `Lars Buitinck`_ - * 12 `Andreas Müller`_ - * 8 `Gael Varoquaux`_ - * 5 Robert Marchman - * 3 `Peter Prettenhofer`_ - * 2 Hrishikesh Huilgolkar - * 1 Bastiaan van den Berg - * 1 Diego Molla - * 1 `Gilles Louppe`_ - * 1 `Mathieu Blondel`_ - * 1 `Nelle Varoquaux`_ - * 1 Rafael Cunha de Almeida - * 1 Rolando Espinoza La fuente - * 1 `Vlad Niculae`_ - * 1 `Yaroslav Halchenko`_ - - -.. _changes_0_13: - -Version 0.13 -============ - -**January 21, 2013** - -New Estimator Classes ---------------------- - -- :class:`dummy.DummyClassifier` and :class:`dummy.DummyRegressor`, two - data-independent predictors by `Mathieu Blondel`_. Useful to sanity-check - your estimators. See :ref:`dummy_estimators` in the user guide. - Multioutput support added by `Arnaud Joly`_. - -- :class:`decomposition.FactorAnalysis`, a transformer implementing the - classical factor analysis, by `Christian Osendorfer`_ and `Alexandre - Gramfort`_. See :ref:`FA` in the user guide. - -- :class:`feature_extraction.FeatureHasher`, a transformer implementing the - "hashing trick" for fast, low-memory feature extraction from string fields - by `Lars Buitinck`_ and :class:`feature_extraction.text.HashingVectorizer` - for text documents by `Olivier Grisel`_ See :ref:`feature_hashing` and - :ref:`hashing_vectorizer` for the documentation and sample usage. - -- :class:`pipeline.FeatureUnion`, a transformer that concatenates - results of several other transformers by `Andreas Müller`_. See - :ref:`feature_union` in the user guide. 
- -- :class:`random_projection.GaussianRandomProjection`, - :class:`random_projection.SparseRandomProjection` and the function - :func:`random_projection.johnson_lindenstrauss_min_dim`. The first two are - transformers implementing Gaussian and sparse random projection matrix - by `Olivier Grisel`_ and `Arnaud Joly`_. - See :ref:`random_projection` in the user guide. - -- :class:`kernel_approximation.Nystroem`, a transformer for approximating - arbitrary kernels by `Andreas Müller`_. See - :ref:`nystroem_kernel_approx` in the user guide. - -- :class:`preprocessing.OneHotEncoder`, a transformer that computes binary - encodings of categorical features by `Andreas Müller`_. See - :ref:`preprocessing_categorical_features` in the user guide. - -- :class:`linear_model.PassiveAggressiveClassifier` and - :class:`linear_model.PassiveAggressiveRegressor`, predictors implementing - an efficient stochastic optimization for linear models by `Rob Zinkov`_ and - `Mathieu Blondel`_. See :ref:`passive_aggressive` in the user - guide. - -- :class:`ensemble.RandomTreesEmbedding`, a transformer for creating high-dimensional - sparse representations using ensembles of totally random trees by `Andreas Müller`_. - See :ref:`random_trees_embedding` in the user guide. - -- :class:`manifold.SpectralEmbedding` and function - :func:`manifold.spectral_embedding`, implementing the "laplacian - eigenmaps" transformation for non-linear dimensionality reduction by Wei - Li. See :ref:`spectral_embedding` in the user guide. - -- :class:`isotonic.IsotonicRegression` by `Fabian Pedregosa`_, `Alexandre Gramfort`_ - and `Nelle Varoquaux`_, - - -Changelog ---------- - -- :func:`metrics.zero_one_loss` (formerly ``metrics.zero_one``) now has - option for normalized output that reports the fraction of - misclassifications, rather than the raw number of misclassifications. By - Kyle Beauchamp. - -- :class:`tree.DecisionTreeClassifier` and all derived ensemble models now - support sample weighting, by `Noel Dawe`_ and `Gilles Louppe`_. - -- Speedup improvement when using bootstrap samples in forests of randomized - trees, by `Peter Prettenhofer`_ and `Gilles Louppe`_. - -- Partial dependence plots for :ref:`gradient_boosting` in - :func:`ensemble.partial_dependence.partial_dependence` by `Peter - Prettenhofer`_. See :ref:`sphx_glr_auto_examples_ensemble_plot_partial_dependence.py` for an - example. - -- The table of contents on the website has now been made expandable by - `Jaques Grobler`_. - -- :class:`feature_selection.SelectPercentile` now breaks ties - deterministically instead of returning all equally ranked features. - -- :class:`feature_selection.SelectKBest` and - :class:`feature_selection.SelectPercentile` are more numerically stable - since they use scores, rather than p-values, to rank results. This means - that they might sometimes select different features than they did - previously. - -- Ridge regression and ridge classification fitting with ``sparse_cg`` solver - no longer has quadratic memory complexity, by `Lars Buitinck`_ and - `Fabian Pedregosa`_. - -- Ridge regression and ridge classification now support a new fast solver - called ``lsqr``, by `Mathieu Blondel`_. - -- Speed up of :func:`metrics.precision_recall_curve` by Conrad Lee. - -- Added support for reading/writing svmlight files with pairwise - preference attribute (qid in svmlight file format) in - :func:`datasets.dump_svmlight_file` and - :func:`datasets.load_svmlight_file` by `Fabian Pedregosa`_. 
- -- Faster and more robust :func:`metrics.confusion_matrix` and - :ref:`clustering_evaluation` by Wei Li. - -- :func:`cross_validation.cross_val_score` now works with precomputed kernels - and affinity matrices, by `Andreas Müller`_. - -- LARS algorithm made more numerically stable with heuristics to drop - regressors that are too correlated, and to stop the path when - numerical noise becomes predominant, by `Gael Varoquaux`_. - -- Faster implementation of :func:`metrics.precision_recall_curve` by - Conrad Lee. - -- New kernel :class:`metrics.chi2_kernel` by `Andreas Müller`_, often used - in computer vision applications. - -- Longstanding bug in :class:`naive_bayes.BernoulliNB` fixed by - Shaun Jackman. - -- Implemented ``predict_proba`` in :class:`multiclass.OneVsRestClassifier`, - by Andrew Winterman. - -- Improve consistency in gradient boosting: the estimators - :class:`ensemble.GradientBoostingRegressor` and - :class:`ensemble.GradientBoostingClassifier` use the estimator - :class:`tree.DecisionTreeRegressor` instead of the - :class:`tree._tree.Tree` data structure, by `Arnaud Joly`_. - -- Fixed a floating point exception in the :ref:`decision trees <tree>` - module, by Seberg. - -- Fixed :func:`metrics.roc_curve` failing when ``y_true`` has only one class, - by Wei Li. - -- Add the :func:`metrics.mean_absolute_error` function, which computes the - mean absolute error. The :func:`metrics.mean_squared_error`, - :func:`metrics.mean_absolute_error` and - :func:`metrics.r2_score` metrics support multioutput, by `Arnaud Joly`_. - -- Fixed ``class_weight`` support in :class:`svm.LinearSVC` and - :class:`linear_model.LogisticRegression` by `Andreas Müller`_. The meaning - of ``class_weight`` was reversed, as erroneously a higher weight meant fewer - positives of a given class in earlier releases. - -- Improve narrative documentation and consistency in - :mod:`sklearn.metrics` for regression and classification metrics, - by `Arnaud Joly`_. - -- Fixed a bug in :class:`sklearn.svm.SVC` when using CSR matrices with - unsorted indices, by Xinfan Meng and `Andreas Müller`_. - -- :class:`MiniBatchKMeans`: Add random reassignment of cluster centers - with few observations attached to them, by `Gael Varoquaux`_. - - -API changes summary ------------------- -- Renamed all occurrences of ``n_atoms`` to ``n_components`` for consistency. - This applies to :class:`decomposition.DictionaryLearning`, - :class:`decomposition.MiniBatchDictionaryLearning`, - :func:`decomposition.dict_learning` and :func:`decomposition.dict_learning_online`. - -- Renamed all occurrences of ``max_iters`` to ``max_iter`` for consistency. - This applies to :class:`semi_supervised.LabelPropagation` and - :class:`semi_supervised.label_propagation.LabelSpreading`. - -- Renamed all occurrences of ``learn_rate`` to ``learning_rate`` for - consistency in :class:`ensemble.BaseGradientBoosting` and - :class:`ensemble.GradientBoostingRegressor`. - -- The module ``sklearn.linear_model.sparse`` is gone. Sparse matrix support - was already integrated into the "regular" linear models. - -- :func:`sklearn.metrics.mean_square_error`, which incorrectly returned the - accumulated error, was removed. Use ``mean_squared_error`` instead. - -- Passing ``class_weight`` parameters to ``fit`` methods is no longer - supported. Pass them to estimator constructors instead. - -- GMMs no longer have ``decode`` and ``rvs`` methods. Use the ``score``, - ``predict`` or ``sample`` methods instead.
- -- The ``solver`` fit option in Ridge regression and classification is now - deprecated and will be removed in v0.14. Use the constructor option - instead. - -- :class:`feature_extraction.text.DictVectorizer` now returns sparse - matrices in the CSR format, instead of COO. - -- Renamed ``k`` in :class:`cross_validation.KFold` and - :class:`cross_validation.StratifiedKFold` to ``n_folds``, renamed - ``n_bootstraps`` to ``n_iter`` in ``cross_validation.Bootstrap``. - -- Renamed all occurrences of ``n_iterations`` to ``n_iter`` for consistency. - This applies to :class:`cross_validation.ShuffleSplit`, - :class:`cross_validation.StratifiedShuffleSplit`, - :func:`utils.randomized_range_finder` and :func:`utils.randomized_svd`. - -- Replaced ``rho`` in :class:`linear_model.ElasticNet` and - :class:`linear_model.SGDClassifier` by ``l1_ratio``. The ``rho`` parameter - had different meanings; ``l1_ratio`` was introduced to avoid confusion. - It has the same meaning as previously ``rho`` in - :class:`linear_model.ElasticNet` and ``(1-rho)`` in - :class:`linear_model.SGDClassifier`. - -- :class:`linear_model.LassoLars` and :class:`linear_model.Lars` now - store a list of paths in the case of multiple targets, rather than - an array of paths. - -- The attribute ``gmm`` of :class:`hmm.GMMHMM` was renamed to ``gmm_`` - to adhere more strictly with the API. - -- :func:`cluster.spectral_embedding` was moved to - :func:`manifold.spectral_embedding`. - -- Renamed ``eig_tol`` in :func:`manifold.spectral_embedding`, - :class:`cluster.SpectralClustering` to ``eigen_tol``, renamed ``mode`` - to ``eigen_solver``. - -- Renamed ``mode`` in :func:`manifold.spectral_embedding` and - :class:`cluster.SpectralClustering` to ``eigen_solver``. - -- ``classes_`` and ``n_classes_`` attributes of - :class:`tree.DecisionTreeClassifier` and all derived ensemble models are - now flat in case of single output problems and nested in case of - multi-output problems. - -- The ``estimators_`` attribute of - :class:`ensemble.gradient_boosting.GradientBoostingRegressor` and - :class:`ensemble.gradient_boosting.GradientBoostingClassifier` is now an - array of :class:'tree.DecisionTreeRegressor'. - -- Renamed ``chunk_size`` to ``batch_size`` in - :class:`decomposition.MiniBatchDictionaryLearning` and - :class:`decomposition.MiniBatchSparsePCA` for consistency. - -- :class:`svm.SVC` and :class:`svm.NuSVC` now provide a ``classes_`` - attribute and support arbitrary dtypes for labels ``y``. - Also, the dtype returned by ``predict`` now reflects the dtype of - ``y`` during ``fit`` (used to be ``np.float``). - -- Changed default test_size in :func:`cross_validation.train_test_split` - to None, added possibility to infer ``test_size`` from ``train_size`` in - :class:`cross_validation.ShuffleSplit` and - :class:`cross_validation.StratifiedShuffleSplit`. - -- Renamed function :func:`sklearn.metrics.zero_one` to - :func:`sklearn.metrics.zero_one_loss`. Be aware that the default behavior - in :func:`sklearn.metrics.zero_one_loss` is different from - :func:`sklearn.metrics.zero_one`: ``normalize=False`` is changed to - ``normalize=True``. - -- Renamed function :func:`metrics.zero_one_score` to - :func:`metrics.accuracy_score`. - -- :func:`datasets.make_circles` now has the same number of inner and outer points. - -- In the Naive Bayes classifiers, the ``class_prior`` parameter was moved - from ``fit`` to ``__init__``. - -People ------- -List of contributors for release 0.13 by number of commits. 
- - * 364 `Andreas Müller`_ - * 143 `Arnaud Joly`_ - * 137 `Peter Prettenhofer`_ - * 131 `Gael Varoquaux`_ - * 117 `Mathieu Blondel`_ - * 108 `Lars Buitinck`_ - * 106 Wei Li - * 101 `Olivier Grisel`_ - * 65 `Vlad Niculae`_ - * 54 `Gilles Louppe`_ - * 40 `Jaques Grobler`_ - * 38 `Alexandre Gramfort`_ - * 30 `Rob Zinkov`_ - * 19 Aymeric Masurelle - * 18 Andrew Winterman - * 17 `Fabian Pedregosa`_ - * 17 Nelle Varoquaux - * 16 `Christian Osendorfer`_ - * 14 `Daniel Nouri`_ - * 13 :user:`Virgile Fritsch ` - * 13 syhw - * 12 `Satrajit Ghosh`_ - * 10 Corey Lynch - * 10 Kyle Beauchamp - * 9 Brian Cheung - * 9 Immanuel Bayer - * 9 mr.Shu - * 8 Conrad Lee - * 8 `James Bergstra`_ - * 7 Tadej Janež - * 6 Brian Cajes - * 6 `Jake Vanderplas`_ - * 6 Michael - * 6 Noel Dawe - * 6 Tiago Nunes - * 6 cow - * 5 Anze - * 5 Shiqiao Du - * 4 Christian Jauvin - * 4 Jacques Kvam - * 4 Richard T. Guy - * 4 `Robert Layton`_ - * 3 Alexandre Abraham - * 3 Doug Coleman - * 3 Scott Dickerson - * 2 ApproximateIdentity - * 2 John Benediktsson - * 2 Mark Veronda - * 2 Matti Lyra - * 2 Mikhail Korobov - * 2 Xinfan Meng - * 1 Alejandro Weinstein - * 1 `Alexandre Passos`_ - * 1 Christoph Deil - * 1 Eugene Nizhibitsky - * 1 Kenneth C. Arnold - * 1 Luis Pedro Coelho - * 1 Miroslav Batchkarov - * 1 Pavel - * 1 Sebastian Berg - * 1 Shaun Jackman - * 1 Subhodeep Moitra - * 1 bob - * 1 dengemann - * 1 emanuele - * 1 x006 - - -.. _changes_0_12.1: - -Version 0.12.1 -=============== - -**October 8, 2012** - -The 0.12.1 release is a bug-fix release with no additional features, but is -instead a set of bug fixes - -Changelog ----------- - -- Improved numerical stability in spectral embedding by `Gael - Varoquaux`_ - -- Doctest under windows 64bit by `Gael Varoquaux`_ - -- Documentation fixes for elastic net by `Andreas Müller`_ and - `Alexandre Gramfort`_ - -- Proper behavior with fortran-ordered NumPy arrays by `Gael Varoquaux`_ - -- Make GridSearchCV work with non-CSR sparse matrix by `Lars Buitinck`_ - -- Fix parallel computing in MDS by `Gael Varoquaux`_ - -- Fix Unicode support in count vectorizer by `Andreas Müller`_ - -- Fix MinCovDet breaking with X.shape = (3, 1) by :user:`Virgile Fritsch ` - -- Fix clone of SGD objects by `Peter Prettenhofer`_ - -- Stabilize GMM by :user:`Virgile Fritsch ` - -People ------- - - * 14 `Peter Prettenhofer`_ - * 12 `Gael Varoquaux`_ - * 10 `Andreas Müller`_ - * 5 `Lars Buitinck`_ - * 3 :user:`Virgile Fritsch ` - * 1 `Alexandre Gramfort`_ - * 1 `Gilles Louppe`_ - * 1 `Mathieu Blondel`_ - -.. _changes_0_12: - -Version 0.12 -============ - -**September 4, 2012** - -Changelog ---------- - -- Various speed improvements of the :ref:`decision trees ` module, by - `Gilles Louppe`_. - -- :class:`ensemble.GradientBoostingRegressor` and - :class:`ensemble.GradientBoostingClassifier` now support feature subsampling - via the ``max_features`` argument, by `Peter Prettenhofer`_. - -- Added Huber and Quantile loss functions to - :class:`ensemble.GradientBoostingRegressor`, by `Peter Prettenhofer`_. - -- :ref:`Decision trees ` and :ref:`forests of randomized trees ` - now support multi-output classification and regression problems, by - `Gilles Louppe`_. - -- Added :class:`preprocessing.LabelEncoder`, a simple utility class to - normalize labels or transform non-numerical labels, by `Mathieu Blondel`_. - -- Added the epsilon-insensitive loss and the ability to make probabilistic - predictions with the modified huber loss in :ref:`sgd`, by - `Mathieu Blondel`_. 
- -- Added :ref:`multidimensional_scaling`, by Nelle Varoquaux. - -- SVMlight file format loader now detects compressed (gzip/bzip2) files and - decompresses them on the fly, by `Lars Buitinck`_. - -- SVMlight file format serializer now preserves double precision floating - point values, by `Olivier Grisel`_. - -- A common testing framework for all estimators was added, by `Andreas Müller`_. - -- Understandable error messages for estimators that do not accept - sparse input by `Gael Varoquaux`_ - -- Speedups in hierarchical clustering by `Gael Varoquaux`_. In - particular building the tree now supports early stopping. This is - useful when the number of clusters is not small compared to the - number of samples. - -- Add MultiTaskLasso and MultiTaskElasticNet for joint feature selection, - by `Alexandre Gramfort`_. - -- Added :func:`metrics.auc_score` and - :func:`metrics.average_precision_score` convenience functions by `Andreas - Müller`_. - -- Improved sparse matrix support in the :ref:`feature_selection` - module by `Andreas Müller`_. - -- New word boundaries-aware character n-gram analyzer for the - :ref:`text_feature_extraction` module by :user:`@kernc `. - -- Fixed bug in spectral clustering that led to single point clusters - by `Andreas Müller`_. - -- In :class:`feature_extraction.text.CountVectorizer`, added an option to - ignore infrequent words, ``min_df`` by `Andreas Müller`_. - -- Add support for multiple targets in some linear models (ElasticNet, Lasso - and OrthogonalMatchingPursuit) by `Vlad Niculae`_ and - `Alexandre Gramfort`_. - -- Fixes in :class:`decomposition.ProbabilisticPCA` score function by Wei Li. - -- Fixed feature importance computation in - :ref:`gradient_boosting`. - -API changes summary -------------------- - -- The old ``scikits.learn`` package has disappeared; all code should import - from ``sklearn`` instead, which was introduced in 0.9. - -- In :func:`metrics.roc_curve`, the ``thresholds`` array is now returned - with it's order reversed, in order to keep it consistent with the order - of the returned ``fpr`` and ``tpr``. - -- In :class:`hmm` objects, like :class:`hmm.GaussianHMM`, - :class:`hmm.MultinomialHMM`, etc., all parameters must be passed to the - object when initialising it and not through ``fit``. Now ``fit`` will - only accept the data as an input parameter. - -- For all SVM classes, a faulty behavior of ``gamma`` was fixed. Previously, - the default gamma value was only computed the first time ``fit`` was called - and then stored. It is now recalculated on every call to ``fit``. - -- All ``Base`` classes are now abstract meta classes so that they can not be - instantiated. - -- :func:`cluster.ward_tree` now also returns the parent array. This is - necessary for early-stopping in which case the tree is not - completely built. - -- In :class:`feature_extraction.text.CountVectorizer` the parameters - ``min_n`` and ``max_n`` were joined to the parameter ``n_gram_range`` to - enable grid-searching both at once. - -- In :class:`feature_extraction.text.CountVectorizer`, words that appear - only in one document are now ignored by default. To reproduce - the previous behavior, set ``min_df=1``. - -- Fixed API inconsistency: :meth:`linear_model.SGDClassifier.predict_proba` now - returns 2d array when fit on two classes. 
- -- Fixed API inconsistency: :meth:`discriminant_analysis.QuadraticDiscriminantAnalysis.decision_function` - and :meth:`discriminant_analysis.LinearDiscriminantAnalysis.decision_function` now return 1d arrays - when fit on two classes. - -- Grid of alphas used for fitting :class:`linear_model.LassoCV` and - :class:`linear_model.ElasticNetCV` is now stored - in the attribute ``alphas_`` rather than overriding the init parameter - ``alphas``. - -- Linear models when alpha is estimated by cross-validation store - the estimated value in the ``alpha_`` attribute rather than just - ``alpha`` or ``best_alpha``. - -- :class:`ensemble.GradientBoostingClassifier` now supports - :meth:`ensemble.GradientBoostingClassifier.staged_predict_proba`, and - :meth:`ensemble.GradientBoostingClassifier.staged_predict`. - -- :class:`svm.sparse.SVC` and other sparse SVM classes are now deprecated. - The all classes in the :ref:`svm` module now automatically select the - sparse or dense representation base on the input. - -- All clustering algorithms now interpret the array ``X`` given to ``fit`` as - input data, in particular :class:`cluster.SpectralClustering` and - :class:`cluster.AffinityPropagation` which previously expected affinity matrices. - -- For clustering algorithms that take the desired number of clusters as a parameter, - this parameter is now called ``n_clusters``. - - -People ------- - * 267 `Andreas Müller`_ - * 94 `Gilles Louppe`_ - * 89 `Gael Varoquaux`_ - * 79 `Peter Prettenhofer`_ - * 60 `Mathieu Blondel`_ - * 57 `Alexandre Gramfort`_ - * 52 `Vlad Niculae`_ - * 45 `Lars Buitinck`_ - * 44 Nelle Varoquaux - * 37 `Jaques Grobler`_ - * 30 Alexis Mignon - * 30 Immanuel Bayer - * 27 `Olivier Grisel`_ - * 16 Subhodeep Moitra - * 13 Yannick Schwartz - * 12 :user:`@kernc ` - * 11 :user:`Virgile Fritsch ` - * 9 Daniel Duckworth - * 9 `Fabian Pedregosa`_ - * 9 `Robert Layton`_ - * 8 John Benediktsson - * 7 Marko Burjek - * 5 `Nicolas Pinto`_ - * 4 Alexandre Abraham - * 4 `Jake Vanderplas`_ - * 3 `Brian Holt`_ - * 3 `Edouard Duchesnay`_ - * 3 Florian Hoenig - * 3 flyingimmidev - * 2 Francois Savard - * 2 Hannes Schulz - * 2 Peter Welinder - * 2 `Yaroslav Halchenko`_ - * 2 Wei Li - * 1 Alex Companioni - * 1 Brandyn A. White - * 1 Bussonnier Matthias - * 1 Charles-Pierre Astolfi - * 1 Dan O'Huiginn - * 1 David Cournapeau - * 1 Keith Goodman - * 1 Ludwig Schwardt - * 1 Olivier Hervieu - * 1 Sergio Medina - * 1 Shiqiao Du - * 1 Tim Sheerman-Chase - * 1 buguen - - - -.. _changes_0_11: - -Version 0.11 -============ - -**May 7, 2012** - -Changelog ---------- - -Highlights -............. - -- Gradient boosted regression trees (:ref:`gradient_boosting`) - for classification and regression by `Peter Prettenhofer`_ - and `Scott White`_ . - -- Simple dict-based feature loader with support for categorical variables - (:class:`feature_extraction.DictVectorizer`) by `Lars Buitinck`_. - -- Added Matthews correlation coefficient (:func:`metrics.matthews_corrcoef`) - and added macro and micro average options to - :func:`metrics.precision_score`, :func:`metrics.recall_score` and - :func:`metrics.f1_score` by `Satrajit Ghosh`_. - -- :ref:`out_of_bag` of generalization error for :ref:`ensemble` - by `Andreas Müller`_. - -- Randomized sparse linear models for feature - selection, by `Alexandre Gramfort`_ and `Gael Varoquaux`_ - -- :ref:`label_propagation` for semi-supervised learning, by Clay - Woolam. **Note** the semi-supervised API is still work in progress, - and may change. 
- -- Added BIC/AIC model selection to classical :ref:`gmm` and unified - the API with the remainder of scikit-learn, by `Bertrand Thirion`_ - -- Added :class:`sklearn.cross_validation.StratifiedShuffleSplit`, which is - a :class:`sklearn.cross_validation.ShuffleSplit` with balanced splits, - by Yannick Schwartz. - -- :class:`sklearn.neighbors.NearestCentroid` classifier added, along with a - ``shrink_threshold`` parameter, which implements **shrunken centroid - classification**, by `Robert Layton`_. - -Other changes -.............. - -- Merged dense and sparse implementations of :ref:`sgd` module and - exposed utility extension types for sequential - datasets ``seq_dataset`` and weight vectors ``weight_vector`` - by `Peter Prettenhofer`_. - -- Added ``partial_fit`` (support for online/minibatch learning) and - warm_start to the :ref:`sgd` module by `Mathieu Blondel`_. - -- Dense and sparse implementations of :ref:`svm` classes and - :class:`linear_model.LogisticRegression` merged by `Lars Buitinck`_. - -- Regressors can now be used as base estimator in the :ref:`multiclass` - module by `Mathieu Blondel`_. - -- Added n_jobs option to :func:`metrics.pairwise.pairwise_distances` - and :func:`metrics.pairwise.pairwise_kernels` for parallel computation, - by `Mathieu Blondel`_. - -- :ref:`k_means` can now be run in parallel, using the ``n_jobs`` argument - to either :ref:`k_means` or :class:`KMeans`, by `Robert Layton`_. - -- Improved :ref:`cross_validation` and :ref:`grid_search` documentation - and introduced the new :func:`cross_validation.train_test_split` - helper function by `Olivier Grisel`_ - -- :class:`svm.SVC` members ``coef_`` and ``intercept_`` changed sign for - consistency with ``decision_function``; for ``kernel==linear``, - ``coef_`` was fixed in the one-vs-one case, by `Andreas Müller`_. - -- Performance improvements to efficient leave-one-out cross-validated - Ridge regression, esp. for the ``n_samples > n_features`` case, in - :class:`linear_model.RidgeCV`, by Reuben Fletcher-Costin. - -- Refactoring and simplification of the :ref:`text_feature_extraction` - API and fixed a bug that caused possible negative IDF, - by `Olivier Grisel`_. - -- Beam pruning option in :class:`_BaseHMM` module has been removed since it - is difficult to Cythonize. If you are interested in contributing a Cython - version, you can use the python version in the git history as a reference. - -- Classes in :ref:`neighbors` now support arbitrary Minkowski metric for - nearest neighbors searches. The metric can be specified by argument ``p``. - -API changes summary -------------------- - -- :class:`covariance.EllipticEnvelop` is now deprecated - Please use :class:`covariance.EllipticEnvelope` - instead. - -- ``NeighborsClassifier`` and ``NeighborsRegressor`` are gone in the module - :ref:`neighbors`. Use the classes :class:`KNeighborsClassifier`, - :class:`RadiusNeighborsClassifier`, :class:`KNeighborsRegressor` - and/or :class:`RadiusNeighborsRegressor` instead. - -- Sparse classes in the :ref:`sgd` module are now deprecated. - -- In :class:`mixture.GMM`, :class:`mixture.DPGMM` and :class:`mixture.VBGMM`, - parameters must be passed to an object when initialising it and not through - ``fit``. Now ``fit`` will only accept the data as an input parameter. - -- methods ``rvs`` and ``decode`` in :class:`GMM` module are now deprecated. - ``sample`` and ``score`` or ``predict`` should be used instead. - -- attribute ``_scores`` and ``_pvalues`` in univariate feature selection - objects are now deprecated. 
- ``scores_`` or ``pvalues_`` should be used instead. - -- In :class:`LogisticRegression`, :class:`LinearSVC`, :class:`SVC` and - :class:`NuSVC`, the ``class_weight`` parameter is now an initialization - parameter, not a parameter to fit. This makes grid searches - over this parameter possible. - -- LFW ``data`` is now always shape ``(n_samples, n_features)`` to be - consistent with the Olivetti faces dataset. Use the ``images`` and - ``pairs`` attributes to access the natural image shapes instead. - -- In :class:`svm.LinearSVC`, the meaning of the ``multi_class`` parameter - changed. Options now are ``'ovr'`` and ``'crammer_singer'``, with - ``'ovr'`` being the default. This does not change the default behavior - but hopefully is less confusing. - -- Class :class:`feature_extraction.text.Vectorizer` is deprecated and - replaced by :class:`feature_extraction.text.TfidfVectorizer`. - -- The preprocessor / analyzer nested structure for text feature - extraction has been removed. All those features are - now directly passed as flat constructor arguments - to :class:`feature_extraction.text.TfidfVectorizer` and - :class:`feature_extraction.text.CountVectorizer`, in particular the - following parameters are now used: - -- ``analyzer`` can be ``'word'`` or ``'char'`` to switch the default - analysis scheme, or use a specific Python callable (as previously). - -- ``tokenizer`` and ``preprocessor`` have been introduced to make it - still possible to customize those steps with the new API. - -- ``input`` explicitly controls how to interpret the sequence passed to - ``fit`` and ``predict``: filenames, file objects or direct (byte or - Unicode) strings. - -- charset decoding is explicit and strict by default. - -- the ``vocabulary``, fitted or not, is now stored in the - ``vocabulary_`` attribute to be consistent with the project - conventions. - -- Class :class:`feature_extraction.text.TfidfVectorizer` now derives directly - from :class:`feature_extraction.text.CountVectorizer` to make grid - search trivial. - -- Method ``rvs`` in the :class:`_BaseHMM` module is now deprecated. - ``sample`` should be used instead. - -- Beam pruning option in the :class:`_BaseHMM` module is removed since it is - difficult to Cythonize. If you are interested, you can find the original - implementation in the git history. - -- The SVMlight format loader now supports files with both zero-based and - one-based column indices, since both occur "in the wild". - -- Arguments in class :class:`ShuffleSplit` are now consistent with - :class:`StratifiedShuffleSplit`. Arguments ``test_fraction`` and - ``train_fraction`` are deprecated and renamed to ``test_size`` and - ``train_size`` and can accept both ``float`` and ``int``. - -- Arguments in class :class:`Bootstrap` are now consistent with - :class:`StratifiedShuffleSplit`. Arguments ``n_test`` and - ``n_train`` are deprecated and renamed to ``test_size`` and - ``train_size`` and can accept both ``float`` and ``int``. - -- Argument ``p`` added to classes in :ref:`neighbors` to specify an - arbitrary Minkowski metric for nearest neighbors searches (see the short - sketch after this list).
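For illustration, a minimal sketch of the ``p`` argument mentioned above (an editorial addition, not part of the original changelog; the data is synthetic, and ``p=1`` selects the Manhattan distance while ``p=2``, the default, is the Euclidean one)::

    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier

    X = np.array([[0.0, 0.0], [1.0, 1.0], [4.0, 4.0], [5.0, 5.0]])
    y = np.array([0, 0, 1, 1])

    # p selects the Minkowski metric used for the neighbor search:
    # p=1 -> Manhattan, p=2 -> Euclidean (the default).
    clf = KNeighborsClassifier(n_neighbors=3, p=1).fit(X, y)
    print(clf.predict([[1.5, 1.5]]))  # [0]: two of the three nearest points are class 0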
- - -People ------- - * 282 `Andreas Müller`_ - * 239 `Peter Prettenhofer`_ - * 198 `Gael Varoquaux`_ - * 129 `Olivier Grisel`_ - * 114 `Mathieu Blondel`_ - * 103 Clay Woolam - * 96 `Lars Buitinck`_ - * 88 `Jaques Grobler`_ - * 82 `Alexandre Gramfort`_ - * 50 `Bertrand Thirion`_ - * 42 `Robert Layton`_ - * 28 flyingimmidev - * 26 `Jake Vanderplas`_ - * 26 Shiqiao Du - * 21 `Satrajit Ghosh`_ - * 17 `David Marek`_ - * 17 `Gilles Louppe`_ - * 14 `Vlad Niculae`_ - * 11 Yannick Schwartz - * 10 `Fabian Pedregosa`_ - * 9 fcostin - * 7 Nick Wilson - * 5 Adrien Gaidon - * 5 `Nicolas Pinto`_ - * 4 `David Warde-Farley`_ - * 5 Nelle Varoquaux - * 5 Emmanuelle Gouillart - * 3 Joonas Sillanpää - * 3 Paolo Losi - * 2 Charles McCarthy - * 2 Roy Hyunjin Han - * 2 Scott White - * 2 ibayer - * 1 Brandyn White - * 1 Carlos Scheidegger - * 1 Claire Revillet - * 1 Conrad Lee - * 1 `Edouard Duchesnay`_ - * 1 Jan Hendrik Metzen - * 1 Meng Xinfan - * 1 `Rob Zinkov`_ - * 1 Shiqiao - * 1 Udi Weinsberg - * 1 Virgile Fritsch - * 1 Xinfan Meng - * 1 Yaroslav Halchenko - * 1 jansoe - * 1 Leon Palafox - - -.. _changes_0_10: - -Version 0.10 -============ - -**January 11, 2012** - -Changelog ---------- - -- Python 2.5 compatibility was dropped; the minimum Python version needed - to use scikit-learn is now 2.6. - -- :ref:`sparse_inverse_covariance` estimation using the graph Lasso, with - associated cross-validated estimator, by `Gael Varoquaux`_ - -- New :ref:`Tree ` module by `Brian Holt`_, `Peter Prettenhofer`_, - `Satrajit Ghosh`_ and `Gilles Louppe`_. The module comes with complete - documentation and examples. - -- Fixed a bug in the RFE module by `Gilles Louppe`_ (issue #378). - -- Fixed a memory leak in :ref:`svm` module by `Brian Holt`_ (issue #367). - -- Faster tests by `Fabian Pedregosa`_ and others. - -- Silhouette Coefficient cluster analysis evaluation metric added as - :func:`sklearn.metrics.silhouette_score` by Robert Layton. - -- Fixed a bug in :ref:`k_means` in the handling of the ``n_init`` parameter: - the clustering algorithm used to be run ``n_init`` times but the last - solution was retained instead of the best solution by `Olivier Grisel`_. - -- Minor refactoring in :ref:`sgd` module; consolidated dense and sparse - predict methods; Enhanced test time performance by converting model - parameters to fortran-style arrays after fitting (only multi-class). - -- Adjusted Mutual Information metric added as - :func:`sklearn.metrics.adjusted_mutual_info_score` by Robert Layton. - -- Models like SVC/SVR/LinearSVC/LogisticRegression from libsvm/liblinear - now support scaling of C regularization parameter by the number of - samples by `Alexandre Gramfort`_. - -- New :ref:`Ensemble Methods ` module by `Gilles Louppe`_ and - `Brian Holt`_. The module comes with the random forest algorithm and the - extra-trees method, along with documentation and examples. - -- :ref:`outlier_detection`: outlier and novelty detection, by - :user:`Virgile Fritsch `. - -- :ref:`kernel_approximation`: a transform implementing kernel - approximation for fast SGD on non-linear kernels by - `Andreas Müller`_. - -- Fixed a bug due to atom swapping in :ref:`OMP` by `Vlad Niculae`_. - -- :ref:`SparseCoder` by `Vlad Niculae`_. - -- :ref:`mini_batch_kmeans` performance improvements by `Olivier Grisel`_. - -- :ref:`k_means` support for sparse matrices by `Mathieu Blondel`_. - -- Improved documentation for developers and for the :mod:`sklearn.utils` - module, by `Jake Vanderplas`_. 
- -- Vectorized 20newsgroups dataset loader - (:func:`sklearn.datasets.fetch_20newsgroups_vectorized`) by - `Mathieu Blondel`_. - -- :ref:`multiclass` by `Lars Buitinck`_. - -- Utilities for fast computation of mean and variance for sparse matrices - by `Mathieu Blondel`_. - -- Make :func:`sklearn.preprocessing.scale` and - :class:`sklearn.preprocessing.Scaler` work on sparse matrices by - `Olivier Grisel`_. - -- Feature importances using decision trees and/or forests of trees, - by `Gilles Louppe`_. - -- Parallel implementation of forests of randomized trees by - `Gilles Louppe`_. - -- :class:`sklearn.cross_validation.ShuffleSplit` can subsample the train - sets as well as the test sets by `Olivier Grisel`_. - -- Errors in the build of the documentation fixed by `Andreas Müller`_. - - -API changes summary ------------------- - -Here are the code migration instructions when upgrading from scikit-learn -version 0.9: - -- Some estimators that may overwrite their inputs to save memory previously - had ``overwrite_`` parameters; these have been replaced with ``copy_`` - parameters with exactly the opposite meaning. - - This particularly affects some of the estimators in :mod:`linear_model`. - The default behavior is still to copy everything passed in. - -- The SVMlight dataset loader :func:`sklearn.datasets.load_svmlight_file` no - longer supports loading two files at once; use ``load_svmlight_files`` - instead. Also, the (unused) ``buffer_mb`` parameter is gone. - -- Sparse estimators in the :ref:`sgd` module use the dense parameter vector - ``coef_`` instead of ``sparse_coef_``. This significantly improves - test time performance. - -- The :ref:`covariance` module now has a robust estimator of - covariance, the Minimum Covariance Determinant estimator. - -- Cluster evaluation metrics in :mod:`metrics.cluster` have been refactored - but the changes are backwards compatible. They have been moved to - :mod:`metrics.cluster.supervised`, along with - :mod:`metrics.cluster.unsupervised` which contains the Silhouette - Coefficient. - -- The ``permutation_test_score`` function now behaves the same way as - ``cross_val_score`` (i.e. it uses the mean score across the folds). - -- Cross Validation generators now use integer indices (``indices=True``) - by default instead of boolean masks. This makes it more intuitive to - use with sparse matrix data. - -- The functions used for sparse coding, ``sparse_encode`` and - ``sparse_encode_parallel``, have been combined into - :func:`sklearn.decomposition.sparse_encode`, and the shapes of the arrays - have been transposed for consistency with the matrix factorization setting, - as opposed to the regression setting. - -- Fixed an off-by-one error in the SVMlight/LibSVM file format handling; - files generated using :func:`sklearn.datasets.dump_svmlight_file` should be - re-generated. (They should continue to work, but accidentally had one - extra column of zeros prepended.) - -- ``BaseDictionaryLearning`` class replaced by ``SparseCodingMixin``. - -- :func:`sklearn.utils.extmath.fast_svd` has been renamed - :func:`sklearn.utils.extmath.randomized_svd` and the default - oversampling is now fixed to 10 additional random vectors instead - of doubling the number of components to extract. The new behavior - follows the reference paper (a short sketch follows below).
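As an editorial aside, a minimal sketch of the renamed helper (illustrative only, on synthetic data; it uses the default oversampling discussed in the entry above)::

    import numpy as np
    from sklearn.utils.extmath import randomized_svd

    rng = np.random.RandomState(0)
    M = rng.rand(100, 50)

    # The default oversampling is 10 extra random vectors
    # (n_oversamples=10), following the reference paper, rather than
    # doubling n_components as the old fast_svd did.
    U, s, Vt = randomized_svd(M, n_components=5)
    print(U.shape, s.shape, Vt.shape)  # (100, 5) (5,) (5, 50)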
- - -People ------- - -The following people contributed to scikit-learn since last release: - - * 246 `Andreas Müller`_ - * 242 `Olivier Grisel`_ - * 220 `Gilles Louppe`_ - * 183 `Brian Holt`_ - * 166 `Gael Varoquaux`_ - * 144 `Lars Buitinck`_ - * 73 `Vlad Niculae`_ - * 65 `Peter Prettenhofer`_ - * 64 `Fabian Pedregosa`_ - * 60 Robert Layton - * 55 `Mathieu Blondel`_ - * 52 `Jake Vanderplas`_ - * 44 Noel Dawe - * 38 `Alexandre Gramfort`_ - * 24 :user:`Virgile Fritsch ` - * 23 `Satrajit Ghosh`_ - * 3 Jan Hendrik Metzen - * 3 Kenneth C. Arnold - * 3 Shiqiao Du - * 3 Tim Sheerman-Chase - * 3 `Yaroslav Halchenko`_ - * 2 Bala Subrahmanyam Varanasi - * 2 DraXus - * 2 Michael Eickenberg - * 1 Bogdan Trach - * 1 Félix-Antoine Fortin - * 1 Juan Manuel Caicedo Carvajal - * 1 Nelle Varoquaux - * 1 `Nicolas Pinto`_ - * 1 Tiziano Zito - * 1 Xinfan Meng - - - -.. _changes_0_9: - -Version 0.9 -=========== - -**September 21, 2011** - -scikit-learn 0.9 was released on September 2011, three months after the 0.8 -release and includes the new modules :ref:`manifold`, :ref:`dirichlet_process` -as well as several new algorithms and documentation improvements. - -This release also includes the dictionary-learning work developed by -`Vlad Niculae`_ as part of the `Google Summer of Code -`_ program. - - - -.. |banner1| image:: ./auto_examples/manifold/images/thumb/sphx_glr_plot_compare_methods_thumb.png - :target: auto_examples/manifold/plot_compare_methods.html - -.. |banner2| image:: ./auto_examples/linear_model/images/thumb/sphx_glr_plot_omp_thumb.png - :target: auto_examples/linear_model/plot_omp.html - -.. |banner3| image:: ./auto_examples/decomposition/images/thumb/sphx_glr_plot_kernel_pca_thumb.png - :target: auto_examples/decomposition/plot_kernel_pca.html - -.. |center-div| raw:: html - -
- -.. |end-div| raw:: html - -
- - -|center-div| |banner2| |banner1| |banner3| |end-div| - -Changelog ---------- - -- New :ref:`manifold` module by `Jake Vanderplas`_ and - `Fabian Pedregosa`_. - -- New :ref:`Dirichlet Process ` Gaussian Mixture - Model by `Alexandre Passos`_ - -- :ref:`neighbors` module refactoring by `Jake Vanderplas`_ : - general refactoring, support for sparse matrices in input, speed and - documentation improvements. See the next section for a full list of API - changes. - -- Improvements on the :ref:`feature_selection` module by - `Gilles Louppe`_ : refactoring of the RFE classes, documentation - rewrite, increased efficiency and minor API changes. - -- :ref:`SparsePCA` by `Vlad Niculae`_, `Gael Varoquaux`_ and - `Alexandre Gramfort`_ - -- Printing an estimator now behaves independently of architectures - and Python version thanks to :user:`Jean Kossaifi `. - -- :ref:`Loader for libsvm/svmlight format ` by - `Mathieu Blondel`_ and `Lars Buitinck`_ - -- Documentation improvements: thumbnails in - example gallery by `Fabian Pedregosa`_. - -- Important bugfixes in :ref:`svm` module (segfaults, bad - performance) by `Fabian Pedregosa`_. - -- Added :ref:`multinomial_naive_bayes` and :ref:`bernoulli_naive_bayes` - by `Lars Buitinck`_ - -- Text feature extraction optimizations by Lars Buitinck - -- Chi-Square feature selection - (:func:`feature_selection.univariate_selection.chi2`) by `Lars Buitinck`_. - -- :ref:`sample_generators` module refactoring by `Gilles Louppe`_ - -- :ref:`multiclass` by `Mathieu Blondel`_ - -- Ball tree rewrite by `Jake Vanderplas`_ - -- Implementation of :ref:`dbscan` algorithm by Robert Layton - -- Kmeans predict and transform by Robert Layton - -- Preprocessing module refactoring by `Olivier Grisel`_ - -- Faster mean shift by Conrad Lee - -- New ``Bootstrap``, :ref:`ShuffleSplit` and various other - improvements in cross validation schemes by `Olivier Grisel`_ and - `Gael Varoquaux`_ - -- Adjusted Rand index and V-Measure clustering evaluation metrics by `Olivier Grisel`_ - -- Added :class:`Orthogonal Matching Pursuit ` by `Vlad Niculae`_ - -- Added 2D-patch extractor utilities in the :ref:`feature_extraction` module by `Vlad Niculae`_ - -- Implementation of :class:`linear_model.LassoLarsCV` - (cross-validated Lasso solver using the Lars algorithm) and - :class:`linear_model.LassoLarsIC` (BIC/AIC model - selection in Lars) by `Gael Varoquaux`_ - and `Alexandre Gramfort`_ - -- Scalability improvements to :func:`metrics.roc_curve` by Olivier Hervieu - -- Distance helper functions :func:`metrics.pairwise.pairwise_distances` - and :func:`metrics.pairwise.pairwise_kernels` by Robert Layton - -- :class:`Mini-Batch K-Means ` by Nelle Varoquaux and Peter Prettenhofer. - -- :ref:`mldata` utilities by Pietro Berkes. - -- :ref:`olivetti_faces` by `David Warde-Farley`_. - - -API changes summary -------------------- - -Here are the code migration instructions when upgrading from scikit-learn -version 0.8: - -- The ``scikits.learn`` package was renamed ``sklearn``. There is - still a ``scikits.learn`` package alias for backward compatibility. - - Third-party projects with a dependency on scikit-learn 0.9+ should - upgrade their codebase. 
  For instance, under Linux / MacOSX just run
  (make a backup first!)::

    find -name "*.py" | xargs sed -i 's/\bscikits.learn\b/sklearn/g'

- Estimators no longer accept model parameters as ``fit`` arguments:
  instead all parameters must only be passed as constructor
  arguments or using the now public ``set_params`` method inherited
  from :class:`base.BaseEstimator`.

  Some estimators can still accept keyword arguments on the ``fit``
  but this is restricted to data-dependent values (e.g. a Gram matrix
  or an affinity matrix that is precomputed from the ``X`` data matrix).

- The ``cross_val`` package has been renamed to ``cross_validation``
  although there is also a ``cross_val`` package alias in place for
  backward compatibility.

  Third-party projects with a dependency on scikit-learn 0.9+ should
  upgrade their codebase. For instance, under Linux / MacOSX just run
  (make a backup first!)::

    find -name "*.py" | xargs sed -i 's/\bcross_val\b/cross_validation/g'

- The ``score_func`` argument of the
  ``sklearn.cross_validation.cross_val_score`` function is now expected
  to accept ``y_test`` and ``y_predicted`` as its only arguments for
  classification and regression tasks, or ``X_test`` for unsupervised
  estimators.

- The ``gamma`` parameter for support vector machine algorithms is set
  to ``1 / n_features`` by default, instead of ``1 / n_samples``.

- The ``sklearn.hmm`` module has been marked as orphaned: it will be
  removed from scikit-learn in version 0.11 unless someone steps up to
  contribute documentation, examples and fix lurking numerical
  stability issues.

- ``sklearn.neighbors`` has been made into a submodule. The two previously
  available estimators, ``NeighborsClassifier`` and ``NeighborsRegressor``
  have been marked as deprecated. Their functionality has been divided
  among five new classes: ``NearestNeighbors`` for unsupervised neighbors
  searches, ``KNeighborsClassifier`` & ``RadiusNeighborsClassifier``
  for supervised classification problems, and ``KNeighborsRegressor``
  & ``RadiusNeighborsRegressor`` for supervised regression problems.

- ``sklearn.ball_tree.BallTree`` has been moved to
  ``sklearn.neighbors.BallTree``. Using the former will generate a warning.

- ``sklearn.linear_model.LARS()`` and related classes (LassoLARS,
  LassoLARSCV, etc.) have been renamed to
  ``sklearn.linear_model.Lars()``.

- All distance metrics and kernels in ``sklearn.metrics.pairwise`` now
  have a Y parameter, which by default is None. If not given, the result
  is the distance (or kernel similarity) between each pair of samples in
  X. If given, the result is the pairwise distance (or kernel similarity)
  between samples in X and Y.

- ``sklearn.metrics.pairwise.l1_distance`` is now called
  ``manhattan_distance``, and by default returns the pairwise distance.
  For the component wise distance, set the parameter
  ``sum_over_features`` to ``False``.

Backward compatibility package aliases and other deprecated classes and
functions will be removed in version 0.11.


People
------

38 people contributed to this release.
- 387 `Vlad Niculae`_
- 320 `Olivier Grisel`_
- 192 `Lars Buitinck`_
- 179 `Gael Varoquaux`_
- 168 `Fabian Pedregosa`_ (`INRIA`_, `Parietal Team`_)
- 127 `Jake Vanderplas`_
- 120 `Mathieu Blondel`_
- 85 `Alexandre Passos`_
- 67 `Alexandre Gramfort`_
- 57 `Peter Prettenhofer`_
- 56 `Gilles Louppe`_
- 42 Robert Layton
- 38 Nelle Varoquaux
- 32 :user:`Jean Kossaifi `
- 30 Conrad Lee
- 22 Pietro Berkes
- 18 andy
- 17 David Warde-Farley
- 12 Brian Holt
- 11 Robert
- 8 Amit Aides
- 8 :user:`Virgile Fritsch `
- 7 `Yaroslav Halchenko`_
- 6 Salvatore Masecchia
- 5 Paolo Losi
- 4 Vincent Schut
- 3 Alexis Metaireau
- 3 Bryan Silverthorn
- 3 `Andreas Müller`_
- 2 Minwoo Jake Lee
- 1 Emmanuelle Gouillart
- 1 Keith Goodman
- 1 Lucas Wiman
- 1 `Nicolas Pinto`_
- 1 Thouis (Ray) Jones
- 1 Tim Sheerman-Chase


.. _changes_0_8:

Version 0.8
===========

**May 11, 2011**

scikit-learn 0.8 was released in May 2011, one month after the first
"international" `scikit-learn coding sprint `_ and is
marked by the inclusion of important modules: :ref:`hierarchical_clustering`,
:ref:`cross_decomposition`, :ref:`NMF`, initial support for Python 3, and by
important enhancements and bug fixes.


Changelog
---------

Several new modules were introduced during this release:

- New :ref:`hierarchical_clustering` module by Vincent Michel,
  `Bertrand Thirion`_, `Alexandre Gramfort`_ and `Gael Varoquaux`_.

- :ref:`kernel_pca` implementation by `Mathieu Blondel`_

- :ref:`labeled_faces_in_the_wild` by `Olivier Grisel`_.

- New :ref:`cross_decomposition` module by `Edouard Duchesnay`_.

- :ref:`NMF` module by `Vlad Niculae`_

- Implementation of the :ref:`oracle_approximating_shrinkage` algorithm by
  :user:`Virgile Fritsch ` in the :ref:`covariance` module.


Some other modules benefited from significant improvements or cleanups:

- Initial support for Python 3: builds and imports cleanly,
  some modules are usable while others have failing tests, by
  `Fabian Pedregosa`_.

- :class:`decomposition.PCA` is now usable from the Pipeline object by
  `Olivier Grisel`_.

- Guide :ref:`performance-howto` by `Olivier Grisel`_.

- Fixes for memory leaks in libsvm bindings, 64-bit safer BallTree by
  Lars Buitinck.

- Bug and style fixes in the :ref:`k_means` algorithm by Jan Schlüter.

- Added the attribute ``converged`` to Gaussian Mixture Models by
  Vincent Schut.

- Implemented ``transform`` and ``predict_log_proba`` in
  :class:`discriminant_analysis.LinearDiscriminantAnalysis` by
  `Mathieu Blondel`_.

- Refactoring in the :ref:`svm` module and bug fixes by `Fabian Pedregosa`_,
  `Gael Varoquaux`_ and Amit Aides.

- Refactored SGD module (removed code duplication, better variable naming),
  added interface for sample weights by `Peter Prettenhofer`_.

- Wrapped BallTree with Cython by Thouis (Ray) Jones.

- Added function :func:`svm.l1_min_c` by Paolo Losi.

- Typos, doc style, etc. by `Yaroslav Halchenko`_, `Gael Varoquaux`_,
  `Olivier Grisel`_, Yann Malet, `Nicolas Pinto`_, Lars Buitinck and
  `Fabian Pedregosa`_.

People
------

People that made this release possible, preceded by number of commits:

- 159 `Olivier Grisel`_
- 96 `Gael Varoquaux`_
- 96 `Vlad Niculae`_
- 94 `Fabian Pedregosa`_
- 36 `Alexandre Gramfort`_
- 32 Paolo Losi
- 31 `Edouard Duchesnay`_
- 30 `Mathieu Blondel`_
- 25 `Peter Prettenhofer`_
- 22 `Nicolas Pinto`_
- 11 :user:`Virgile Fritsch `
- 7 Lars Buitinck
- 6 Vincent Michel
- 5 `Bertrand Thirion`_
- 4 Thouis (Ray) Jones
- 4 Vincent Schut
- 3 Jan Schlüter
- 2 Julien Miotte
- 2 `Matthieu Perrot`_
- 2 Yann Malet
- 2 `Yaroslav Halchenko`_
- 1 Amit Aides
- 1 `Andreas Müller`_
- 1 Feth Arezki
- 1 Meng Xinfan


.. _changes_0_7:

Version 0.7
===========

**March 2, 2011**

scikit-learn 0.7 was released in March 2011, roughly three months
after the 0.6 release. This release is marked by speed
improvements in existing algorithms like k-Nearest Neighbors and
K-Means, and by the inclusion of an efficient algorithm for
computing the Ridge Generalized Cross Validation solution. Unlike the
preceding release, no new modules were added to this release.

Changelog
---------

- Performance improvements for Gaussian Mixture Model sampling [Jan
  Schlüter].

- Implementation of efficient leave-one-out cross-validated Ridge in
  :class:`linear_model.RidgeCV` [`Mathieu Blondel`_]

- Better handling of collinearity and early stopping in
  :func:`linear_model.lars_path` [`Alexandre Gramfort`_ and `Fabian
  Pedregosa`_].

- Fixes for liblinear ordering of labels and sign of coefficients
  [Dan Yamins, Paolo Losi, `Mathieu Blondel`_ and `Fabian Pedregosa`_].

- Performance improvements for the Nearest Neighbors algorithm in
  high-dimensional spaces [`Fabian Pedregosa`_].

- Performance improvements for :class:`cluster.KMeans` [`Gael
  Varoquaux`_ and `James Bergstra`_].

- Sanity checks for SVM-based classes [`Mathieu Blondel`_].

- Refactoring of :class:`neighbors.NeighborsClassifier` and
  :func:`neighbors.kneighbors_graph`: added different algorithms for
  the k-Nearest Neighbor Search and implemented a more stable
  algorithm for finding barycenter weights. Also added some
  developer documentation for this module; see
  `notes_neighbors `_ for more information [`Fabian Pedregosa`_].

- Documentation improvements: added :class:`pca.RandomizedPCA` and
  :class:`linear_model.LogisticRegression` to the class
  reference. Also added references to the matrices used for clustering
  and other fixes [`Gael Varoquaux`_, `Fabian Pedregosa`_, `Mathieu
  Blondel`_, `Olivier Grisel`_, Virgile Fritsch, Emmanuelle
  Gouillart]

- Bound ``decision_function`` in classes that make use of liblinear_,
  dense and sparse variants, like :class:`svm.LinearSVC` or
  :class:`linear_model.LogisticRegression` [`Fabian Pedregosa`_].

- Performance and API improvements to
  :func:`metrics.euclidean_distances` and to
  :class:`pca.RandomizedPCA` [`James Bergstra`_].

- Fix compilation issues under NetBSD [Kamel Ibn Hassen Derouiche]

- Allow input sequences of different lengths in :class:`hmm.GaussianHMM`
  [`Ron Weiss`_].

- Fix bug in affinity propagation caused by incorrect indexing [Xinfan Meng]


People
------

People that made this release possible, preceded by number of commits:

- 85 `Fabian Pedregosa`_
- 67 `Mathieu Blondel`_
- 20 `Alexandre Gramfort`_
- 19 `James Bergstra`_
- 14 Dan Yamins
- 13 `Olivier Grisel`_
- 12 `Gael Varoquaux`_
- 4 `Edouard Duchesnay`_
- 4 `Ron Weiss`_
- 2 Satrajit Ghosh
- 2 Vincent Dubourg
- 1 Emmanuelle Gouillart
- 1 Kamel Ibn Hassen Derouiche
- 1 Paolo Losi
- 1 VirgileFritsch
- 1 `Yaroslav Halchenko`_
- 1 Xinfan Meng


.. _changes_0_6:

Version 0.6
===========

**December 21, 2010**

scikit-learn 0.6 was released in December 2010. It is marked by the
inclusion of several new modules and a general renaming of old
ones. It is also marked by the inclusion of new examples, including
applications to real-world datasets.


Changelog
---------

- New `stochastic gradient `_ descent
  module by Peter Prettenhofer. The module comes with complete
  documentation and examples.

- Improved svm module: memory consumption has been reduced by 50%,
  heuristic to automatically set class weights, possibility to
  assign weights to samples (see
  :ref:`sphx_glr_auto_examples_svm_plot_weighted_samples.py` for an example).

- New :ref:`gaussian_process` module by Vincent Dubourg. This module
  also has great documentation and some very neat examples. See
  example_gaussian_process_plot_gp_regression.py or
  example_gaussian_process_plot_gp_probabilistic_classification_after_regression.py
  for a taste of what can be done.

- It is now possible to use liblinear's multi-class SVC (option
  ``multi_class`` in :class:`svm.LinearSVC`)

- New features and performance improvements in text feature
  extraction.

- Improved sparse matrix support, both in main classes
  (:class:`grid_search.GridSearchCV`) and in the modules
  sklearn.svm.sparse and sklearn.linear_model.sparse.

- Lots of cool new examples and a new section that uses real-world
  datasets was created. These include:
  :ref:`sphx_glr_auto_examples_applications_plot_face_recognition.py`,
  :ref:`sphx_glr_auto_examples_applications_plot_species_distribution_modeling.py`,
  :ref:`sphx_glr_auto_examples_applications_svm_gui.py`,
  :ref:`sphx_glr_auto_examples_applications_wikipedia_principal_eigenvector.py` and
  others.

- Faster :ref:`least_angle_regression` algorithm. It is now 2x
  faster than the R version in the worst case and up to 10x faster
  in some cases.

- Faster coordinate descent algorithm. In particular, the full path
  version of lasso (:func:`linear_model.lasso_path`) is more than
  200x faster than before.

- It is now possible to get probability estimates from a
  :class:`linear_model.LogisticRegression` model.

- Module renaming: the glm module has been renamed to linear_model,
  the gmm module has been included into the more general mixture
  model, and the sgd module has been included in linear_model.

- Lots of bug fixes and documentation improvements.


People
------

People that made this release possible, preceded by number of commits:

 * 207 `Olivier Grisel`_

 * 167 `Fabian Pedregosa`_

 * 97 `Peter Prettenhofer`_

 * 68 `Alexandre Gramfort`_

 * 59 `Mathieu Blondel`_

 * 55 `Gael Varoquaux`_

 * 33 Vincent Dubourg

 * 21 `Ron Weiss`_

 * 9 Bertrand Thirion

 * 3 `Alexandre Passos`_

 * 3 Anne-Laure Fouque

 * 2 Ronan Amicel

 * 1 `Christian Osendorfer`_

.. _changes_0_5:


Version 0.5
===========

**October 11, 2010**

Changelog
---------

New classes
-----------

- Support for sparse matrices in some classifiers of modules
  ``svm`` and ``linear_model`` (see :class:`svm.sparse.SVC`,
  :class:`svm.sparse.SVR`, :class:`svm.sparse.LinearSVC`,
  :class:`linear_model.sparse.Lasso`, :class:`linear_model.sparse.ElasticNet`)

- New :class:`pipeline.Pipeline` object to compose different estimators.

- Recursive Feature Elimination routines in module
  :ref:`feature_selection`.

- Addition of various classes capable of cross validation in the
  linear_model module (:class:`linear_model.LassoCV`,
  :class:`linear_model.ElasticNetCV`, etc.).

- New, more efficient LARS algorithm implementation. The Lasso
  variant of the algorithm is also implemented. See
  :class:`linear_model.lars_path`, :class:`linear_model.Lars` and
  :class:`linear_model.LassoLars`.

- New Hidden Markov Models module (see classes
  :class:`hmm.GaussianHMM`, :class:`hmm.MultinomialHMM`,
  :class:`hmm.GMMHMM`)

- New module feature_extraction (see :ref:`class reference
  `)

- New FastICA algorithm in module sklearn.fastica


Documentation
-------------

- Improved documentation for many modules, now separating
  narrative documentation from the class reference. As an example,
  see the `documentation for the SVM module
  `_ and the
  complete `class reference
  `_.

Fixes
-----

- API changes: variable names now adhere to PEP-8 and carry more
  meaningful names.

- Fixes for the svm module to run in a shared memory context
  (multiprocessing).

- It is again possible to generate latex (and thus PDF) from the
  sphinx docs.

Examples
--------

- New examples using some of the mlcomp datasets:
  ``sphx_glr_auto_examples_mlcomp_sparse_document_classification.py``
  (since removed) and
  :ref:`sphx_glr_auto_examples_text_document_classification_20newsgroups.py`

- Many more examples. `See here
  `_ for the full list of examples.


External dependencies
---------------------

- Joblib is now a dependency of this package, although a copy is
  shipped with scikit-learn (sklearn.externals.joblib).

Removed modules
---------------

- Module ann (Artificial Neural Networks) has been removed from
  the distribution. Users wanting this sort of algorithm should
  take a look at pybrain.

Misc
----

- New sphinx theme for the web page.


Authors
-------

The following is a list of authors for this release, preceded by
number of commits:

 * 262 Fabian Pedregosa
 * 240 Gael Varoquaux
 * 149 Alexandre Gramfort
 * 116 Olivier Grisel
 * 40 Vincent Michel
 * 38 Ron Weiss
 * 23 Matthieu Perrot
 * 10 Bertrand Thirion
 * 9 VirgileFritsch
 * 7 Yaroslav Halchenko
 * 6 Edouard Duchesnay
 * 4 Mathieu Blondel
 * 1 Ariel Rokem
 * 1 Matthieu Brucher

Version 0.4
===========

**August 26, 2010**

Changelog
---------

Major changes in this release include:

- Coordinate Descent algorithm (Lasso, ElasticNet) refactoring and
  speed improvements (roughly 100x faster).

- Coordinate Descent refactoring (and bug fixing) for consistency
  with R's GLMNET package.

- New metrics module.

- New GMM module contributed by Ron Weiss.

- Implementation of the LARS algorithm (without the Lasso variant for now).

- feature_selection module redesign.

- Migration to GIT as version control system.

- Removal of the obsolete attrselect module.

- Rename of private compiled extensions (added an underscore).

- Removal of legacy unmaintained code.

- Documentation improvements (both docstring and rst).

- Improvement of the build system to (optionally) link with MKL.
  Also, provide a lite BLAS implementation in case no system-wide BLAS is
  found.

- Lots of new examples.

- Many, many bug fixes ...


Authors
-------

The committer list for this release is the following (preceded by number
of commits):

 * 143 Fabian Pedregosa
 * 35 Alexandre Gramfort
 * 34 Olivier Grisel
 * 11 Gael Varoquaux
 * 5 Yaroslav Halchenko
 * 2 Vincent Michel
 * 1 Chris Filo Gorgolewski


Earlier versions
================

Earlier versions included contributions by Fred Mailhot, David Cooke,
David Huard, Dave Morrill, Ed Schofield, Travis Oliphant and Pearu Peterson.

.. _Olivier Grisel: https://twitter.com/ogrisel

.. _Gael Varoquaux: http://gael-varoquaux.info

.. _Alexandre Gramfort: http://alexandre.gramfort.net

.. _Fabian Pedregosa: http://fa.bianp.net

.. _Mathieu Blondel: http://www.mblondel.org

.. _James Bergstra: http://www-etud.iro.umontreal.ca/~bergstrj/

.. _liblinear: http://www.csie.ntu.edu.tw/~cjlin/liblinear/

.. _Yaroslav Halchenko: http://www.onerussian.com/

.. _Vlad Niculae: http://vene.ro

.. _Edouard Duchesnay: https://sites.google.com/site/duchesnay/home

.. _Peter Prettenhofer: https://sites.google.com/site/peterprettenhofer/

.. _Alexandre Passos: http://atpassos.me

.. _Nicolas Pinto: https://twitter.com/npinto

.. _Bertrand Thirion: https://team.inria.fr/parietal/bertrand-thirions-page

.. _Andreas Müller: http://peekaboo-vision.blogspot.com

.. _Matthieu Perrot: http://brainvisa.info/biblio/lnao/en/Author/PERROT-M.html

.. _Jake Vanderplas: http://staff.washington.edu/jakevdp/

.. _Gilles Louppe: http://www.montefiore.ulg.ac.be/~glouppe/

.. _INRIA: http://www.inria.fr

.. _Parietal Team: http://parietal.saclay.inria.fr/

.. _David Warde-Farley: http://www-etud.iro.umontreal.ca/~wardefar/

.. _Brian Holt: http://personal.ee.surrey.ac.uk/Personal/B.Holt

.. _Satrajit Ghosh: http://www.mit.edu/~satra/

.. _Robert Layton: https://twitter.com/robertlayton

.. _Scott White: https://twitter.com/scottblanc

.. _David Marek: http://www.davidmarek.cz/

.. _Christian Osendorfer: https://osdf.github.io

.. _Arnaud Joly: http://www.ajoly.org

.. _Rob Zinkov: http://zinkov.com

.. _Joel Nothman: http://joelnothman.com

.. _Nicolas Trésegnie: http://nicolastr.com/

.. _Kemal Eren: http://www.kemaleren.com

.. _Yann Dauphin: http://ynd.github.io/

.. _Yannick Schwartz: https://team.inria.fr/parietal/schwarty/

.. _Kyle Kastner: http://kastnerkyle.github.io

.. _Daniel Nouri: http://danielnouri.org

.. _Manoj Kumar: https://manojbits.wordpress.com

.. _Luis Pedro Coelho: http://luispedro.org

.. _Fares Hedyati: http://www.eecs.berkeley.edu/~fareshed

.. _Antony Lee: https://www.ocf.berkeley.edu/~antonyl/

.. _Martin Billinger: http://tnsre.embs.org/author/martinbillinger

.. _Matteo Visconti di Oleggio Castello: http://www.mvdoc.me

.. _Trevor Stephens: http://trevorstephens.com/

.. _Jan Hendrik Metzen: https://jmetzen.github.io/

.. _Will Dawson: http://www.dawsonresearch.com

.. _Andrew Tulloch: http://tullo.ch/

.. _Hanna Wallach: http://dirichlet.net/

.. _Yan Yi: http://seowyanyi.org

.. _Hervé Bredin: http://herve.niderb.fr/

.. _Eric Martin: http://www.ericmart.in

.. _Nicolas Goix: https://perso.telecom-paristech.fr/~goix/

.. _Sebastian Raschka: http://sebastianraschka.com
.. _Brian McFee: https://bmcfee.github.io

.. _Valentin Stolbunov: http://www.vstolbunov.com

.. _Jaques Grobler: https://github.com/jaquesgrobler

.. _Lars Buitinck: https://github.com/larsmans

.. _Loic Esteve: https://github.com/lesteve

.. _Noel Dawe: https://github.com/ndawe

.. _Raghav RV: https://github.com/raghavrv

.. _Tom Dupre la Tour: https://github.com/TomDLT

.. _Nelle Varoquaux: https://github.com/nellev

.. _Bing Tian Dai: https://github.com/btdai

.. _Dylan Werner-Meier: https://github.com/unautre

.. _Alyssa Batula: https://github.com/abatula

.. _Srivatsan Ramesh: https://github.com/srivatsan-ramesh

.. _Ron Weiss: http://www.ee.columbia.edu/~ronw

.. _Kathleen Chen: https://github.com/kchen17

.. _Vincent Pham: https://github.com/vincentpham1991

.. _Denis Engemann: http://denis-engemann.de

.. _Anish Shah: https://github.com/AnishShah

.. _Neeraj Gangwar: http://neerajgangwar.in

.. _Arthur Mensch: https://amensch.fr

.. _Ivan Nazarov: https://github.com/ivannz
diff --git a/doc/whats_new/_contributors.rst b/doc/whats_new/_contributors.rst
index ca0f8ede93afa..89a9dcf40a0e0 100644
--- a/doc/whats_new/_contributors.rst
+++ b/doc/whats_new/_contributors.rst
@@ -176,4 +176,6 @@
 
 .. _Nicolas Hug: https://github.com/NicolasHug
 
-.. _Guillaume Lemaitre: https://github.com/glemaitre
\ No newline at end of file
+.. _Guillaume Lemaitre: https://github.com/glemaitre
+
+.. _Ivan Nazarov: https://github.com/ivannz
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index add4d97f6de09..21437674e0333 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -1486,6 +1486,10 @@ Support for Python 3.3 has been officially dropped.
   version 0.22 to account better for unscaled features. :issue:`8361` by
   :user:`Gaurav Dhingra ` and :user:`Ting Neo `.
 
+- |Feature| Added the :class:`svm.SVDD` class for novelty detection based on
+  a soft minimal-volume hypersphere around the sample data.
+  :user:`Ivan Nazarov `.
+
 :mod:`sklearn.tree`
 ...................

From 941ca4b199f75f5c0247838fa66fe4720bf14b4f Mon Sep 17 00:00:00 2001
From: Ivan
Date: Sat, 7 Oct 2017 01:47:06 +0300
Subject: [PATCH 06/41] review and sync with #9015

---
 doc/modules/svm.rst                   | 19 +++++----
 examples/svm/plot_oneclass_vs_svdd.py | 13 +++---
 sklearn/svm/_classes.py               | 60 +++++++++++++++++++--------
 3 files changed, 59 insertions(+), 33 deletions(-)

diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst
index b2aa26d11bd3e..2a79b9393dee7 100644
--- a/doc/modules/svm.rst
+++ b/doc/modules/svm.rst
@@ -404,11 +404,11 @@ Tips on Practical Use
     function can be configured to be almost the same as the :class:`LinearSVC`
     model.
 
-  * **Kernel cache size**: For :class:`SVC`, :class:`SVR`, :class:`NuSVC` and
-    :class:`NuSVR`, the size of the kernel cache has a strong impact on run
-    times for larger problems. If you have enough RAM available, it is
-    recommended to set ``cache_size`` to a higher value than the default of
-    200(MB), such as 500(MB) or 1000(MB).
+  * **Kernel cache size**: For :class:`SVC`, :class:`SVR`, :class:`NuSVC`,
+    :class:`NuSVR`, :class:`OneClassSVM` and :class:`SVDD`, the size of the
+    kernel cache has a strong impact on run times for larger problems. If
+    you have enough RAM available, it is recommended to set ``cache_size``
+    to a higher value than the default of 200(MB), such as 500(MB) or 1000(MB).
 
   * **Setting C**: ``C`` is ``1`` by default and it's a reasonable default
@@ -458,9 +458,10 @@ Tips on Practical Use
    ``probability`` is set to ``True``).
This randomness can be controlled with the ``random_state`` parameter. If ``probability`` is set to ``False`` these estimators are not random and ``random_state`` has no effect on the - results. The underlying :class:`OneClassSVM` implementation is similar to - the ones of :class:`SVC` and :class:`NuSVC`. As no probability estimation - is provided for :class:`OneClassSVM`, it is not random. + results. The underlying :class:`OneClassSVM` and :class:`SVDD` + implementation is similar to the ones of :class:`SVC` and :class:`NuSVC`. + As no probability estimation is provided for :class:`OneClassSVM` and + :class:`SVDD`, they are not random. The underlying :class:`LinearSVC` implementation uses a random number generator to select features when fitting the model with a dual coordinate @@ -849,7 +850,7 @@ SVDD ---- Support Vector Data Description (SVDD), proposed by Tax and Duin (2004), -aims at finding a spherically shaped boundary around a data set. Specifially, +aims at finding a spherically shaped boundary around a data set. Specifically, it computes a minimum volume hypersphere containing the most of the data with the number of outliers controlled by the parameter of the model. diff --git a/examples/svm/plot_oneclass_vs_svdd.py b/examples/svm/plot_oneclass_vs_svdd.py index a2d20df63a72a..bc19ad53bdd10 100644 --- a/examples/svm/plot_oneclass_vs_svdd.py +++ b/examples/svm/plot_oneclass_vs_svdd.py @@ -17,9 +17,9 @@ decision functions for non-stationary kernels, e.g. polynomial. This example demonstrates this. -Note, that it is incorrect to say that the SVDD generalizes the One-Class -SVM: these are different models, which just happen to coincide for a -particular family of kernels. +Note that it is incorrect to say that the SVDD is equivalent to the +One-Class SVM: these are different models, which just happen to coincide +for a particular family of kernels. """ import numpy as np import matplotlib.pyplot as plt @@ -82,12 +82,13 @@ zorder=-97, label="learned frontier") s = 40 - b1 = ax.scatter(X_train[:, 0], X_train[:, 1], c='white', s=s) + b1 = ax.scatter(X_train[:, 0], X_train[:, 1], s=s, + c='white', edgecolors='k') b2 = ax.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s) c = ax.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s) ax.axis('tight') - ax.set_xlim((-7, 7)) - ax.set_ylim((-7, 7)) + ax.set_xlim((-6, 6)) + ax.set_ylim((-6, 6)) ax.set_title("%s %s (%d/200, %d/40, %d/40)" % (model_name, kernel_name, n_error_train, diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 2abefe1dd8d7d..a15146ab7d28f 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1527,7 +1527,7 @@ class OneClassSVM(OutlierMixin, BaseLibSVM): Estimate the support of a high-dimensional distribution by finding the maximum margin soft boundary hyperplane separating a data set from the - origin. At most the fraction ``nu`` (``0 < nu <= 1``) of the data + origin. At most a fraction ``nu`` (``0 < nu <= 1``) of the data are permitted to be outliers. The implementation is based on libsvm. @@ -1826,8 +1826,8 @@ class SVDD(BaseLibSVM): """Support Vector Data Description (SVDD) for Unsupervised Outlier Detection. Estimate the support of a high-dimensional distribution by finding the - tightest soft boundary hypersphere around a data set, which permits at - most the fraction ``nu`` (``0 < nu <= 1``) of the data as outliers. + tightest soft hypersphere around a data set, which permits at most a + fraction ``nu`` (``0 < nu <= 1``) of the data as outliers. 
The implementation is based on libsvm. @@ -1844,7 +1844,7 @@ class SVDD(BaseLibSVM): nu : float, optional An upper bound on the fraction of training errors and a lower bound - of the fraction of support vectors. Should be in the interval (0, 1]. + on the fraction of support vectors. Should be in the interval (0, 1]. By default 0.5 will be taken. degree : int, optional (default=3) @@ -1877,11 +1877,11 @@ class SVDD(BaseLibSVM): Hard limit on iterations within solver, or -1 for no limit. random_state : int, RandomState instance or None, optional (default=None) - The seed of the pseudo random number generator to use when shuffling - the data. If int, random_state is the seed used by the random number - generator; If RandomState instance, random_state is the random number - generator; If None, the random number generator is the RandomState - instance used by `np.random`. + Ignored. + + .. deprecated:: 0.20 + ``random_state`` has been deprecated in 0.20 and will be removed in + 0.22. Attributes ---------- @@ -1891,18 +1891,24 @@ class SVDD(BaseLibSVM): support_vectors_ : array-like, shape = [nSV, n_features] Support vectors. - dual_coef_ : array, shape = [n_classes-1, n_SV] + dual_coef_ : array, shape = [1, n_SV] Coefficients of the support vectors in the decision function. - coef_ : array, shape = [n_classes-1, n_features] + coef_ : array, shape = [1, n_features] Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. `coef_` is readonly property derived from `dual_coef_` and `support_vectors_` - intercept_ : array, shape = [n_classes-1] - Constants in decision function. + intercept_ : array, shape = [1,] + The constant in the decision function. + + offset_ : float + Offset used to define the decision function from the raw scores. + We have the relation: decision_function = score_samples - offset_. + The offset is the opposite of intercept_ and is provided for + consistency with other outlier detection algorithms. References ---------- @@ -1918,13 +1924,15 @@ class SVDD(BaseLibSVM): def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1, random_state=None): + super(SVDD, self).__init__( - 'svdd_l1', kernel, degree, gamma, coef0, tol, 0., nu, 0., - shrinking, False, cache_size, None, verbose, max_iter, - random_state) + 'svdd_l1', kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, + tol=tol, C=0., nu=nu, epsilon=0., shrinking=shrinking, + probability=False, cache_size=cache_size, class_weight=None, + verbose=verbose, max_iter=max_iter, random_state=random_state) def fit(self, X, y=None, sample_weight=None, **params): - """Detects the soft minimum volume hypersphere around the sample X. + """Learns the soft minimum volume hypersphere around the sample X. Parameters ---------- @@ -1948,6 +1956,7 @@ def fit(self, X, y=None, sample_weight=None, **params): """ super(SVDD, self).fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight, **params) + self.offset_ = -self._intercept_ return self def decision_function(self, X): @@ -1964,9 +1973,24 @@ def decision_function(self, X): X : array-like, shape (n_samples,) Returns the decision function of the samples. """ - dec = self._decision_function(X) + dec = self._decision_function(X).ravel() return dec + def score_samples(self, X): + """Raw scoring function of the samples. 
+ + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + + Returns + ------- + score_samples : array-like, shape (n_samples,) + Returns the (unshifted) scoring function of the samples. + """ + score_samples = self.decision_function(X) + self.offset_ + return score_samples + def predict(self, X): """ Perform classification on samples in X. From e8cd614a993676514cef839dc081f642354c2ec4 Mon Sep 17 00:00:00 2001 From: Ivan Date: Mon, 9 Oct 2017 12:32:04 +0300 Subject: [PATCH 07/41] temporary ocSVM-test patch --- sklearn/svm/tests/test_svm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index a19285e4b8728..6c3464aa44d11 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -447,7 +447,7 @@ def test_oneclass_and_svdd(): mesh = np.c_[xx.ravel(), yy.ravel()] svdd_df = svdd.decision_function(mesh) - ocsvm_df = ocsvm.decision_function(mesh) + ocsvm_df = ocsvm.decision_function(mesh).ravel() assert_array_almost_equal(svdd_df, ocsvm_df) From 543c4e68ad1f58d1e1d8aaf088ea182f4071707d Mon Sep 17 00:00:00 2001 From: Ivan Date: Tue, 10 Oct 2017 16:09:43 +0300 Subject: [PATCH 08/41] removed 'random_state' from SVDD --- sklearn/svm/_classes.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index a15146ab7d28f..23f27a785de11 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1876,13 +1876,6 @@ class SVDD(BaseLibSVM): max_iter : int, optional (default=-1) Hard limit on iterations within solver, or -1 for no limit. - random_state : int, RandomState instance or None, optional (default=None) - Ignored. - - .. deprecated:: 0.20 - ``random_state`` has been deprecated in 0.20 and will be removed in - 0.22. - Attributes ---------- support_ : array-like, shape = [n_SV] @@ -1923,13 +1916,13 @@ class SVDD(BaseLibSVM): """ def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, - verbose=False, max_iter=-1, random_state=None): + verbose=False, max_iter=-1): super(SVDD, self).__init__( 'svdd_l1', kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=0., nu=nu, epsilon=0., shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, - verbose=verbose, max_iter=max_iter, random_state=random_state) + verbose=verbose, max_iter=max_iter, random_state=None) def fit(self, X, y=None, sample_weight=None, **params): """Learns the soft minimum volume hypersphere around the sample X. 
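
A minimal smoke-test sketch of the estimator as it stands at this point in
the series (an illustration, not part of any patch: the toy data and the
installed branch are assumptions, while the
``score_samples``/``decision_function``/``offset_`` identity is exactly the
relation introduced in the patches above)::

    import numpy as np
    from sklearn.svm import SVDD  # available on this branch only

    rng = np.random.RandomState(0)
    X_train = 0.3 * rng.randn(200, 2) + 2.0      # one dense cluster
    X_test = np.array([[2.0, 2.0], [8.0, 8.0]])  # likely inlier, likely outlier

    clf = SVDD(nu=0.1).fit(X_train)  # unsupervised: no y is required

    # predictions are +1 (inside the learned hypersphere) or -1 (outside)
    assert set(np.unique(clf.predict(X_test))) <= {-1, 1}

    # score_samples is the decision function shifted by offset_ = -intercept_
    np.testing.assert_allclose(
        clf.score_samples(X_test),
        clf.decision_function(X_test) + clf.offset_)
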
From d7511484f5f6b81cf6dccf8644a29acd7e70f392 Mon Sep 17 00:00:00 2001 From: Ivan Date: Tue, 10 Oct 2017 16:59:13 +0300 Subject: [PATCH 09/41] fixed sparse SVDD test --- sklearn/svm/tests/test_sparse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/svm/tests/test_sparse.py b/sklearn/svm/tests/test_sparse.py index 5ffaf8f0af08c..d34275528d877 100644 --- a/sklearn/svm/tests/test_sparse.py +++ b/sklearn/svm/tests/test_sparse.py @@ -348,8 +348,8 @@ def test_sparse_svdd(): kernels = ["linear", "poly", "rbf", "sigmoid"] for dataset in datasets: for kernel in kernels: - clf = svm.SVDD(kernel=kernel, random_state=0) - sp_clf = svm.SVDD(kernel=kernel, random_state=0) + clf = svm.SVDD(kernel=kernel) + sp_clf = svm.SVDD(kernel=kernel) check_svm_model_equal(clf, sp_clf, *dataset) From ba1117366293fe72545f9c7239ac251a96b9f7e9 Mon Sep 17 00:00:00 2001 From: Ivan Date: Wed, 11 Oct 2017 02:33:50 +0300 Subject: [PATCH 10/41] score_samples() test for the SVDD --- sklearn/svm/tests/test_svm.py | 51 ++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 6c3464aa44d11..a11d31cc4d7e5 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -18,7 +18,7 @@ from sklearn.model_selection import train_test_split from sklearn.datasets import make_classification, make_blobs from sklearn.metrics import f1_score -from sklearn.metrics.pairwise import rbf_kernel +from sklearn.metrics.pairwise import rbf_kernel, polynomial_kernel from sklearn.utils import check_random_state from sklearn.utils._testing import ignore_warnings from sklearn.utils.validation import _num_samples @@ -413,6 +413,55 @@ def test_svdd_decision_function(): assert_array_equal((dec_func_outliers > 0).ravel(), y_pred_outliers == 1) +def test_svdd_score_samples(): + # Test the raw sample scores of the SVDD + # Background: the theoretical decision function score of the SVDD is + # d(x) = R - \|\phi(x) - a\|^2 + # = R - \alpha^T Q \alpha / (\nu W)^2 - K(x, x) + # + 2 / (\nu W) \sum_i \alpha_i K(z_i, x) + # = 2 / (\nu W) (-\rho + \sum_i \alpha_i (K(z_i, x) - 0.5 K(x, x))) + # where \rho = 0.5 \nu W (\alpha^T Q \alpha / (\nu W)^2 - R), W is the + # sum of sample weights and \sum_i \alpha_i = \nu W since \alpha is + # feasible. + # In contrast, the current implementation returns a scaled score: + # d(x) = 0.5 (\nu W) (R - \|\phi(x) - a\|^2) + # = -\rho + \sum_i \alpha_i (K(z_i, x) - 0.5 K(x, x)) + # Implicit scaling makes the raw decision function scores of the ocSVM + # and SVDD identical when the models coincide (stationary kernel). 
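+    # Concretely (an illustrative consequence of the identity above,
+    # assuming uniform unit sample weights): the training set built below
+    # has n = 200 samples, so W = 200 and, with nu = 0.1, the implemented
+    # score is a constant 0.5 * nu * W = 10 times the theoretical d(x).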
+ + # Generate train data + rnd = check_random_state(2) + X = 0.3 * rnd.randn(100, 2) + X_train = np.r_[X + 2, X - 2] + + # Evaluate the scores on a small uniform 2-d mesh + xx, yy = np.meshgrid(np.linspace(-5, 5, num=26), + np.linspace(-5, 5, num=26)) + X_test = np.c_[xx.ravel(), yy.ravel()] + + # Fit the model for at least 10% support vectors + clf = svm.SVDD(nu=0.1, kernel="poly", degree=2, coef0=1.0) + clf.fit(X_train) + + # Check score_samples() implementation + assert_array_almost_equal(clf.score_samples(X_test), + clf.decision_function(X_test) + clf.offset_) + + # Compute the kernel matrices + k_zx = polynomial_kernel(X_train[clf.support_], X_test, + degree=clf.degree, coef0=clf.coef0) + k_xx = polynomial_kernel(X_test, + degree=clf.degree, coef0=clf.coef0).diagonal() + + # Compute the sample scores = decision scores without `-\rho` + scores_ = np.dot(clf.dual_coef_, k_zx - k_xx[np.newaxis] / 2).ravel() + assert_array_almost_equal(clf.score_samples(X_test), scores_) + + # Get the decision function scores + decision_ = scores_ + clf.intercept_ # intercept_ = - \rho + assert_array_almost_equal(clf.decision_function(X_test), decision_) + + def test_oneclass_and_svdd(): # Generate a sample: two symmetrically placed clusters rnd = check_random_state(2) From 1a7083ab8ac86f866aa078e3c8d34248bf32f3b8 Mon Sep 17 00:00:00 2001 From: Ivan Date: Fri, 23 Feb 2018 23:47:42 +0300 Subject: [PATCH 11/41] BaseLibSVM interface update --- sklearn/svm/_classes.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 23f27a785de11..0d99497ec5058 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1822,7 +1822,7 @@ def _more_tags(self): } -class SVDD(BaseLibSVM): +class SVDD(BaseLibSVM, OutlierMixin): """Support Vector Data Description (SVDD) for Unsupervised Outlier Detection. Estimate the support of a high-dimensional distribution by finding the @@ -1914,12 +1914,15 @@ class SVDD(BaseLibSVM): Report, Department of Computer Science, National Taiwan University. """ + + _impl = 'svdd_l1' + def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1): super(SVDD, self).__init__( - 'svdd_l1', kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, + kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=0., nu=nu, epsilon=0., shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, verbose=verbose, max_iter=max_iter, random_state=None) From 54725055e98c537c49d2440f68a0ff1f36355444 Mon Sep 17 00:00:00 2001 From: ivannz Date: Sun, 29 Jul 2018 10:50:46 +0300 Subject: [PATCH 12/41] FIX: Updated the default gamma to reflect #10331 and tests, fixed the docstring parameter order --- sklearn/svm/_classes.py | 22 ++++++++++++++-------- sklearn/svm/tests/test_sparse.py | 4 ++-- sklearn/svm/tests/test_svm.py | 24 +++++++++++++++--------- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 0d99497ec5058..891b7ac57bb29 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1842,18 +1842,19 @@ class SVDD(BaseLibSVM, OutlierMixin): If none is given, 'rbf' will be used. If a callable is given it is used to precompute the kernel matrix. - nu : float, optional - An upper bound on the fraction of training errors and a lower bound - on the fraction of support vectors. Should be in the interval (0, 1]. - By default 0.5 will be taken. 
- degree : int, optional (default=3) Degree of the polynomial kernel function ('poly'). Ignored by all other kernels. gamma : float, optional (default='auto') Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - If gamma is 'auto' then 1/n_features will be used instead. + + Current default is 'auto' which uses 1 / n_features, + if ``gamma='scale'`` is passed then it uses 1 / (n_features * X.std()) + as value of gamma. The current default of gamma, 'auto', will change + to 'scale' in version 0.22. 'auto_deprecated', a deprecated version of + 'auto' is used as a default indicating that no explicit value of gamma + was passed. coef0 : float, optional (default=0.0) Independent term in kernel function. @@ -1862,6 +1863,11 @@ class SVDD(BaseLibSVM, OutlierMixin): tol : float, optional Tolerance for stopping criterion. + nu : float, optional + An upper bound on the fraction of training errors and a lower bound + on the fraction of support vectors. Should be in the interval (0, 1]. + By default 0.5 will be taken. + shrinking : boolean, optional Whether to use the shrinking heuristic. @@ -1917,8 +1923,8 @@ class SVDD(BaseLibSVM, OutlierMixin): _impl = 'svdd_l1' - def __init__(self, kernel='rbf', degree=3, gamma='auto', coef0=0.0, - tol=1e-3, nu=0.5, shrinking=True, cache_size=200, + def __init__(self, kernel='rbf', degree=3, gamma='auto_deprecated', + coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1): super(SVDD, self).__init__( diff --git a/sklearn/svm/tests/test_sparse.py b/sklearn/svm/tests/test_sparse.py index d34275528d877..893cddff71b6c 100644 --- a/sklearn/svm/tests/test_sparse.py +++ b/sklearn/svm/tests/test_sparse.py @@ -348,8 +348,8 @@ def test_sparse_svdd(): kernels = ["linear", "poly", "rbf", "sigmoid"] for dataset in datasets: for kernel in kernels: - clf = svm.SVDD(kernel=kernel) - sp_clf = svm.SVDD(kernel=kernel) + clf = svm.SVDD(gamma='scale', kernel=kernel) + sp_clf = svm.SVDD(gamma='scale', kernel=kernel) check_svm_model_equal(clf, sp_clf, *dataset) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index a11d31cc4d7e5..709b892b10420 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -364,15 +364,15 @@ def test_oneclass_fit_params_is_deprecated(): def test_svdd(): # Test the output of libsvm for the SVDD problem with default parameters - clf = svm.SVDD() + clf = svm.SVDD(gamma='scale') clf.fit(X) pred = clf.predict(T) assert_array_equal(pred, [-1, -1, -1]) assert_equal(pred.dtype, np.dtype('intp')) - assert_array_almost_equal(clf.intercept_, [0.491], decimal=3) + assert_array_almost_equal(clf.intercept_, [0.383], decimal=3) assert_array_almost_equal(clf.dual_coef_, - [[0.632, 0.233, 0.633, 0.234, 0.632, 0.633]], + [[0.681, 0.139, 0.680, 0.140, 0.680, 0.680]], decimal=3) assert_false(hasattr(clf, "coef_")) @@ -397,7 +397,8 @@ def test_svdd_decision_function(): X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2)) # fit the model - clf = svm.SVDD(nu=0.1, kernel="poly", degree=2, coef0=1.0).fit(X_train) + clf = svm.SVDD(gamma='scale', nu=0.1, + kernel="poly", degree=2, coef0=1.0).fit(X_train) # predict and validate things y_pred_test = clf.predict(X_test) @@ -440,17 +441,22 @@ def test_svdd_score_samples(): X_test = np.c_[xx.ravel(), yy.ravel()] # Fit the model for at least 10% support vectors - clf = svm.SVDD(nu=0.1, kernel="poly", degree=2, coef0=1.0) + clf = svm.SVDD(nu=0.1, kernel="poly", gamma='scale', degree=2, coef0=1.0) clf.fit(X_train) # Check score_samples() implementation 
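+    # (score_samples should differ from decision_function by the constant
+    # offset_, where offset_ = -intercept_ = rho; the assertion below
+    # checks exactly this identity)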
assert_array_almost_equal(clf.score_samples(X_test), clf.decision_function(X_test) + clf.offset_) + # Test the gamma="scale" + gamma = 1.0 / (X.shape[1] * X_train.std()) + + assert_almost_equal(clf._gamma, gamma) + # Compute the kernel matrices k_zx = polynomial_kernel(X_train[clf.support_], X_test, - degree=clf.degree, coef0=clf.coef0) - k_xx = polynomial_kernel(X_test, + gamma=gamma, degree=clf.degree, coef0=clf.coef0) + k_xx = polynomial_kernel(X_test, gamma=gamma, degree=clf.degree, coef0=clf.coef0).diagonal() # Compute the sample scores = decision scores without `-\rho` @@ -472,10 +478,10 @@ def test_oneclass_and_svdd(): # Test the output of libsvm for the SVDD and the One-Class SVM nu = 0.15 - svdd = svm.SVDD(nu=nu, kernel="rbf") + svdd = svm.SVDD(nu=nu, kernel="rbf", gamma="scale") svdd.fit(X_train) - ocsvm = svm.OneClassSVM(nu=nu, kernel="rbf") + ocsvm = svm.OneClassSVM(nu=nu, kernel="rbf", gamma="scale") ocsvm.fit(X_train) # The intercept of the SVDD differs from that of the One-Class SVM: From a959082cd074dcdbd98d11979183d937fcb21ef9 Mon Sep 17 00:00:00 2001 From: ivannz Date: Fri, 7 Dec 2018 10:54:08 +0300 Subject: [PATCH 13/41] TEST: fixed assertion in test_svm.py to reflect #12717 --- sklearn/svm/tests/test_svm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 709b892b10420..29afb9e69458c 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -374,7 +374,7 @@ def test_svdd(): assert_array_almost_equal(clf.dual_coef_, [[0.681, 0.139, 0.680, 0.140, 0.680, 0.680]], decimal=3) - assert_false(hasattr(clf, "coef_")) + assert not hasattr(clf, "coef_") def test_svdd_decision_function(): From 355c548c7536db2cd12178d9fb56c51c639d3971 Mon Sep 17 00:00:00 2001 From: ivannz Date: Tue, 19 Mar 2019 12:54:58 +0300 Subject: [PATCH 14/41] TST Fixed SVDD tests affected by scale redefinition in #13221 --- sklearn/svm/tests/test_svm.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 29afb9e69458c..5698332dada8c 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -368,11 +368,11 @@ def test_svdd(): clf.fit(X) pred = clf.predict(T) - assert_array_equal(pred, [-1, -1, -1]) + assert_array_equal(pred, [+1, -1, -1]) assert_equal(pred.dtype, np.dtype('intp')) - assert_array_almost_equal(clf.intercept_, [0.383], decimal=3) + assert_array_almost_equal(clf.intercept_, [0.2817], decimal=3) assert_array_almost_equal(clf.dual_coef_, - [[0.681, 0.139, 0.680, 0.140, 0.680, 0.680]], + [[0.7500, 0.7499, 0.7499, 0.7500]], decimal=3) assert not hasattr(clf, "coef_") @@ -405,7 +405,7 @@ def test_svdd_decision_function(): assert_greater(np.mean(y_pred_test == 1), .9) y_pred_outliers = clf.predict(X_outliers) - assert_greater(np.mean(y_pred_outliers == -1), .8) + assert_greater(np.mean(y_pred_outliers == -1), .65) dec_func_test = clf.decision_function(X_test) assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1) @@ -448,8 +448,8 @@ def test_svdd_score_samples(): assert_array_almost_equal(clf.score_samples(X_test), clf.decision_function(X_test) + clf.offset_) - # Test the gamma="scale" - gamma = 1.0 / (X.shape[1] * X_train.std()) + # Test the gamma="scale": use .var() for scaling (c.f. 
issue #12741) + gamma = 1.0 / (X.shape[1] * X_train.var()) assert_almost_equal(clf._gamma, gamma) From d765a406eed4f1bc05fa288dc72279b9be55fb78 Mon Sep 17 00:00:00 2001 From: Ivan Date: Thu, 25 Jul 2019 11:37:44 +0300 Subject: [PATCH 15/41] updated docstrings and default parameters --- sklearn/svm/_classes.py | 43 ++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 891b7ac57bb29..82f422ce4a5db 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1837,8 +1837,8 @@ class SVDD(BaseLibSVM, OutlierMixin): ---------- kernel : string, optional (default='rbf') Specifies the kernel type to be used in the algorithm. - It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' - or a callable. + It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or + a callable. If none is given, 'rbf' will be used. If a callable is given it is used to precompute the kernel matrix. @@ -1846,15 +1846,15 @@ class SVDD(BaseLibSVM, OutlierMixin): Degree of the polynomial kernel function ('poly'). Ignored by all other kernels. - gamma : float, optional (default='auto') + gamma : {'scale', 'auto'} or float, optional (default='scale') Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - Current default is 'auto' which uses 1 / n_features, - if ``gamma='scale'`` is passed then it uses 1 / (n_features * X.std()) - as value of gamma. The current default of gamma, 'auto', will change - to 'scale' in version 0.22. 'auto_deprecated', a deprecated version of - 'auto' is used as a default indicating that no explicit value of gamma - was passed. + - if ``gamma='scale'`` (default) is passed then it uses + 1 / (n_features * X.var()) as value of gamma, + - if 'auto', uses 1 / n_features. + + .. versionchanged:: 0.22 + The default value of ``gamma`` changed from 'auto' to 'scale'. coef0 : float, optional (default=0.0) Independent term in kernel function. @@ -1864,9 +1864,10 @@ class SVDD(BaseLibSVM, OutlierMixin): Tolerance for stopping criterion. nu : float, optional - An upper bound on the fraction of training errors and a lower bound - on the fraction of support vectors. Should be in the interval (0, 1]. - By default 0.5 will be taken. + An upper bound on the fraction of training + errors and a lower bound of the fraction of support + vectors. Should be in the interval (0, 1]. By default 0.5 + will be taken. shrinking : boolean, optional Whether to use the shrinking heuristic. @@ -1905,10 +1906,20 @@ class SVDD(BaseLibSVM, OutlierMixin): offset_ : float Offset used to define the decision function from the raw scores. - We have the relation: decision_function = score_samples - offset_. - The offset is the opposite of intercept_ and is provided for + We have the relation: decision_function = score_samples - `offset_`. + The offset is the opposite of `intercept_` and is provided for consistency with other outlier detection algorithms. + Examples + -------- + >>> from sklearn.svm import SVDD + >>> X = [[0], [0.44], [0.45], [0.46], [1]] + >>> clf = OneClassSVM(gamma='auto').fit(X) + >>> clf.predict(X) + array([-1, 1, 1, 1, -1]) + >>> clf.score_samples(X) # doctest: +ELLIPSIS + array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...]) + References ---------- .. [1] Tax, D.M. and Duin, R.P., 2004. 
"Support vector data @@ -1923,11 +1934,11 @@ class SVDD(BaseLibSVM, OutlierMixin): _impl = 'svdd_l1' - def __init__(self, kernel='rbf', degree=3, gamma='auto_deprecated', + def __init__(self, kernel='rbf', degree=3, gamma='scale', coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, verbose=False, max_iter=-1): - super(SVDD, self).__init__( + super().__init__( kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, tol=tol, C=0., nu=nu, epsilon=0., shrinking=shrinking, probability=False, cache_size=cache_size, class_weight=None, From bba31b608902937228d19662cd3a3a7625311263 Mon Sep 17 00:00:00 2001 From: Ivan Date: Thu, 25 Jul 2019 12:01:50 +0300 Subject: [PATCH 16/41] TST fixed legacy asserts _eq and _gt in SVDD realted tests --- sklearn/svm/tests/test_svm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 5698332dada8c..111ae55532913 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -369,7 +369,7 @@ def test_svdd(): pred = clf.predict(T) assert_array_equal(pred, [+1, -1, -1]) - assert_equal(pred.dtype, np.dtype('intp')) + assert pred.dtype == np.dtype('intp') assert_array_almost_equal(clf.intercept_, [0.2817], decimal=3) assert_array_almost_equal(clf.dual_coef_, [[0.7500, 0.7499, 0.7499, 0.7500]], @@ -402,10 +402,10 @@ def test_svdd_decision_function(): # predict and validate things y_pred_test = clf.predict(X_test) - assert_greater(np.mean(y_pred_test == 1), .9) + assert np.mean(y_pred_test == 1) > .9 y_pred_outliers = clf.predict(X_outliers) - assert_greater(np.mean(y_pred_outliers == -1), .65) + assert np.mean(y_pred_outliers == -1) > .65 dec_func_test = clf.decision_function(X_test) assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1) From 71ecce1da9728b998973596a8eac1b94d006ed6b Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Sat, 5 Oct 2019 21:07:55 +0300 Subject: [PATCH 17/41] Update MRO in SVDD to satisfy #14884 --- sklearn/svm/_classes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 82f422ce4a5db..35292c0871433 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1822,7 +1822,7 @@ def _more_tags(self): } -class SVDD(BaseLibSVM, OutlierMixin): +class SVDD(OutlierMixin, BaseLibSVM): """Support Vector Data Description (SVDD) for Unsupervised Outlier Detection. Estimate the support of a high-dimensional distribution by finding the From 8c60b69adc235efea6665bd4936f37cf04c6582e Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Fri, 1 Nov 2019 12:43:29 +0300 Subject: [PATCH 18/41] Simplified super() calls according to #12812 --- sklearn/svm/_classes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 35292c0871433..548d39d54ea94 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1967,8 +1967,8 @@ def fit(self, X, y=None, sample_weight=None, **params): If X is not a C-ordered contiguous array it is copied. """ - super(SVDD, self).fit(X, np.ones(_num_samples(X)), - sample_weight=sample_weight, **params) + super().fit(X, np.ones(_num_samples(X)), + sample_weight=sample_weight, **params) self.offset_ = -self._intercept_ return self @@ -2021,5 +2021,5 @@ def predict(self, X): y_pred : array, shape (n_samples,) Class labels for samples in X. 
""" - y = super(SVDD, self).predict(X) + y = super().predict(X) return np.asarray(y, dtype=np.intp) From fab45389c0550cbd133cf3e8f7b328422ba16097 Mon Sep 17 00:00:00 2001 From: ivannz Date: Fri, 10 Jan 2020 11:41:46 +0300 Subject: [PATCH 19/41] DOC new docstring guidelines in svdd (according to #16060) --- sklearn/svm/_classes.py | 69 +++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 548d39d54ea94..bbab979848852 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1823,7 +1823,7 @@ def _more_tags(self): class SVDD(OutlierMixin, BaseLibSVM): - """Support Vector Data Description (SVDD) for Unsupervised Outlier Detection. + """Support Vector Data Description for Unsupervised Outlier Detection. Estimate the support of a high-dimensional distribution by finding the tightest soft hypersphere around a data set, which permits at most a @@ -1835,18 +1835,18 @@ class SVDD(OutlierMixin, BaseLibSVM): Parameters ---------- - kernel : string, optional (default='rbf') + kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf' Specifies the kernel type to be used in the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable. If none is given, 'rbf' will be used. If a callable is given it is used to precompute the kernel matrix. - degree : int, optional (default=3) + degree : int, default=3 Degree of the polynomial kernel function ('poly'). Ignored by all other kernels. - gamma : {'scale', 'auto'} or float, optional (default='scale') + gamma : {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses @@ -1856,52 +1856,52 @@ class SVDD(OutlierMixin, BaseLibSVM): .. versionchanged:: 0.22 The default value of ``gamma`` changed from 'auto' to 'scale'. - coef0 : float, optional (default=0.0) + coef0 : float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'. - tol : float, optional + tol : float, default=1e-3 Tolerance for stopping criterion. - nu : float, optional + nu : float, default=0.5 An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]. By default 0.5 will be taken. - shrinking : boolean, optional + shrinking : bool, default=True Whether to use the shrinking heuristic. - cache_size : float, optional + cache_size : float, default=200 Specify the size of the kernel cache (in MB). - verbose : bool, default: False + verbose : bool, default=False Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context. - max_iter : int, optional (default=-1) + max_iter : int, default=-1 Hard limit on iterations within solver, or -1 for no limit. Attributes ---------- - support_ : array-like, shape = [n_SV] + support_ : ndarray of shape (n_SV,) Indices of support vectors. - support_vectors_ : array-like, shape = [nSV, n_features] + support_vectors_ : ndarray of shape (n_SV, n_features) Support vectors. - dual_coef_ : array, shape = [1, n_SV] + dual_coef_ : ndarray of shape (1, n_SV) Coefficients of the support vectors in the decision function. - coef_ : array, shape = [1, n_features] + coef_ : ndarray of shape (1, n_features) Weights assigned to the features (coefficients in the primal problem). 
This is only available in the case of a linear kernel. `coef_` is readonly property derived from `dual_coef_` and `support_vectors_` - intercept_ : array, shape = [1,] + intercept_ : ndarray of shape (1,) The constant in the decision function. offset_ : float @@ -1910,6 +1910,9 @@ class SVDD(OutlierMixin, BaseLibSVM): The offset is the opposite of `intercept_` and is provided for consistency with other outlier detection algorithms. + fit_status_ : int + 0 if correctly fitted, 1 otherwise (will raise warning) + Examples -------- >>> from sklearn.svm import SVDD @@ -1949,18 +1952,20 @@ def fit(self, X, y=None, sample_weight=None, **params): Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) Set of samples, where n_samples is the number of samples and n_features is the number of features. - sample_weight : array-like, shape (n_samples,) - Per-sample weights. Higher weights force the novelty detector - to put more emphasis on these points. + sample_weight : array-like of shape (n_samples,), default=None + Per-sample weights. Rescale C per sample. Higher weights + force the classifier to put more emphasis on these points. + + y : Ignored + not used, present for API consistency by convention. Returns ------- self : object - Returns self. Notes ----- @@ -1979,11 +1984,12 @@ def decision_function(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) + The data matrix. Returns ------- - X : array-like, shape (n_samples,) + dec : ndarray of shape (n_samples,) Returns the decision function of the samples. """ dec = self._decision_function(X).ravel() @@ -1994,31 +2000,32 @@ def score_samples(self, X): Parameters ---------- - X : array-like, shape (n_samples, n_features) + X : array-like of shape (n_samples, n_features) + The data matrix. Returns ------- - score_samples : array-like, shape (n_samples,) + score_samples : ndarray of shape (n_samples,) Returns the (unshifted) scoring function of the samples. """ score_samples = self.decision_function(X) + self.offset_ return score_samples def predict(self, X): - """ - Perform classification on samples in X. + """Perform classification on samples in X. - For an one-class model, +1 or -1 is returned. + For a one-class model, +1 or -1 is returned. Parameters ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) + X : {array-like, sparse matrix} of shape (n_samples, n_features) or \ + (n_samples_test, n_samples_train) For kernel="precomputed", the expected shape of X is - [n_samples_test, n_samples_train] + (n_samples_test, n_samples_train). Returns ------- - y_pred : array, shape (n_samples,) + y_pred : ndarray of shape (n_samples,) Class labels for samples in X. 
""" y = super().predict(X) From 7082a56b73b8e0d289d09b28f7fa3953db90690a Mon Sep 17 00:00:00 2001 From: ivannz Date: Fri, 10 Jan 2020 11:51:09 +0300 Subject: [PATCH 20/41] DOCTEST fixed object and corrected reference scores --- sklearn/svm/_classes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index bbab979848852..a75482a7eb380 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1917,11 +1917,11 @@ class SVDD(OutlierMixin, BaseLibSVM): -------- >>> from sklearn.svm import SVDD >>> X = [[0], [0.44], [0.45], [0.46], [1]] - >>> clf = OneClassSVM(gamma='auto').fit(X) + >>> clf = SVDD(gamma='auto').fit(X) >>> clf.predict(X) array([-1, 1, 1, 1, -1]) >>> clf.score_samples(X) # doctest: +ELLIPSIS - array([1.7798..., 2.0547..., 2.0556..., 2.0561..., 1.7332...]) + array([0.5298..., 0.8047..., 0.8056..., 0.8061..., 0.4832...]) References ---------- From dbbc90bd6eccd7e01e9e1b65aae43662044866c3 Mon Sep 17 00:00:00 2001 From: ivannz Date: Fri, 10 Jan 2020 13:38:21 +0300 Subject: [PATCH 21/41] fixed sphinx warnings due to bad indentation for circleci --- doc/modules/svm.rst | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 2a79b9393dee7..2f2ef8474a776 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -831,17 +831,17 @@ where :math:`+1` indicates an inliner and :math:`-1` an outlier. The parameter :math:`\nu\in(0,1]` determines the fraction of outliers in the training dataset. More technically :math:`\nu` is: - * an upper bound on the fraction of the training points lying outside - the estimated region; - * a lower bound on the fraction of support vectors. + - an upper bound on the fraction of the training points lying outside + the estimated region; + - a lower bound on the fraction of support vectors. .. topic:: References: - * `Estimating the support of a high-dimensional distribution - `_ Schölkopf, - Bernhard, et al. Neural computation 13.7 (2001): 1443-1471. - doi:10.1162/089976601750264965 + * `Estimating the support of a high-dimensional distribution + `_ Schölkopf, + Bernhard, et al. Neural computation 13.7 (2001): 1443-1471. + doi:10.1162/089976601750264965 .. _svm_svdd: @@ -945,14 +945,14 @@ for a particular family of kernels (see :ref:`outlier_detection_ocsvm_vs_svdd`). .. topic:: References: - * `Support vector data description - `_ - Tax, and Duin. Machine learning, 54(1) (2004), pp.45-66. + * `Support vector data description + `_ + Tax, and Duin. Machine learning, 54(1) (2004), pp.45-66. - * `A revisit to support vector data description (SVDD). - `_ Chang, Lee, - and Lin. Technical Report (2013), Dept. of Computer Science, - National Taiwan University. + * `A revisit to support vector data description (SVDD). + `_ Chang, Lee, + and Lin. Technical Report (2013), Dept. of Computer Science, + National Taiwan University. .. 
_svm_implementation_details: From c13582dce61c1b08059862815f9b7f44ff00bd8e Mon Sep 17 00:00:00 2001 From: ivannz Date: Thu, 30 Jan 2020 23:00:32 +0300 Subject: [PATCH 22/41] fixed unresolved conflict --- sklearn/svm/_libsvm.pyx | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/sklearn/svm/_libsvm.pyx b/sklearn/svm/_libsvm.pyx index 4ca510c9dcdf3..a2b0b7d0a82ff 100644 --- a/sklearn/svm/_libsvm.pyx +++ b/sklearn/svm/_libsvm.pyx @@ -74,15 +74,9 @@ def fit( Y : array, dtype=float64 of shape (n_samples,) target vector -<<<<<<< HEAD - svm_type : {0, 1, 2, 3, 4}, default=0 - Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR - respectively. -======= svm_type : {0, 1, 2, 3, 4, 5}, optional Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR, NuSVR, or SVDD-L1 respectively. 0 by default. ->>>>>>> ENH: nu-SVDD with sample weights, based on Chang, Lee, Lin (2013) kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default="rbf" Kernel to use in the model: linear, polynomial, RBF, sigmoid @@ -614,18 +608,12 @@ def cross_validation( Y : array, dtype=float of shape (n_samples,) target vector -<<<<<<< HEAD n_fold : int32 Number of folds for cross validation. -======= + svm_type : {0, 1, 2, 3, 4, 5} Type of SVM: C SVC, nu SVC, one class, epsilon SVR, nu SVR, or SVDD-L1. ->>>>>>> ENH: nu-SVDD with sample weights, based on Chang, Lee, Lin (2013) - - svm_type : {0, 1, 2, 3, 4}, default=0 - Type of SVM: C_SVC, NuSVC, OneClassSVM, EpsilonSVR or NuSVR - respectively. kernel : {'linear', 'rbf', 'poly', 'sigmoid', 'precomputed'}, default='rbf' Kernel to use in the model: linear, polynomial, RBF, sigmoid From 90085e045e4280f86c53bca22a31fac4bf408192 Mon Sep 17 00:00:00 2001 From: ivannz Date: Thu, 30 Jan 2020 23:01:24 +0300 Subject: [PATCH 23/41] fixed unused kwarg warning --- examples/svm/plot_oneclass_vs_svdd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/svm/plot_oneclass_vs_svdd.py b/examples/svm/plot_oneclass_vs_svdd.py index bc19ad53bdd10..9f9b31e16fc71 100644 --- a/examples/svm/plot_oneclass_vs_svdd.py +++ b/examples/svm/plot_oneclass_vs_svdd.py @@ -79,7 +79,7 @@ ax.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred', zorder=-98) a = ax.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred', - zorder=-97, label="learned frontier") + zorder=-97) s = 40 b1 = ax.scatter(X_train[:, 0], X_train[:, 1], s=s, From 84a1dcbda838168acd0ca0471be6bc375d1835b9 Mon Sep 17 00:00:00 2001 From: ivannz Date: Thu, 30 Jan 2020 23:08:36 +0300 Subject: [PATCH 24/41] some oneliners --- sklearn/svm/_classes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index a75482a7eb380..9b7aa17d68ace 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1992,8 +1992,7 @@ def decision_function(self, X): dec : ndarray of shape (n_samples,) Returns the decision function of the samples. """ - dec = self._decision_function(X).ravel() - return dec + return self._decision_function(X).ravel() def score_samples(self, X): """Raw scoring function of the samples. @@ -2008,8 +2007,7 @@ def score_samples(self, X): score_samples : ndarray of shape (n_samples,) Returns the (unshifted) scoring function of the samples. """ - score_samples = self.decision_function(X) + self.offset_ - return score_samples + return self.decision_function(X) + self.offset_ def predict(self, X): """Perform classification on samples in X. 
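
A note on the one-liner refactoring above: it leans on the invariant that `score_samples` is simply the decision function shifted back by `offset_`, and that `offset_` is the negated `intercept_` (set at the end of `fit`). A minimal sketch of these invariants, assuming the `SVDD` estimator from this branch is importable (it is not part of any released scikit-learn)::

    import numpy as np
    from sklearn.svm import SVDD  # available on this branch only

    rng = np.random.RandomState(0)
    X = rng.randn(100, 2)

    clf = SVDD(kernel="rbf", gamma=0.5, nu=0.1).fit(X)

    # score_samples is the raw score; decision_function subtracts offset_
    assert np.allclose(clf.score_samples(X),
                       clf.decision_function(X) + clf.offset_)

    # offset_ mirrors OneClassSVM's convention: the negated intercept_
    assert np.allclose(clf.offset_, -clf.intercept_)
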
From 3d39584240c978c4fe1d809ae2000fdaaa5b0d53 Mon Sep 17 00:00:00 2001
From: ivannz
Date: Thu, 30 Jan 2020 23:27:12 +0300
Subject: [PATCH 25/41] side-by-side comparison of ocsvm with svdd (stationary
 kernel)

---
 examples/miscellaneous/plot_anomaly_comparison.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/miscellaneous/plot_anomaly_comparison.py b/examples/miscellaneous/plot_anomaly_comparison.py
index efb4f6d86edfc..3f42cd8e54c2c 100644
--- a/examples/miscellaneous/plot_anomaly_comparison.py
+++ b/examples/miscellaneous/plot_anomaly_comparison.py
@@ -108,6 +108,7 @@
             ),
         ),
     ),
+    ("SVDD", svm.SVDD(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
     (
         "Isolation Forest",
         IsolationForest(contamination=outliers_fraction, random_state=42),

From 4d7217d64c23c5cf7760e29e80e58cf0bf8fbf4a Mon Sep 17 00:00:00 2001
From: ivannz
Date: Fri, 31 Jan 2020 00:04:09 +0300
Subject: [PATCH 26/41] moved SVDD announcement from v0.20 to v0.23

---
 doc/whats_new/v0.20.rst | 4 ----
 doc/whats_new/v0.23.rst | 4 ++++
 sklearn/svm/_classes.py | 5 ++---
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index 21437674e0333..add4d97f6de09 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -1486,10 +1486,6 @@ Support for Python 3.3 has been officially dropped.
    version 0.22 to account better for unscaled features. :issue:`8361` by
    :user:`Gaurav Dhingra ` and :user:`Ting Neo `.
 
-- |Feature| Added the :class:`svm.SVDD` class for novelty detection based on
-  soft minimal volume hypersphere around the sample data.
-  :user:`Ivan Nazarov `.
-
 :mod:`sklearn.tree`
 ...................
 
diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index ebf63eac5b8a3..4fef8e3e0a90c 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -721,6 +721,10 @@ Changelog
   `probB_`, are now deprecated as they were not useful. :pr:`15558`
   by `Thomas Fan`_.
 
+- |Feature| Added the :class:`svm.SVDD` class for novelty detection based on
+  soft minimal volume hypersphere around the sample data.
+  :user:`Ivan Nazarov `.
+
 :mod:`sklearn.tree`
 ...................
 
diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index 9b7aa17d68ace..afe29f23ba5b0 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -1833,6 +1833,8 @@ class SVDD(OutlierMixin, BaseLibSVM):
 
     Read more in the :ref:`User Guide `.
 
+    ..versionadded: 0.23
+
     Parameters
     ----------
     kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'
@@ -1853,9 +1855,6 @@ class SVDD(OutlierMixin, BaseLibSVM):
         1 / (n_features * X.var()) as value of gamma,
         - if 'auto', uses 1 / n_features.
 
-        .. versionchanged:: 0.22
-           The default value of ``gamma`` changed from 'auto' to 'scale'.
-
     coef0 : float, default=0.0
         Independent term in kernel function.
        It is only significant in 'poly' and 'sigmoid'.
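
The comparison entry added in patch 25 above sets ``nu`` to the expected fraction of outliers. A self-contained sketch of the same pattern follows; the blob layout, the sample sizes and ``gamma=0.1`` are illustrative assumptions, not values taken from the example script::

    import numpy as np
    from sklearn import svm
    from sklearn.datasets import make_blobs

    outliers_fraction = 0.15
    rng = np.random.RandomState(42)

    # 85 inliers drawn from a blob plus 15 uniform outliers
    X_inliers, _ = make_blobs(n_samples=85, centers=[[0, 0]],
                              cluster_std=0.5, random_state=42)
    X_outliers = rng.uniform(low=-4, high=4, size=(15, 2))
    X = np.vstack([X_inliers, X_outliers])

    for name, model in [
        ("OCSVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
        ("SVDD", svm.SVDD(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
    ]:
        y_pred = model.fit(X).predict(X)  # +1 for inliers, -1 for outliers
        print(name, "flagged", int((y_pred == -1).sum()), "of", len(X), "samples")
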
From 2654479e215ac15b44fd59181c4d8626c8b432b9 Mon Sep 17 00:00:00 2001
From: ivannz
Date: Fri, 31 Jan 2020 00:30:05 +0300
Subject: [PATCH 27/41] removed hardcoded sample sizes

---
 examples/svm/plot_oneclass_vs_svdd.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/svm/plot_oneclass_vs_svdd.py b/examples/svm/plot_oneclass_vs_svdd.py
index 9f9b31e16fc71..7353a1d09aae3 100644
--- a/examples/svm/plot_oneclass_vs_svdd.py
+++ b/examples/svm/plot_oneclass_vs_svdd.py
@@ -90,9 +90,11 @@
     ax.set_xlim((-6, 6))
     ax.set_ylim((-6, 6))
 
-    ax.set_title("%s %s (%d/200, %d/40, %d/40)"
-                 % (model_name, kernel_name, n_error_train,
-                    n_error_test, n_error_outliers))
+    ax.set_title("%s %s (%d/%d, %d/%d, %d/%d)"
+                 % (model_name, kernel_name,
+                    n_error_train, len(X_train),
+                    n_error_test, len(X_test),
+                    n_error_outliers, len(X_outliers)))
 
     ax.legend([a.collections[0], b1, b2, c],
               ["learned frontier", "training observations",

From 1e54626decf7e541532596ec3e57c099ab69ab3c Mon Sep 17 00:00:00 2001
From: ivannz
Date: Sat, 30 May 2020 18:29:25 +0300
Subject: [PATCH 28/41] patches to svdd-l1 reflecting #14286, #16530, #16992
 and #16973

---
 sklearn/svm/_classes.py        |  4 ++--
 sklearn/svm/src/libsvm/svm.cpp | 11 ++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index afe29f23ba5b0..ea452574a3f7a 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -1919,7 +1919,7 @@ class SVDD(OutlierMixin, BaseLibSVM):
     >>> clf = SVDD(gamma='auto').fit(X)
     >>> clf.predict(X)
     array([-1,  1,  1,  1, -1])
-    >>> clf.score_samples(X)  # doctest: +ELLIPSIS
+    >>> clf.score_samples(X)
     array([0.5298..., 0.8047..., 0.8056..., 0.8061..., 0.4832...])
 
     References
@@ -1936,7 +1936,7 @@ class SVDD(OutlierMixin, BaseLibSVM):
 
     _impl = 'svdd_l1'
 
-    def __init__(self, kernel='rbf', degree=3, gamma='scale',
+    def __init__(self, *, kernel='rbf', degree=3, gamma='scale',
                  coef0=0.0, tol=1e-3, nu=0.5, shrinking=True,
                  cache_size=200, verbose=False, max_iter=-1):
 
diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp
index 8becae88ece14..21bec8bd93d7d 100644
--- a/sklearn/svm/src/libsvm/svm.cpp
+++ b/sklearn/svm/src/libsvm/svm.cpp
@@ -1840,14 +1840,14 @@ static void solve_nu_svr(
 
 static void solve_svdd_l1(
 	const PREFIX(problem) *prob, const svm_parameter *param,
-	double *alpha, Solver::SolutionInfo* si)
+	double *alpha, Solver::SolutionInfo* si, BlasFunctions *blas_functions)
 {
 	int l = prob->l;
 	int i, j;
 	double r_square;
 
-	ONE_CLASS_Q Q = ONE_CLASS_Q(*prob, *param);
+	ONE_CLASS_Q Q = ONE_CLASS_Q(*prob, *param, blas_functions);
 
 	if(param->nu < 1)
 	{
 		// case \nu < 1: the dual problem is
@@ -1990,7 +1990,7 @@ static decision_function svm_train_one(
 			break;
 		case SVDD_L1:
 			si.upper_bound = Malloc(double,prob->l);
-			solve_svdd_l1(prob,param,alpha,&si);
+			solve_svdd_l1(prob,param,alpha,&si,blas_functions);
 			break;
 	}
 
@@ -2945,7 +2945,7 @@ double PREFIX(predict_values)(const PREFIX(model) *model, const PREFIX(node) *x,
 
 	if(model->param.svm_type == SVDD_L1)
 	{
-		double K_xx = NAMESPACE::Kernel::k_function(x,x,model->param) / 2;
+		double K_xx = NAMESPACE::Kernel::k_function(x,x,model->param,blas_functions) / 2;
 		for(int i=0;i<model->l;i++)
 			sum -= sv_coef[i] * K_xx;
 	}
@@ -3266,7 +3266,8 @@ const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_param
 	if(svm_type == C_SVC ||
 	   svm_type == EPSILON_SVR ||
 	   svm_type == NU_SVR ||
-	   svm_type == ONE_CLASS)
+	   svm_type == ONE_CLASS ||
+	   svm_type == SVDD_L1)
 	{
 		PREFIX(problem) newprob;
 		// filter samples
with negative and null weights

From 7a0ede0df3fc7919cbd40842798288394df5833d Mon Sep 17 00:00:00 2001
From: ivannz
Date: Sun, 5 Jul 2020 14:39:44 +0300
Subject: [PATCH 29/41] reflect #17176: zero weight in SV models means that a
 sample is never a support vector

---
 sklearn/svm/_classes.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index ea452574a3f7a..e6413a45100f3 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -2027,3 +2027,11 @@ def predict(self, X):
         """
         y = super().predict(X)
         return np.asarray(y, dtype=np.intp)
+
+    def _more_tags(self):
+        return {
+            '_xfail_checks': {
+                'check_sample_weights_invariance(kind=zeros)':
+                'zero sample_weight is not equivalent to removing samples',
+            }
+        }

From 6bf0fb549e33b40433ffe2e95c8c0b3612454c89 Mon Sep 17 00:00:00 2001
From: ivannz
Date: Sun, 5 Jul 2020 14:42:34 +0300
Subject: [PATCH 30/41] reflect #15521: document attributes inherited by SVDD
 from BaseLibSVM

---
 sklearn/svm/_classes.py | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index e6413a45100f3..b87bb35b5fd88 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -1870,6 +1870,7 @@ class SVDD(OutlierMixin, BaseLibSVM):
 
     shrinking : bool, default=True
         Whether to use the shrinking heuristic.
+        See the :ref:`User Guide `.
 
     cache_size : float, default=200
         Specify the size of the kernel cache (in MB).
@@ -1884,33 +1885,43 @@ class SVDD(OutlierMixin, BaseLibSVM):
 
     Attributes
     ----------
-    support_ : ndarray of shape (n_SV,)
-        Indices of support vectors.
-
-    support_vectors_ : ndarray of shape (n_SV, n_features)
-        Support vectors.
-
-    dual_coef_ : ndarray of shape (1, n_SV)
-        Coefficients of the support vectors in the decision function.
+    class_weight_ : ndarray of shape (n_classes,)
+        Multipliers of parameter C for each class.
+        Computed based on the ``class_weight`` parameter.
 
     coef_ : ndarray of shape (1, n_features)
         Weights assigned to the features (coefficients in the primal
         problem). This is only available in the case of a linear kernel.
 
         `coef_` is readonly property derived from `dual_coef_` and
-        `support_vectors_`
+        `support_vectors_`.
+
+    dual_coef_ : ndarray of shape (1, n_SV)
+        Coefficients of the support vectors in the decision function.
+
+    fit_status_ : int
+        0 if correctly fitted, 1 otherwise (will raise warning)
 
     intercept_ : ndarray of shape (1,)
         The constant in the decision function.
 
+    n_support_ : ndarray of shape (n_classes,), dtype=int32
+        Number of support vectors for each class.
+
     offset_ : float
         Offset used to define the decision function from the raw scores.
         We have the relation: decision_function = score_samples - `offset_`.
         The offset is the opposite of `intercept_` and is provided for
         consistency with other outlier detection algorithms.
 
-    fit_status_ : int
-        0 if correctly fitted, 1 otherwise (will raise warning)
+    shape_fit_ : tuple of int of shape (n_dimensions_of_X,)
+        Array dimensions of training vector ``X``.
+
+    support_ : ndarray of shape (n_SV,)
+        Indices of support vectors.
+
+    support_vectors_ : ndarray of shape (n_SV, n_features)
+        Support vectors.
     Examples
     --------

From 00f32796881b3205d78f234054a3020d205ceac7 Mon Sep 17 00:00:00 2001
From: ivannz
Date: Sun, 5 Jul 2020 14:52:19 +0300
Subject: [PATCH 31/41] reflect #14286: test for negative or null
 sample_weights in SVDD

---
 sklearn/svm/tests/test_svm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py
index 111ae55532913..30a453ef8f4ca 100644
--- a/sklearn/svm/tests/test_svm.py
+++ b/sklearn/svm/tests/test_svm.py
@@ -715,8 +715,9 @@ def test_svm_equivalence_sample_weight_C():
         (svm.SVR, "Invalid input - all samples have zero or negative weights."),
         (svm.NuSVR, "Invalid input - all samples have zero or negative weights."),
         (svm.OneClassSVM, "Invalid input - all samples have zero or negative weights."),
+        (svm.SVDD, "Invalid input - all samples have zero or negative weights."),
     ],
-    ids=["SVC", "NuSVC", "SVR", "NuSVR", "OneClassSVM"],
+    ids=["SVC", "NuSVC", "SVR", "NuSVR", "OneClassSVM", "SVDD"],
 )
 @pytest.mark.parametrize(
     "sample_weight",

From 0e015e0d8aa9c703dc0980fd78fb2aa00b758188 Mon Sep 17 00:00:00 2001
From: ivannz
Date: Thu, 15 Oct 2020 15:16:12 +0300
Subject: [PATCH 32/41] update mode in more-tags (reflecting #17361) see-also
 cross-reference in ocSVM and SVDD (reflecting #18332)

---
 sklearn/svm/_classes.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index b87bb35b5fd88..3908d67356fb5 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -1654,6 +1654,10 @@ class OneClassSVM(OutlierMixin, BaseLibSVM):
         Local Outlier Factor (LOF).
     sklearn.ensemble.IsolationForest : Isolation Forest Algorithm.
 
+    sklearn.svm.SVDD : Support vector method for outlier detection via
+        a separating soft-margin hypersphere implemented with libsvm with
+        a parameter to control the number of support vectors.
+
     Examples
     --------
     >>> from sklearn.svm import OneClassSVM
@@ -1933,6 +1937,12 @@ class SVDD(OutlierMixin, BaseLibSVM):
     >>> clf.score_samples(X)
     array([0.5298..., 0.8047..., 0.8056..., 0.8061..., 0.4832...])
 
+    See Also
+    --------
+    OneClassSVM : Support vector method for outlier detection via a separating
+        soft-margin hyperplane implemented with libsvm with a parameter to
+        control the number of support vectors.
+
     References
     ----------
     .. [1] Tax, D.M. and Duin, R.P., 2004. "Support vector data
@@ -2042,7 +2052,7 @@ def predict(self, X):
     def _more_tags(self):
         return {
             '_xfail_checks': {
-                'check_sample_weights_invariance(kind=zeros)':
+                'check_sample_weights_invariance':
                 'zero sample_weight is not equivalent to removing samples',
             }
         }

From 6bb003fb171f1c36275f1d9650d1a2069e8eac81 Mon Sep 17 00:00:00 2001
From: ivannz
Date: Thu, 25 Feb 2021 23:55:41 +0300
Subject: [PATCH 33/41] moved SVDD announcement from v0.23 to v1.0

---
 doc/whats_new/v0.23.rst | 4 ----
 doc/whats_new/v1.0.rst  | 7 +++++++
 sklearn/svm/_classes.py | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/doc/whats_new/v0.23.rst b/doc/whats_new/v0.23.rst
index 4fef8e3e0a90c..ebf63eac5b8a3 100644
--- a/doc/whats_new/v0.23.rst
+++ b/doc/whats_new/v0.23.rst
@@ -721,10 +721,6 @@ Changelog
   `probB_`, are now deprecated as they were not useful. :pr:`15558`
   by `Thomas Fan`_.
 
-- |Feature| Added the :class:`svm.SVDD` class for novelty detection based on
-  soft minimal volume hypersphere around the sample data.
-  :user:`Ivan Nazarov `.
-
 :mod:`sklearn.tree`
 ...................
 
diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index 6ece2f16b6e93..b47ebc9123c23 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -1213,6 +1213,13 @@ Changelog now deprecated. Use `scipy.sparse.csgraph.shortest_path` instead. :pr:`20531` by `Tom Dupre la Tour`_. +:mod:`sklearn.svm` +.................. + +- |Feature| Added the :class:`svm.SVDD` class for novelty detection based + on soft minimal volume hypersphere around the sample data. :pr:`7910` + by :user:`Ivan Nazarov `. + Code and Documentation Contributors ----------------------------------- diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 3908d67356fb5..988d01d542a71 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1837,7 +1837,7 @@ class SVDD(OutlierMixin, BaseLibSVM): Read more in the :ref:`User Guide `. - ..versionadded: 0.23 + ..versionadded: 1.0 Parameters ---------- From fd436052a15437e6ea2936e123faf22c76780a3e Mon Sep 17 00:00:00 2001 From: ivannz Date: Tue, 15 Jun 2021 14:48:46 +0300 Subject: [PATCH 34/41] docfix in SVDD related to #20236 --- sklearn/svm/_classes.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 988d01d542a71..a114899f7693e 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1909,6 +1909,11 @@ class SVDD(OutlierMixin, BaseLibSVM): intercept_ : ndarray of shape (1,) The constant in the decision function. + n_features_in_ : int + Number of features seen during :term:`fit`. + + .. versionadded:: 0.24 + n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. From b0f4926d40aa43b1663b7333cb616a022ca7e7b2 Mon Sep 17 00:00:00 2001 From: ivannz Date: Fri, 23 Jul 2021 12:49:26 +0300 Subject: [PATCH 35/41] migrate svdd code style to Black (#18948) ensure SVDD passes numpydoc validation (#20463) check for svdd in `test_sparse.py:check_svm_model_equal` to avoid calling `.predict_proba` --- sklearn/svm/__init__.py | 3 +- sklearn/svm/_base.py | 3 +- sklearn/svm/_classes.py | 80 +++++++++++++++++++++----------- sklearn/svm/tests/test_sparse.py | 18 ++++--- sklearn/svm/tests/test_svm.py | 43 ++++++++--------- 5 files changed, 88 insertions(+), 59 deletions(-) diff --git a/sklearn/svm/__init__.py b/sklearn/svm/__init__.py index 34976e71e797a..fad79458656d1 100644 --- a/sklearn/svm/__init__.py +++ b/sklearn/svm/__init__.py @@ -10,8 +10,7 @@ # of their respective owners. # License: BSD 3 clause (C) INRIA 2010 -from ._classes import SVC, NuSVC, SVR, NuSVR, OneClassSVM, LinearSVC, \ - LinearSVR, SVDD +from ._classes import SVC, NuSVC, SVR, NuSVR, OneClassSVM, LinearSVC, LinearSVR, SVDD from ._bounds import l1_min_c __all__ = [ diff --git a/sklearn/svm/_base.py b/sklearn/svm/_base.py index c18589d9a14bc..52ac82797afb9 100644 --- a/sklearn/svm/_base.py +++ b/sklearn/svm/_base.py @@ -27,8 +27,7 @@ from ..exceptions import NotFittedError -LIBSVM_IMPL = ['c_svc', 'nu_svc', 'one_class', 'epsilon_svr', 'nu_svr', - 'svdd_l1'] +LIBSVM_IMPL = ["c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr", "svdd_l1"] def _one_vs_one_coef(dual_coef, n_support, support_vectors): diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index a114899f7693e..4747854001999 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1932,16 +1932,6 @@ class SVDD(OutlierMixin, BaseLibSVM): support_vectors_ : ndarray of shape (n_SV, n_features) Support vectors. 
- Examples - -------- - >>> from sklearn.svm import SVDD - >>> X = [[0], [0.44], [0.45], [0.46], [1]] - >>> clf = SVDD(gamma='auto').fit(X) - >>> clf.predict(X) - array([-1, 1, 1, 1, -1]) - >>> clf.score_samples(X) - array([0.5298..., 0.8047..., 0.8056..., 0.8061..., 0.4832...]) - See Also -------- OneClassSVM : Support vector method for outlier detection via a separating @@ -1958,22 +1948,55 @@ class SVDD(OutlierMixin, BaseLibSVM): to support vector data description (SVDD)." Technical Report, Department of Computer Science, National Taiwan University. + + Examples + -------- + >>> from sklearn.svm import SVDD + >>> X = [[0], [0.44], [0.45], [0.46], [1]] + >>> clf = SVDD(gamma='auto').fit(X) + >>> clf.predict(X) + array([-1, 1, 1, 1, -1]) + >>> clf.score_samples(X) + array([0.5298..., 0.8047..., 0.8056..., 0.8061..., 0.4832...]) """ - _impl = 'svdd_l1' + _impl = "svdd_l1" - def __init__(self, *, kernel='rbf', degree=3, gamma='scale', - coef0=0.0, tol=1e-3, nu=0.5, shrinking=True, cache_size=200, - verbose=False, max_iter=-1): + def __init__( + self, + *, + kernel="rbf", + degree=3, + gamma="scale", + coef0=0.0, + tol=1e-3, + nu=0.5, + shrinking=True, + cache_size=200, + verbose=False, + max_iter=-1, + ): super().__init__( - kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, - tol=tol, C=0., nu=nu, epsilon=0., shrinking=shrinking, - probability=False, cache_size=cache_size, class_weight=None, - verbose=verbose, max_iter=max_iter, random_state=None) + kernel=kernel, + degree=degree, + gamma=gamma, + coef0=coef0, + tol=tol, + C=0.0, + nu=nu, + epsilon=0.0, + shrinking=shrinking, + probability=False, + cache_size=cache_size, + class_weight=None, + verbose=verbose, + max_iter=max_iter, + random_state=None, + ) def fit(self, X, y=None, sample_weight=None, **params): - """Learns the soft minimum volume hypersphere around the sample X. + """Learn a soft minimum-volume hypersphere around the sample X. Parameters ---------- @@ -1981,24 +2004,26 @@ def fit(self, X, y=None, sample_weight=None, **params): Set of samples, where n_samples is the number of samples and n_features is the number of features. + y : Ignored + Not used, present for API consistency by convention. + sample_weight : array-like of shape (n_samples,), default=None Per-sample weights. Rescale C per sample. Higher weights force the classifier to put more emphasis on these points. - y : Ignored - not used, present for API consistency by convention. + **params : dict + Additional fit parameters. Returns ------- self : object + Fitted estimator. Notes ----- If X is not a C-ordered contiguous array it is copied. 
- """ - super().fit(X, np.ones(_num_samples(X)), - sample_weight=sample_weight, **params) + super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight, **params) self.offset_ = -self._intercept_ return self @@ -2056,8 +2081,9 @@ def predict(self, X): def _more_tags(self): return { - '_xfail_checks': { - 'check_sample_weights_invariance': - 'zero sample_weight is not equivalent to removing samples', + "_xfail_checks": { + "check_sample_weights_invariance": ( + "zero sample_weight is not equivalent to removing samples" + ), } } diff --git a/sklearn/svm/tests/test_sparse.py b/sklearn/svm/tests/test_sparse.py index 893cddff71b6c..0ab99d557125c 100644 --- a/sklearn/svm/tests/test_sparse.py +++ b/sklearn/svm/tests/test_sparse.py @@ -74,6 +74,8 @@ def check_svm_model_equal(dense_svm, sparse_svm, X_train, y_train, X_test): ) if isinstance(dense_svm, svm.OneClassSVM): msg = "cannot use sparse input in 'OneClassSVM' trained on dense data" + elif isinstance(dense_svm, svm.SVDD): + msg = "cannot use sparse input in 'SVDD' trained on dense data" else: assert_array_almost_equal( dense_svm.predict_proba(X_test_dense), sparse_svm.predict_proba(X_test), 4 @@ -336,20 +338,22 @@ def test_sparse_oneclasssvm(datasets_index, kernel): def test_sparse_svdd(): - """Check that sparse SVDD gives the same result as dense SVDD - """ + """Check that sparse SVDD gives the same result as dense SVDD""" # many class dataset: X_blobs, _ = make_blobs(n_samples=100, centers=10, random_state=0) X_blobs = sparse.csr_matrix(X_blobs) - datasets = [[X_sp, None, T], [X2_sp, None, T2], - [X_blobs[:80], None, X_blobs[80:]], - [iris.data, None, iris.data]] + datasets = [ + [X_sp, None, T], + [X2_sp, None, T2], + [X_blobs[:80], None, X_blobs[80:]], + [iris.data, None, iris.data], + ] kernels = ["linear", "poly", "rbf", "sigmoid"] for dataset in datasets: for kernel in kernels: - clf = svm.SVDD(gamma='scale', kernel=kernel) - sp_clf = svm.SVDD(gamma='scale', kernel=kernel) + clf = svm.SVDD(gamma="scale", kernel=kernel) + sp_clf = svm.SVDD(gamma="scale", kernel=kernel) check_svm_model_equal(clf, sp_clf, *dataset) diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 30a453ef8f4ca..83fc4bf379e01 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -364,16 +364,16 @@ def test_oneclass_fit_params_is_deprecated(): def test_svdd(): # Test the output of libsvm for the SVDD problem with default parameters - clf = svm.SVDD(gamma='scale') + clf = svm.SVDD(gamma="scale") clf.fit(X) pred = clf.predict(T) assert_array_equal(pred, [+1, -1, -1]) - assert pred.dtype == np.dtype('intp') + assert pred.dtype == np.dtype("intp") assert_array_almost_equal(clf.intercept_, [0.2817], decimal=3) - assert_array_almost_equal(clf.dual_coef_, - [[0.7500, 0.7499, 0.7499, 0.7500]], - decimal=3) + assert_array_almost_equal( + clf.dual_coef_, [[0.7500, 0.7499, 0.7499, 0.7500]], decimal=3 + ) assert not hasattr(clf, "coef_") @@ -397,15 +397,15 @@ def test_svdd_decision_function(): X_outliers = rnd.uniform(low=-4, high=4, size=(20, 2)) # fit the model - clf = svm.SVDD(gamma='scale', nu=0.1, - kernel="poly", degree=2, coef0=1.0).fit(X_train) + clf = svm.SVDD(gamma="scale", nu=0.1, kernel="poly", degree=2, coef0=1.0) + clf.fit(X_train) # predict and validate things y_pred_test = clf.predict(X_test) - assert np.mean(y_pred_test == 1) > .9 + assert np.mean(y_pred_test == 1) > 0.9 y_pred_outliers = clf.predict(X_outliers) - assert np.mean(y_pred_outliers == -1) > .65 + assert np.mean(y_pred_outliers == -1) > 
0.65 dec_func_test = clf.decision_function(X_test) assert_array_equal((dec_func_test > 0).ravel(), y_pred_test == 1) @@ -436,17 +436,17 @@ def test_svdd_score_samples(): X_train = np.r_[X + 2, X - 2] # Evaluate the scores on a small uniform 2-d mesh - xx, yy = np.meshgrid(np.linspace(-5, 5, num=26), - np.linspace(-5, 5, num=26)) + xx, yy = np.meshgrid(np.linspace(-5, 5, num=26), np.linspace(-5, 5, num=26)) X_test = np.c_[xx.ravel(), yy.ravel()] # Fit the model for at least 10% support vectors - clf = svm.SVDD(nu=0.1, kernel="poly", gamma='scale', degree=2, coef0=1.0) + clf = svm.SVDD(nu=0.1, kernel="poly", gamma="scale", degree=2, coef0=1.0) clf.fit(X_train) # Check score_samples() implementation - assert_array_almost_equal(clf.score_samples(X_test), - clf.decision_function(X_test) + clf.offset_) + assert_array_almost_equal( + clf.score_samples(X_test), clf.decision_function(X_test) + clf.offset_ + ) # Test the gamma="scale": use .var() for scaling (c.f. issue #12741) gamma = 1.0 / (X.shape[1] * X_train.var()) @@ -454,10 +454,12 @@ def test_svdd_score_samples(): assert_almost_equal(clf._gamma, gamma) # Compute the kernel matrices - k_zx = polynomial_kernel(X_train[clf.support_], X_test, - gamma=gamma, degree=clf.degree, coef0=clf.coef0) - k_xx = polynomial_kernel(X_test, gamma=gamma, - degree=clf.degree, coef0=clf.coef0).diagonal() + k_zx = polynomial_kernel( + X_train[clf.support_], X_test, gamma=gamma, degree=clf.degree, coef0=clf.coef0 + ) + k_xx = polynomial_kernel( + X_test, gamma=gamma, degree=clf.degree, coef0=clf.coef0 + ).diagonal() # Compute the sample scores = decision scores without `-\rho` scores_ = np.dot(clf.dual_coef_, k_zx - k_xx[np.newaxis] / 2).ravel() @@ -497,8 +499,7 @@ def test_oneclass_and_svdd(): assert_array_almost_equal(svdd.intercept_, svdd_intercept, decimal=3) # Evaluate the decision function on a uniformly spaced 2-d mesh - xx, yy = np.meshgrid(np.linspace(-5, 5, num=101), - np.linspace(-5, 5, num=101)) + xx, yy = np.meshgrid(np.linspace(-5, 5, num=101), np.linspace(-5, 5, num=101)) mesh = np.c_[xx.ravel(), yy.ravel()] svdd_df = svdd.decision_function(mesh) @@ -1114,7 +1115,7 @@ def test_immutable_coef_property(): svm.SVR(kernel="linear").fit(iris.data, iris.target), svm.NuSVR(kernel="linear").fit(iris.data, iris.target), svm.OneClassSVM(kernel="linear").fit(iris.data), - svm.SVDD(kernel='linear').fit(iris.data), + svm.SVDD(kernel="linear").fit(iris.data), ] for clf in svms: with pytest.raises(AttributeError): From 742954a50185fd8102fc6c2591c7c219b709003b Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Wed, 10 Nov 2021 16:48:21 +0300 Subject: [PATCH 36/41] update version in svdd docs to 1.1, relocate from 1.0 to 1.1 in whats_new add backticks (#20914), deprecate **params in fit (#20843), add feature_names_in_ (#20787) uncompromisingly reformat plot_oneclass_vs_svdd with black --- doc/whats_new/v1.0.rst | 7 --- doc/whats_new/v1.1.rst | 4 ++ examples/svm/plot_oneclass_vs_svdd.py | 80 ++++++++++++++++----------- sklearn/svm/_classes.py | 15 +++-- 4 files changed, 64 insertions(+), 42 deletions(-) diff --git a/doc/whats_new/v1.0.rst b/doc/whats_new/v1.0.rst index b47ebc9123c23..6ece2f16b6e93 100644 --- a/doc/whats_new/v1.0.rst +++ b/doc/whats_new/v1.0.rst @@ -1213,13 +1213,6 @@ Changelog now deprecated. Use `scipy.sparse.csgraph.shortest_path` instead. :pr:`20531` by `Tom Dupre la Tour`_. -:mod:`sklearn.svm` -.................. 
- -- |Feature| Added the :class:`svm.SVDD` class for novelty detection based - on soft minimal volume hypersphere around the sample data. :pr:`7910` - by :user:`Ivan Nazarov `. - Code and Documentation Contributors ----------------------------------- diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index 952d2867360a3..d4f18f0c06c26 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -1234,6 +1234,10 @@ Changelog parameters in `fit` instead of `__init__`. :pr:`21436` by :user:`Haidar Almubarak `. +- |Feature| Added the :class:`svm.SVDD` class for novelty detection based + on soft minimal volume hypersphere around the sample data. :pr:`7910` + by :user:`Ivan Nazarov `. + :mod:`sklearn.tree` ................... diff --git a/examples/svm/plot_oneclass_vs_svdd.py b/examples/svm/plot_oneclass_vs_svdd.py index 7353a1d09aae3..6c57b018b27eb 100644 --- a/examples/svm/plot_oneclass_vs_svdd.py +++ b/examples/svm/plot_oneclass_vs_svdd.py @@ -41,22 +41,26 @@ X_outliers = random_state.uniform(low=-4, high=4, size=(20, 2)) # Define the models -nu = .1 -kernels = [("RBF", dict(kernel="rbf", gamma=0.1)), - ("Poly", dict(kernel="poly", degree=2, coef0=1.0)), - ] +nu = 0.1 +kernels = [ + ("RBF", dict(kernel="rbf", gamma=0.1)), + ("Poly", dict(kernel="poly", degree=2, coef0=1.0)), +] for kernel_name, kernel in kernels: # Use low tolerance to ensure better precision of the SVM # optimization procedure. - classifiers = [("OCSVM", svm.OneClassSVM(nu=nu, tol=1e-8, **kernel)), - ("SVDD", svm.SVDD(nu=nu, tol=1e-8, **kernel)), - ] + classifiers = [ + ("OCSVM", svm.OneClassSVM(nu=nu, tol=1e-8, **kernel)), + ("SVDD", svm.SVDD(nu=nu, tol=1e-8, **kernel)), + ] fig = plt.figure(figsize=(12, 5)) - fig.suptitle("One-Class SVM versus SVDD " - "(error train, error novel regular, error novel abnormal)") + fig.suptitle( + "One-Class SVM versus SVDD " + "(error train, error novel regular, error novel abnormal)" + ) for i, (model_name, clf) in enumerate(classifiers): clf.fit(X_train) @@ -74,32 +78,46 @@ Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) - ax.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), - cmap=plt.cm.PuBu, zorder=-99) - ax.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred', - zorder=-98) - a = ax.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred', - zorder=-97) + ax.contourf( + xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu, zorder=-99 + ) + ax.contourf(xx, yy, Z, levels=[0, Z.max()], colors="palevioletred", zorder=-98) + a = ax.contour( + xx, yy, Z, levels=[0], linewidths=2, colors="darkred", zorder=-97 + ) s = 40 - b1 = ax.scatter(X_train[:, 0], X_train[:, 1], s=s, - c='white', edgecolors='k') - b2 = ax.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=s) - c = ax.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=s) - ax.axis('tight') + b1 = ax.scatter(X_train[:, 0], X_train[:, 1], s=s, c="white", edgecolors="k") + b2 = ax.scatter(X_test[:, 0], X_test[:, 1], c="blueviolet", s=s) + c = ax.scatter(X_outliers[:, 0], X_outliers[:, 1], c="gold", s=s) + ax.axis("tight") ax.set_xlim((-6, 6)) ax.set_ylim((-6, 6)) - ax.set_title("%s %s (%d/%d, %d/%d, %d/%d)" - % (model_name, kernel_name, - n_error_train, len(X_train), - n_error_test, len(X_test), - n_error_outliers, len(X_outliers))) - - ax.legend([a.collections[0], b1, b2, c], - ["learned frontier", "training observations", - "new regular observations", "new abnormal observations"], - loc="lower right", - 
prop=matplotlib.font_manager.FontProperties(size=10)) + ax.set_title( + "%s %s (%d/%d, %d/%d, %d/%d)" + % ( + model_name, + kernel_name, + n_error_train, + len(X_train), + n_error_test, + len(X_test), + n_error_outliers, + len(X_outliers), + ) + ) + + ax.legend( + [a.collections[0], b1, b2, c], + [ + "learned frontier", + "training observations", + "new regular observations", + "new abnormal observations", + ], + loc="lower right", + prop=matplotlib.font_manager.FontProperties(size=10), + ) plt.show() diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 4747854001999..3143c5aa76e79 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1837,7 +1837,7 @@ class SVDD(OutlierMixin, BaseLibSVM): Read more in the :ref:`User Guide `. - ..versionadded: 1.0 + ..versionadded: 1.1 Parameters ---------- @@ -1912,7 +1912,9 @@ class SVDD(OutlierMixin, BaseLibSVM): n_features_in_ : int Number of features seen during :term:`fit`. - .. versionadded:: 0.24 + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. n_support_ : ndarray of shape (n_classes,), dtype=int32 Number of support vectors for each class. @@ -2001,8 +2003,8 @@ def fit(self, X, y=None, sample_weight=None, **params): Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) - Set of samples, where n_samples is the number of samples and - n_features is the number of features. + Set of samples, where `n_samples` is the number of samples and + `n_features` is the number of features. y : Ignored Not used, present for API consistency by convention. @@ -2014,6 +2016,11 @@ def fit(self, X, y=None, sample_weight=None, **params): **params : dict Additional fit parameters. + .. deprecated:: 1.0 + The `fit` method will not longer accept extra keyword + parameters in 1.2. These keyword parameters were + already discarded. + Returns ------- self : object From 9c95eeab8ca1d12b6fc6a1c13ccad461fbc54f9e Mon Sep 17 00:00:00 2001 From: ivannz Date: Sun, 15 May 2022 11:07:47 +0300 Subject: [PATCH 37/41] move feature announcement from 1.1 to 1.2 --- doc/whats_new/v1.1.rst | 4 ---- doc/whats_new/v1.2.rst | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/whats_new/v1.1.rst b/doc/whats_new/v1.1.rst index d4f18f0c06c26..952d2867360a3 100644 --- a/doc/whats_new/v1.1.rst +++ b/doc/whats_new/v1.1.rst @@ -1234,10 +1234,6 @@ Changelog parameters in `fit` instead of `__init__`. :pr:`21436` by :user:`Haidar Almubarak `. -- |Feature| Added the :class:`svm.SVDD` class for novelty detection based - on soft minimal volume hypersphere around the sample data. :pr:`7910` - by :user:`Ivan Nazarov `. - :mod:`sklearn.tree` ................... diff --git a/doc/whats_new/v1.2.rst b/doc/whats_new/v1.2.rst index d1ab9c8ed1b36..a2bd9f73422f2 100644 --- a/doc/whats_new/v1.2.rst +++ b/doc/whats_new/v1.2.rst @@ -327,6 +327,10 @@ Changelog :class:`svm.NuSVR`, :class:`svm.SVR`, :class:`svm.OneClassSVM`. :pr:`22898` by :user:`Meekail Zain `. +- |Feature| Added the :class:`svm.SVDD` class for novelty detection based + on soft minimal volume hypersphere around the sample data. :pr:`7910` + by :user:`Ivan Nazarov `. + :mod:`sklearn.tree` ................... 
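
While the announcement keeps moving between releases, the meaning of ``nu`` documented in the class is unchanged: an upper bound on the fraction of training errors and a lower bound on the fraction of support vectors. A rough empirical check of that property, assuming this branch's `SVDD`; the random data and the tight ``tol`` are arbitrary choices, and the bounds hold only approximately at finite tolerance::

    import numpy as np
    from sklearn.svm import SVDD  # this branch only

    rng = np.random.RandomState(0)
    X = rng.randn(500, 2)

    for nu in (0.05, 0.2, 0.5):
        clf = SVDD(kernel="rbf", gamma=0.5, nu=nu, tol=1e-8).fit(X)
        frac_sv = clf.support_.size / len(X)      # roughly >= nu
        frac_err = (clf.predict(X) == -1).mean()  # roughly <= nu
        print(f"nu={nu:.2f}  support fraction={frac_sv:.2f}  "
              f"train error={frac_err:.2f}")
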
From 36778b4e9042ae6c7bdd4d5e66d2de39fc851d7d Mon Sep 17 00:00:00 2001
From: ivannz
Date: Sun, 15 May 2022 11:25:17 +0300
Subject: [PATCH 38/41] fixed user guide ref in SVDD docstring, copied kernel
 parameter docs from ocSVM, and bumped versionadded; added SVDD to tests which
 involved ocSVM

---
 sklearn/svm/_classes.py       | 47 ++++++++++++++++++-----------------
 sklearn/svm/tests/test_svm.py | 11 ++++----
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py
index 3143c5aa76e79..3a24d17cd63a4 100644
--- a/sklearn/svm/_classes.py
+++ b/sklearn/svm/_classes.py
@@ -1653,7 +1653,6 @@ class OneClassSVM(OutlierMixin, BaseLibSVM):
     sklearn.neighbors.LocalOutlierFactor : Unsupervised Outlier Detection
         using Local Outlier Factor (LOF).
     sklearn.ensemble.IsolationForest : Isolation Forest Algorithm.
-
     sklearn.svm.SVDD : Support vector method for outlier detection via
         a separating soft-margin hypersphere implemented with libsvm with
         a parameter to control the number of support vectors.
@@ -1830,21 +1829,20 @@ class SVDD(OutlierMixin, BaseLibSVM):
     """Support Vector Data Description for Unsupervised Outlier Detection.
 
     Estimate the support of a high-dimensional distribution by finding the
-    tightest soft hypersphere around a data set, which permits at most a
-    fraction ``nu`` (``0 < nu <= 1``) of the data as outliers.
+    tightest soft boundary hypersphere around a data set, which permits at
+    most a fraction ``nu`` (``0 < nu <= 1``) of the data as outliers.
 
     The implementation is based on libsvm.
 
-    Read more in the :ref:`User Guide `.
+    Read more in the :ref:`User Guide `.
 
-    ..versionadded: 1.1
+    .. versionadded:: 1.2
 
     Parameters
     ----------
-    kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'
+    kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, \
+        default='rbf'
         Specifies the kernel type to be used in the algorithm.
-        It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
-        a callable.
         If none is given, 'rbf' will be used. If a callable is given it is
         used to precompute the kernel matrix.
 
@@ -1916,6 +1914,9 @@ class SVDD(OutlierMixin, BaseLibSVM):
         Names of features seen during :term:`fit`. Defined only when `X`
         has feature names that are all strings.
 
+    n_iter_ : int
+        Number of iterations run by the optimization routine to fit the model.
+
     n_support_ : ndarray of shape (n_classes,), dtype=int32
         Number of support vectors for each class.
 
@@ -1980,20 +1981,20 @@ def __init__(
     ):
 
         super().__init__(
-            kernel=kernel,
-            degree=degree,
...
+            kernel,
+            degree,
+            gamma,
+            coef0,
+            tol,
+            0.0,
+            nu,
+            0.0,
+            shrinking,
+            False,
+            cache_size,
+            None,
+            verbose,
+            max_iter,
             random_state=None,
         )
 
@@ -2030,7 +2031,7 @@ def fit(self, X, y=None, sample_weight=None, **params):
         -----
         If X is not a C-ordered contiguous array it is copied.
""" - super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight, **params) + super().fit(X, np.ones(_num_samples(X)), sample_weight=sample_weight) self.offset_ = -self._intercept_ return self diff --git a/sklearn/svm/tests/test_svm.py b/sklearn/svm/tests/test_svm.py index 83fc4bf379e01..d8a760bedc3ed 100644 --- a/sklearn/svm/tests/test_svm.py +++ b/sklearn/svm/tests/test_svm.py @@ -14,7 +14,7 @@ from numpy.testing import assert_allclose from scipy import sparse from sklearn import svm, linear_model, datasets, metrics, base -from sklearn.svm import LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR +from sklearn.svm import LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR, SVDD from sklearn.model_selection import train_test_split from sklearn.datasets import make_classification, make_blobs from sklearn.metrics import f1_score @@ -1441,9 +1441,9 @@ def test_linearsvm_liblinear_sample_weight(SVM, params): assert_allclose(X_est_no_weight, X_est_with_weight) -@pytest.mark.parametrize("Klass", (OneClassSVM, SVR, NuSVR)) +@pytest.mark.parametrize("Klass", (OneClassSVM, SVR, NuSVR, SVDD)) def test_n_support(Klass): - # Make n_support is correct for oneclass and SVR (used to be + # Make sure n_support is correct for oneclass, SVDD and SVR (used to be # non-initialized) # this is a non regression test for issue #14774 X = np.array([[0], [0.44], [0.45], [0.46], [1]]) @@ -1514,6 +1514,7 @@ def test_svc_raises_error_internal_representation(): (svm.SVR, int), (svm.NuSVR, int), (svm.OneClassSVM, int), + (svm.SVDD, int), ], ) @pytest.mark.parametrize( @@ -1527,8 +1528,8 @@ def test_svc_raises_error_internal_representation(): def test_n_iter_libsvm(estimator, expected_n_iter_type, dataset): # Check that the type of n_iter_ is correct for the classes that inherit # from BaseSVC. - # Note that for SVC, and NuSVC this is an ndarray; while for SVR, NuSVR, and - # OneClassSVM, it is an int. + # Note that for SVC, and NuSVC this is an ndarray; while for SVR, NuSVR, + # SVDD and OneClassSVM, it is an int. # For SVC and NuSVC also check the shape of n_iter_. X, y = dataset n_iter = estimator(kernel="linear").fit(X, y).n_iter_ From 4e5ca417b0b2b58f478e989a5392c10f28a24426 Mon Sep 17 00:00:00 2001 From: ivannz Date: Tue, 14 Jun 2022 20:13:52 +0300 Subject: [PATCH 39/41] Removed deprecated `class_weight_` from the docs of SVDD (related to #22898) --- sklearn/svm/_classes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 3a24d17cd63a4..6ed99418b9d46 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1887,10 +1887,6 @@ class SVDD(OutlierMixin, BaseLibSVM): Attributes ---------- - class_weight_ : ndarray of shape (n_classes,) - Multipliers of parameter C for each class. - Computed based on the ``class_weight`` parameter. - coef_ : ndarray of shape (1, n_features) Weights assigned to the features (coefficients in the primal problem). This is only available in the case of a linear kernel. 
From 3cc3610d2d1600ee44ba61feafaebece5c22ebc4 Mon Sep 17 00:00:00 2001 From: ivannz Date: Tue, 30 Aug 2022 00:16:55 +0300 Subject: [PATCH 40/41] add parameter validation to SVDD and update dunder-docs (similar to ocSVM #24001) finish v1.2 deprecation of params kwargs in `.fit` of SVDD (similar to ocSVM #20843) TST ensure SVDD passes param-validation test_common.py due to #23462 (#22722) --- sklearn/svm/_classes.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/sklearn/svm/_classes.py b/sklearn/svm/_classes.py index 6ed99418b9d46..420c932edd419 100644 --- a/sklearn/svm/_classes.py +++ b/sklearn/svm/_classes.py @@ -1848,7 +1848,7 @@ class SVDD(OutlierMixin, BaseLibSVM): degree : int, default=3 Degree of the polynomial kernel function ('poly'). - Ignored by all other kernels. + Must be non-negative. Ignored by all other kernels. gamma : {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. @@ -1856,6 +1856,7 @@ class SVDD(OutlierMixin, BaseLibSVM): - if ``gamma='scale'`` (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features. + - if float, must be non-negative. coef0 : float, default=0.0 Independent term in kernel function. @@ -1933,9 +1934,9 @@ class SVDD(OutlierMixin, BaseLibSVM): See Also -------- - OneClassSVM : Support vector method for outlier detection via a separating - soft-margin hyperplane implemented with libsvm with a parameter to - control the number of support vectors. + sklearn.svm.OneClassSVM : Support vector method for outlier detection via + a separating soft-margin hyperplane implemented with libsvm with + a parameter to control the number of support vectors. References ---------- @@ -1961,6 +1962,10 @@ class SVDD(OutlierMixin, BaseLibSVM): _impl = "svdd_l1" + _parameter_constraints = {**BaseLibSVM._parameter_constraints} # type: ignore + for unused_param in ["C", "class_weight", "epsilon", "probability", "random_state"]: + _parameter_constraints.pop(unused_param) + def __init__( self, *, @@ -1994,7 +1999,7 @@ def __init__( random_state=None, ) - def fit(self, X, y=None, sample_weight=None, **params): + def fit(self, X, y=None, sample_weight=None): """Learn a soft minimum-volume hypersphere around the sample X. Parameters @@ -2010,14 +2015,6 @@ def fit(self, X, y=None, sample_weight=None, **params): Per-sample weights. Rescale C per sample. Higher weights force the classifier to put more emphasis on these points. - **params : dict - Additional fit parameters. - - .. deprecated:: 1.0 - The `fit` method will not longer accept extra keyword - parameters in 1.2. These keyword parameters were - already discarded. - Returns ------- self : object From 80a172529f7f7a1596a18c8cee1ceab741e0f8d7 Mon Sep 17 00:00:00 2001 From: ivannz Date: Sun, 4 Sep 2022 13:40:02 +0300 Subject: [PATCH 41/41] clarify the parent space of the SVDD hypersphere (#r374672496) add SVDD announcement to svm.cpp, fix stray trailing spaces (#r374671161) --- doc/modules/svm.rst | 7 +- sklearn/svm/src/libsvm/svm.cpp | 199 +++++++++++++++++---------------- 2 files changed, 108 insertions(+), 98 deletions(-) diff --git a/doc/modules/svm.rst b/doc/modules/svm.rst index 2f2ef8474a776..9203f44abfc10 100644 --- a/doc/modules/svm.rst +++ b/doc/modules/svm.rst @@ -851,10 +851,11 @@ SVDD Support Vector Data Description (SVDD), proposed by Tax and Duin (2004), aims at finding a spherically shaped boundary around a data set. 
Specifically, -it computes a minimum volume hypersphere containing the most of the data with -the number of outliers controlled by the parameter of the model. +it computes a minimum volume hypersphere (in the feature space induced by the +kernel) containing the most of the data with the number of outliers controlled +by the parameter of the model. -The original formulation suffered from non-convexity issues related to optimality of +The original formulation suffered from non-convexity issues related to optimality of the attained solution for certain values of the regularization parameter :math:`C`. Chang, Lee, and Lin (2013) suggested a reformulation of the SVDD model which had a well-defined and provably unique global solution for any :math:`C>0`. diff --git a/sklearn/svm/src/libsvm/svm.cpp b/sklearn/svm/src/libsvm/svm.cpp index 21bec8bd93d7d..5d04e735a002e 100644 --- a/sklearn/svm/src/libsvm/svm.cpp +++ b/sklearn/svm/src/libsvm/svm.cpp @@ -31,7 +31,7 @@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* +/* Modified 2010: - Support for dense data by Ming-Fang Weng @@ -59,6 +59,15 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Exposed number of iterations run in optimization, Juan Martín Loyola. See + + Modified 2022: + + - Implemented the Support Vector Data Description based on the works + by Tax and Duin (2004) and Chang, Lee, and Lin (2013). The model was + extended to support weighted observations and reparameterized to the + fraction of outliers (nu). + Nazarov Ivan + See */ #include @@ -129,7 +138,7 @@ static void info(const char *fmt,...) and dense versions of this library */ #ifdef _DENSE_REP #ifdef PREFIX - #undef PREFIX + #undef PREFIX #endif #ifdef NAMESPACE #undef NAMESPACE @@ -140,7 +149,7 @@ and dense versions of this library */ #else /* sparse representation */ #ifdef PREFIX - #undef PREFIX + #undef PREFIX #endif #ifdef NAMESPACE #undef NAMESPACE @@ -167,7 +176,7 @@ class Cache // return some position p where [p,len) need to be filled // (p >= len if nothing needs to be filled) int get_data(const int index, Qfloat **data, int len); - void swap_index(int i, int j); + void swap_index(int i, int j); private: int l; long int size; @@ -443,7 +452,7 @@ double Kernel::dot(const PREFIX(node) *px, const PREFIX(node) *py, BlasFunctions ++py; else ++px; - } + } } return sum; } @@ -487,7 +496,7 @@ double Kernel::k_function(const PREFIX(node) *x, const PREFIX(node) *y, else { if(x->index > y->index) - { + { sum += y->value * y->value; ++y; } @@ -524,7 +533,7 @@ double Kernel::k_function(const PREFIX(node) *x, const PREFIX(node) *y, #endif } default: - return 0; // Unreachable + return 0; // Unreachable } } // An SMO algorithm in Fan et al., JMLR 6(2005), p. 
1889--1918 @@ -602,7 +611,7 @@ class Solver { virtual double calculate_rho(); virtual void do_shrinking(); private: - bool be_shrunk(int i, double Gmax1, double Gmax2); + bool be_shrunk(int i, double Gmax1, double Gmax2); }; void Solver::swap_index(int i, int j) @@ -750,11 +759,11 @@ void Solver::Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, else counter = 1; // do shrinking next iteration } - + ++iter; // update alpha[i] and alpha[j], handle bounds carefully - + const Qfloat *Q_i = Q.get_Q(i,active_size); const Qfloat *Q_j = Q.get_Q(j,active_size); @@ -773,7 +782,7 @@ void Solver::Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, double diff = alpha[i] - alpha[j]; alpha[i] += delta; alpha[j] += delta; - + if(diff > 0) { if(alpha[j] < 0) @@ -855,7 +864,7 @@ void Solver::Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, double delta_alpha_i = alpha[i] - old_alpha_i; double delta_alpha_j = alpha[j] - old_alpha_j; - + for(int k=0;k= Gmax) @@ -990,7 +999,7 @@ int Solver::select_working_set(int &out_i, int &out_j) Gmax2 = G[j]; if (grad_diff > 0) { - double obj_diff; + double obj_diff; double quad_coef = QD[i]+QD[j]-2.0*y[i]*Q_i[j]; if (quad_coef > 0) obj_diff = -(grad_diff*grad_diff)/quad_coef; @@ -1014,7 +1023,7 @@ int Solver::select_working_set(int &out_i, int &out_j) Gmax2 = -G[j]; if (grad_diff > 0) { - double obj_diff; + double obj_diff; double quad_coef = QD[i]+QD[j]+2.0*y[i]*Q_i[j]; if (quad_coef > 0) obj_diff = -(grad_diff*grad_diff)/quad_coef; @@ -1052,7 +1061,7 @@ bool Solver::be_shrunk(int i, double Gmax1, double Gmax2) { if(y[i]==+1) return(G[i] > Gmax2); - else + else return(G[i] > Gmax1); } else @@ -1068,27 +1077,27 @@ void Solver::do_shrinking() // find maximal violating pair first for(i=0;i= Gmax1) Gmax1 = -G[i]; } - if(!is_lower_bound(i)) + if(!is_lower_bound(i)) { if(G[i] >= Gmax2) Gmax2 = G[i]; } } - else + else { - if(!is_upper_bound(i)) + if(!is_upper_bound(i)) { if(-G[i] >= Gmax2) Gmax2 = -G[i]; } - if(!is_lower_bound(i)) + if(!is_lower_bound(i)) { if(G[i] >= Gmax1) Gmax1 = G[i]; @@ -1096,7 +1105,7 @@ void Solver::do_shrinking() } } - if(unshrink == false && Gmax1 + Gmax2 <= eps*10) + if(unshrink == false && Gmax1 + Gmax2 <= eps*10) { unshrink = true; reconstruct_gradient(); @@ -1235,14 +1244,14 @@ int Solver_NU::select_working_set(int &out_i, int &out_j) { if(y[j]==+1) { - if (!is_lower_bound(j)) + if (!is_lower_bound(j)) { double grad_diff=Gmaxp+G[j]; if (G[j] >= Gmaxp2) Gmaxp2 = G[j]; if (grad_diff > 0) { - double obj_diff; + double obj_diff; double quad_coef = QD[ip]+QD[j]-2*Q_ip[j]; if (quad_coef > 0) obj_diff = -(grad_diff*grad_diff)/quad_coef; @@ -1266,7 +1275,7 @@ int Solver_NU::select_working_set(int &out_i, int &out_j) Gmaxn2 = -G[j]; if (grad_diff > 0) { - double obj_diff; + double obj_diff; double quad_coef = QD[in]+QD[j]-2*Q_in[j]; if (quad_coef > 0) obj_diff = -(grad_diff*grad_diff)/quad_coef; @@ -1301,14 +1310,14 @@ bool Solver_NU::be_shrunk(int i, double Gmax1, double Gmax2, double Gmax3, doubl { if(y[i]==+1) return(-G[i] > Gmax1); - else + else return(-G[i] > Gmax4); } else if(is_lower_bound(i)) { if(y[i]==+1) return(G[i] > Gmax2); - else + else return(G[i] > Gmax3); } else @@ -1337,14 +1346,14 @@ void Solver_NU::do_shrinking() if(!is_lower_bound(i)) { if(y[i]==+1) - { + { if(G[i] > Gmax2) Gmax2 = G[i]; } else if(G[i] > Gmax3) Gmax3 = G[i]; } } - if(unshrink == false && max(Gmax1+Gmax2,Gmax3+Gmax4) <= eps*10) + if(unshrink == false && max(Gmax1+Gmax2,Gmax3+Gmax4) <= eps*10) { unshrink = true; 
reconstruct_gradient(); @@ -1407,12 +1416,12 @@ double Solver_NU::calculate_rho() r1 = sum_free1/nr_free1; else r1 = (ub1+lb1)/2; - + if(nr_free2 > 0) r2 = sum_free2/nr_free2; else r2 = (ub2+lb2)/2; - + si->r = (r1+r2)/2; return (r1-r2)/2; } @@ -1421,7 +1430,7 @@ double Solver_NU::calculate_rho() // Q matrices for various formulations // class SVC_Q: public Kernel -{ +{ public: SVC_Q(const PREFIX(problem)& prob, const svm_parameter& param, const schar *y_, BlasFunctions *blas_functions) :Kernel(prob.l, prob.x, param, blas_functions) @@ -1432,7 +1441,7 @@ class SVC_Q: public Kernel for(int i=0;i*kernel_function)(i,i); } - + Qfloat *get_Q(int i, int len) const { Qfloat *data; @@ -1481,7 +1490,7 @@ class ONE_CLASS_Q: public Kernel for(int i=0;i*kernel_function)(i,i); } - + Qfloat *get_Q(int i, int len) const { Qfloat *data; @@ -1517,7 +1526,7 @@ class ONE_CLASS_Q: public Kernel }; class SVR_Q: public Kernel -{ +{ public: SVR_Q(const PREFIX(problem)& prob, const svm_parameter& param, BlasFunctions *blas_functions) :Kernel(prob.l, prob.x, param, blas_functions) @@ -1547,7 +1556,7 @@ class SVR_Q: public Kernel swap(index[i],index[j]); swap(QD[i],QD[j]); } - + Qfloat *get_Q(int i, int len) const { Qfloat *data; @@ -1663,7 +1672,7 @@ static void solve_nu_svc( C[i] = prob->W[i]; } - + double nu_l = 0; for(i=0;iupper_bound[i] /= r; + si->upper_bound[i] /= r; } si->rho /= r; @@ -1956,7 +1965,7 @@ static void solve_svdd_l1( struct decision_function { double *alpha; - double rho; + double rho; int n_iter; }; @@ -1969,23 +1978,23 @@ static decision_function svm_train_one( switch(param->svm_type) { case C_SVC: - si.upper_bound = Malloc(double,prob->l); + si.upper_bound = Malloc(double,prob->l); solve_c_svc(prob,param,alpha,&si,Cp,Cn,blas_functions); break; case NU_SVC: - si.upper_bound = Malloc(double,prob->l); + si.upper_bound = Malloc(double,prob->l); solve_nu_svc(prob,param,alpha,&si,blas_functions); break; case ONE_CLASS: - si.upper_bound = Malloc(double,prob->l); + si.upper_bound = Malloc(double,prob->l); solve_one_class(prob,param,alpha,&si,blas_functions); break; case EPSILON_SVR: - si.upper_bound = Malloc(double,2*prob->l); + si.upper_bound = Malloc(double,2*prob->l); solve_epsilon_svr(prob,param,alpha,&si,blas_functions); break; case NU_SVR: - si.upper_bound = Malloc(double,2*prob->l); + si.upper_bound = Malloc(double,2*prob->l); solve_nu_svr(prob,param,alpha,&si,blas_functions); break; case SVDD_L1: @@ -2033,7 +2042,7 @@ static decision_function svm_train_one( // Platt's binary SVM Probabilistic Output: an improvement from Lin et al. 
 static void sigmoid_train(
-	int l, const double *dec_values, const double *labels, 
+	int l, const double *dec_values, const double *labels,
 	double& A, double& B)
 {
 	double prior1=0, prior0 = 0;
@@ -2042,7 +2051,7 @@ static void sigmoid_train(
 	for (i=0;i<l;i++)
 		if (labels[i] > 0) prior1+=1;
 		else prior0+=1;
-	
+
 	int max_iter=100;	// Maximal number of iterations
 	double min_step=1e-10;	// Minimal step taken in line search
 	double sigma=1e-12;	// For numerically strict PD of Hessian
@@ -2052,8 +2061,8 @@ static void sigmoid_train(
 	double *t=Malloc(double,l);
 	double fApB,p,q,h11,h22,h21,g1,g2,det,dA,dB,gd,stepsize;
 	double newA,newB,newf,d1,d2;
-	int iter; 
-	
+	int iter;
+
 	// Initial Point and Initial Fun Value
 	A=0.0; B=log((prior0+1.0)/(prior1+1.0));
 	double fval = 0.0;
@@ -2163,7 +2172,7 @@ static void multiclass_probability(int k, double **r, double *p)
 	double **Q=Malloc(double *,k);
 	double *Qp=Malloc(double,k);
 	double pQp, eps=0.005/k;
-	for (t=0;t<k;t++) 
+	for (t=0;t<k;t++)
 	{
 		p[t]=1.0/k;  // Valid if k = 1
 		Q[t]=Malloc(double,k);
 #ifdef _DENSE_REP
-				PREFIX(predict_values)(submodel,(prob->x+perm[j]),&(dec_values[perm[j]]), blas_functions); 
+				PREFIX(predict_values)(submodel,(prob->x+perm[j]),&(dec_values[perm[j]]), blas_functions);
 #else
-				PREFIX(predict_values)(submodel,prob->x[perm[j]],&(dec_values[perm[j]]), blas_functions); 
+				PREFIX(predict_values)(submodel,prob->x[perm[j]],&(dec_values[perm[j]]), blas_functions);
 #endif
 				// ensure +1 -1 order; reason not using CV subroutine
 				dec_values[perm[j]] *= submodel->label[0];
-			} 
+			}
 			PREFIX(free_and_destroy_model)(&submodel);
 			PREFIX(destroy_param)(&subparam);
 		}
 		free(subprob.x);
 		free(subprob.y);
 		free(subprob.W);
-	} 
+	}
 	sigmoid_train(prob->l,dec_values,prob->y,probA,probB);
 	free(dec_values);
 	free(perm);
 }
 
-// Return parameter of a Laplace distribution 
+// Return parameter of a Laplace distribution
 static double svm_svr_probability(
 	const PREFIX(problem) *prob, const svm_parameter *param, BlasFunctions *blas_functions)
 {
@@ -2336,15 +2345,15 @@ static double svm_svr_probability(
 	{
 		ymv[i]=prob->y[i]-ymv[i];
 		mae += fabs(ymv[i]);
-	} 
+	}
 	mae /= prob->l;
 	double std=sqrt(2*mae*mae);
 	int count=0;
 	mae=0;
 	for(i=0;i<prob->l;i++)
-		if (fabs(ymv[i]) > 5*std) 
+		if (fabs(ymv[i]) > 5*std)
 			count=count+1;
-		else 
+		else
 			mae+=fabs(ymv[i]);
 	mae /= (prob->l-count);
 	info("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma= %g\n",mae);
@@ -2363,7 +2372,7 @@ static void svm_group_classes(const PREFIX(problem) *prob, int *nr_class_ret, in
 	int nr_class = 0;
 	int *label = Malloc(int,max_nr_class);
 	int *count = Malloc(int,max_nr_class);
-	int *data_label = Malloc(int,l);	
+	int *data_label = Malloc(int,l);
 	int i, j, this_label, this_count;
 
 	for(i=0;i<l;i++)
 
 //
 // Remove zero weighed data as libsvm and some liblinear solvers require C > 0.
 //
-static void remove_zero_weight(PREFIX(problem) *newprob, const PREFIX(problem) *prob) 
+static void remove_zero_weight(PREFIX(problem) *newprob, const PREFIX(problem) *prob)
 {
 	int i;
 	int l = 0;
@@ -2503,7 +2512,7 @@ PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *p
 		model->probA = NULL;
 		model->probB = NULL;
 		model->sv_coef = Malloc(double *,1);
-		if(param->probability && 
+		if(param->probability &&
 		   (param->svm_type == EPSILON_SVR ||
 		    param->svm_type == NU_SVR))
 		{
@@ -2537,7 +2546,7 @@ PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *p
 				model->sv_ind[j] = i;
 				model->sv_coef[0][j] = f.alpha[i];
 				++j;
-			} 
+			}
 
 		free(f.alpha);
 	}
@@ -2552,7 +2561,7 @@ PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *p
 		int *perm = Malloc(int,l);
 
 		// group training data of the same class
-		NAMESPACE::svm_group_classes(prob,&nr_class,&label,&start,&count,perm);	
+		NAMESPACE::svm_group_classes(prob,&nr_class,&label,&start,&count,perm);
 #ifdef _DENSE_REP
 		PREFIX(node) *x = Malloc(PREFIX(node),l);
 #else
@@ -2573,7 +2582,7 @@ PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *p
 		for(i=0;i<nr_class;i++)
 			weighted_C[i] = param->C;
 		for(i=0;i<param->nr_weight;i++)
-		{	
+		{
 			int j;
 			for(j=0;j<nr_class;j++)
 				if(param->weight_label[i] == label[j])
@@ -2585,7 +2594,7 @@ PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *p
 		}
 
 		// train k*(k-1)/2 models
-		
+
 		bool *nonzero = Malloc(bool,l);
 		for(i=0;i<l;i++)
 		model->nr_class = nr_class;
-		
+
 		model->label = Malloc(int,nr_class);
 		for(i=0;i<nr_class;i++)
 			model->label[i] = label[i];
-		
+
 		model->rho = Malloc(double,nr_class*(nr_class-1)/2);
 		model->n_iter = Malloc(int,nr_class*(nr_class-1)/2);
 		for(i=0;i<nr_class*(nr_class-1)/2;i++)
 				model->SV[p] = x[i];
 				model->sv_ind[p] = perm[i];
 				++p;
@@ -2730,7 +2739,7 @@ PREFIX(model) *PREFIX(train)(const PREFIX(problem) *prob, const svm_parameter *p
 			int sj = start[j];
 			int ci = count[i];
 			int cj = count[j];
-			
+
 			int q = nz_start[i];
 			int k;
 			for(k=0;k<ci;k++)
 					model->sv_coef[i][q++] = f[p].alpha[ci+k];
 				++p;
 			}
-		
+
 		free(label);
 		free(probA);
 		free(probB);
@@ -2794,7 +2803,7 @@ void PREFIX(cross_validation)(const PREFIX(problem) *prob, const svm_parameter *
 	int *index = Malloc(int,l);
 	for(i=0;i<l;i++)
-			if(param->probability && 
+			if(param->probability &&
 			   (param->svm_type == C_SVC || param->svm_type == NU_SVC))
 			{
 				double *prob_estimates=Malloc(double, PREFIX(get_nr_class)(submodel));
@@ -2884,7 +2893,7 @@ void PREFIX(cross_validation)(const PREFIX(problem) *prob, const svm_parameter *
 #else
 				target[perm[j]] = PREFIX(predict_probability)(submodel,prob->x[perm[j]],prob_estimates, blas_functions);
 #endif
-				free(prob_estimates); 
+				free(prob_estimates);
 			}
 			else
 				for(j=begin;j<end;j++)
 		int nr_class = model->nr_class;
 		int l = model->l;
-		
+
 		double *kvalue = Malloc(double,l);
 		for(i=0;i<l;i++)
 				int ci = model->nSV[i];
 				int cj = model->nSV[j];
-				
+
 				int k;
 				double *coef1 = model->sv_coef[j-1];
 				double *coef2 = model->sv_coef[i];
@@ -3035,7 +3044,7 @@ double PREFIX(predict)(const PREFIX(model) *model, const PREFIX(node) *x, BlasFu
 	   model->param.svm_type == NU_SVR ||
 	   model->param.svm_type == SVDD_L1)
 		dec_values = Malloc(double, 1);
-	else 
+	else
 		dec_values = Malloc(double, nr_class*(nr_class-1)/2);
 	double pred_result = PREFIX(predict_values)(model, x, dec_values, blas_functions);
 	free(dec_values);
@@ -3074,10 +3083,10 @@ double PREFIX(predict_probability)(
 		for(i=0;i<nr_class;i++)
 			free(pairwise_prob[i]);
 		free(pairwise_prob);
 		return model->label[prob_max_idx];
 	}
-	else 
+	else
 		return PREFIX(predict)(model, x, blas_functions);
 }
@@ -3154,9 +3163,9 @@ const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_param
 	   svm_type != NU_SVR &&
 	   svm_type != SVDD_L1)
 		return "unknown svm type";
-	
+
 	// kernel_type, degree
-	
+
 	int kernel_type = param->kernel_type;
 	if(kernel_type != LINEAR &&
 	   kernel_type != POLY &&
@@ -3210,7 +3219,7 @@ const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_param
 
 	// check whether nu-svc is feasible
-	
+
 	if(svm_type == NU_SVC)
 	{
 		int l = prob->l;
@@ -3244,7 +3253,7 @@ const char *PREFIX(check_parameter)(const PREFIX(problem) *prob, const svm_param
 				++nr_class;
 			}
 		}
-	
+
 		for(i=0;i<nr_class;i++)
-	else if(prob->l != newprob.l && 
+	else if(prob->l != newprob.l &&
 	        svm_type == C_SVC)
 	{
 		bool only_one_label = true;
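For reference, a minimal usage sketch of the estimator that the `SVDD_L1` solver above is wired to (not part of the patch itself). It assumes a build of scikit-learn with this patch applied, where ``sklearn.svm.SVDD`` mirrors the ``OneClassSVM`` constructor (``kernel``, ``gamma``, ``nu``) and accepts ``sample_weight`` in ``fit``; stock scikit-learn has no ``SVDD`` class::

    import numpy as np
    from sklearn.svm import OneClassSVM, SVDD  # SVDD exists only with this patch

    rng = np.random.RandomState(0)
    X = 0.3 * rng.randn(100, 2)                         # dense cluster near the origin
    X_test = np.r_[X + 2, rng.uniform(-4, 4, (20, 2))]  # shifted points plus noise

    # With a stationary (RBF) kernel the One-Class SVM and the reparametrized
    # SVDD-L1 model are expected to produce matching decision rules.
    ocsvm = OneClassSVM(kernel="rbf", gamma=0.1, nu=0.1).fit(X)
    svdd = SVDD(kernel="rbf", gamma=0.1, nu=0.1).fit(X)
    print("label agreement:", np.mean(ocsvm.predict(X_test) == svdd.predict(X_test)))

    # sample_weight rescales the per-sample box constraint in the SVDD_L1 solver
    # (C * sample_weight[i]), so down-weighted points are cheaper to leave outside.
    w = rng.uniform(0.5, 2.0, size=X.shape[0])
    svdd_weighted = SVDD(kernel="rbf", gamma=0.1, nu=0.1).fit(X, sample_weight=w)

With a non-stationary kernel (e.g. ``kernel="poly"``) the two decision functions are not expected to agree; the ``plot_oneclass_vs_svdd.py`` example added by this patch illustrates both cases.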