From 02fecc7a54521a4e493d0f948152c8ecd41b6e00 Mon Sep 17 00:00:00 2001
From: Ali TBER
Date: Mon, 28 Sep 2020 16:16:43 +0200
Subject: [PATCH 1/6] FEA cumulative_gain_curve

---
 sklearn/metrics/_ranking.py           | 69 +++++++++++++++++++++++
 sklearn/metrics/tests/test_ranking.py |  1 +
 2 files changed, 70 insertions(+)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index 36feb4e91a5db..f3734b85ad261 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1006,6 +1006,75 @@ def roc_curve(
     return fpr, tpr, thresholds


+def cumulative_gain_curve(y_true, y_score, pos_label=None):
+    """Compute the points needed to plot a Cumulative Gains chart.
+
+    For each fraction of the highest-scored samples, the curve gives the
+    fraction of all positive samples that fall within that subset.
+
+    Note: this implementation is restricted to the binary classification
+    task.
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        True labels of the data.
+
+    y_score : array-like of shape (n_samples,)
+        Target scores, can either be probability estimates of the positive
+        class, confidence values, or non-thresholded measure of decisions
+        (as returned by decision_function on some classifiers).
+
+    pos_label : int or str, default=None
+        The label considered as positive; all other labels are considered
+        negative.
+
+    Returns
+    -------
+    percentages : ndarray of shape (n_samples + 1,)
+        The X-axis values for plotting the Cumulative Gains chart: the
+        fraction of samples considered, starting at 0.
+
+    gains : ndarray of shape (n_samples + 1,)
+        The Y-axis values for one curve of the Cumulative Gains chart: the
+        fraction of positive samples recovered at each percentage.
+
+    Raises
+    ------
+    ValueError
+        If ``y_true`` is not composed of 2 classes and ``pos_label`` is not
+        specified. The Cumulative Gains chart is only relevant in binary
+        classification.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn import metrics
+    >>> y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
+    >>> y_pred = [0.95, 0.9, 0.8, 0.7, 0.4, 0.3, 0.65, 0.6, 0.2, 0.1]
+    >>> percentages, gains = metrics.cumulative_gain_curve(y_true, y_pred, pos_label=1)
+    >>> percentages
+    array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
+    >>> gains
+    array([0.  , 0.  , 0.25, 0.5 , 0.5 , 0.75, 1.  , 1.  , 1.  , 1.  , 1.  ])
+    """
+    y_true, y_score = np.asarray(y_true), np.asarray(y_score)
+
+    # ensure binary classification if pos_label is not specified
+    classes = np.unique(y_true)
+    if (pos_label is None and
+            not (np.array_equal(classes, [0, 1]) or
+                 np.array_equal(classes, [-1, 1]) or
+                 np.array_equal(classes, [0]) or
+                 np.array_equal(classes, [-1]) or
+                 np.array_equal(classes, [1]))):
+        raise ValueError("Data is not binary and pos_label is not specified")
+    elif pos_label is None:
+        pos_label = 1.
+
+    # make y_true a boolean vector
+    y_true = (y_true == pos_label)
+
+    # sort samples by decreasing score and accumulate the positives
+    sorted_indices = np.argsort(y_score)[::-1]
+    y_true = y_true[sorted_indices]
+    gains = np.cumsum(y_true)
+
+    percentages = np.arange(start=1, stop=len(y_true) + 1)
+
+    gains = gains / float(np.sum(y_true))
+    percentages = percentages / float(len(y_true))
+
+    # prepend the origin so the curve starts at (0, 0)
+    gains = np.insert(gains, 0, [0])
+    percentages = np.insert(percentages, 0, [0])
+
+    return percentages, gains
+
+
 def label_ranking_average_precision_score(y_true, y_score, *,
                                           sample_weight=None):
     """Compute ranking-based average precision.
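For reviewers who want to try the new API, here is a minimal usage sketch (not part of the patch): it assumes a build of this branch is installed, and the matplotlib calls are purely illustrative. The diagonal is the expected gain when samples are ranked at random, so the area between the two curves gives a quick sense of ranking quality.

    import matplotlib.pyplot as plt
    from sklearn.metrics import cumulative_gain_curve

    y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
    y_score = [0.95, 0.9, 0.8, 0.7, 0.4, 0.3, 0.65, 0.6, 0.2, 0.1]

    # x: fraction of samples inspected (ranked by decreasing score)
    # y: fraction of all positives captured within that top fraction
    percentages, gains = cumulative_gain_curve(y_true, y_score, pos_label=1)

    plt.plot(percentages, gains, label="model")
    plt.plot([0, 1], [0, 1], "--", label="random baseline")
    plt.xlabel("Fraction of samples")
    plt.ylabel("Gain (fraction of positives)")
    plt.legend()
    plt.show()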
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index 01de37b189733..519187ac162c7 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -25,6 +25,7 @@ from sklearn.metrics import det_curve
 from sklearn.metrics import label_ranking_average_precision_score
 from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import cumulative_gain_curve
 from sklearn.metrics import label_ranking_loss
 from sklearn.metrics import roc_auc_score
 from sklearn.metrics import roc_curve

From 468ad22044d344e79cefe5cc0db45745502ba1c7 Mon Sep 17 00:00:00 2001
From: Ali TBER
Date: Mon, 1 Mar 2021 20:31:45 +0100
Subject: [PATCH 2/6] FEA cumulative_gain_curve

---
 sklearn/linear_model/sag_fast.pyx     | 1357 +++++++++++++++++++++++
 sklearn/metrics/__init__.py           |    1 +
 sklearn/metrics/ranking.py            | 1461 +++++++++++++++++++++++++
 sklearn/metrics/tests/test_ranking.py |    6 +-
 sklearn/utils/seq_dataset.pxd         |  116 ++
 sklearn/utils/seq_dataset.pyx         |  653 +++++++++++
 6 files changed, 3593 insertions(+), 1 deletion(-)
 create mode 100644 sklearn/linear_model/sag_fast.pyx
 create mode 100644 sklearn/metrics/ranking.py
 create mode 100644 sklearn/utils/seq_dataset.pxd
 create mode 100644 sklearn/utils/seq_dataset.pyx

diff --git a/sklearn/linear_model/sag_fast.pyx b/sklearn/linear_model/sag_fast.pyx
new file mode 100644
index 0000000000000..6d48b65bda560
--- /dev/null
+++ b/sklearn/linear_model/sag_fast.pyx
@@ -0,0 +1,1357 @@
+
+#------------------------------------------------------------------------------
+
+# cython: cdivision=True
+# cython: boundscheck=False
+# cython: wraparound=False
+#
+# Authors: Danny Sullivan
+#          Tom Dupre la Tour
+#          Arthur Mensch
+#
+# License: BSD 3 clause
+
+cimport numpy as np
+import numpy as np
+from libc.math cimport fabs, exp, log
+from libc.time cimport time, time_t
+
+from .sgd_fast cimport LossFunction
+from .sgd_fast cimport Log, SquaredLoss
+
+from ..utils.seq_dataset cimport SequentialDataset32, SequentialDataset64
+
+from libc.stdio cimport printf
+
+cdef extern from "sgd_fast_helpers.h":
+    bint skl_isfinite32(float) nogil
+    bint skl_isfinite64(double) nogil
+
+cdef inline double fmax64(double x, double y) nogil:
+    if x > y:
+        return x
+    return y
+
+cdef inline float fmax32(float x, float y) nogil:
+    if x > y:
+        return x
+    return y
+
+cdef double _logsumexp64(double* arr, int n_classes) nogil:
+    """Computes the sum of arr assuming arr is in the log domain.
+
+    Returns log(sum(exp(arr))) while minimizing the possibility of
+    over/underflow.
+    """
+    # Use the max to normalize, as with the log this is what accumulates
+    # the less errors
+    cdef double vmax = arr[0]
+    cdef double out = 0.0
+    cdef int i
+
+    for i in range(1, n_classes):
+        if vmax < arr[i]:
+            vmax = arr[i]
+
+    for i in range(n_classes):
+        out += exp(arr[i] - vmax)
+
+    return log(out) + vmax
+
+cdef float _logsumexp32(float* arr, int n_classes) nogil:
+    """Computes the sum of arr assuming arr is in the log domain.
+
+    Returns log(sum(exp(arr))) while minimizing the possibility of
+    over/underflow.
+    """
+    # Use the max to normalize, as with the log this is what accumulates
+    # the less errors
+    cdef float vmax = arr[0]
+    cdef float out = 0.0
+    cdef int i
+
+    for i in range(1, n_classes):
+        if vmax < arr[i]:
+            vmax = arr[i]
+
+    for i in range(n_classes):
+        out += exp(arr[i] - vmax)
+
+    return log(out) + vmax
+
+cdef class MultinomialLogLoss64:
+    cdef double _loss(self, double* prediction, double y, int n_classes,
+                      double sample_weight) nogil:
+        r"""Multinomial Logistic regression loss.
+
+        The multinomial logistic loss for one sample is:
+        loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction))
+             = sw (logsumexp(prediction) - prediction[y])
+
+        where:
+            prediction = dot(x_sample, weights) + intercept
+            \delta_{y,c} = 1 if (y == c) else 0
+            sw = sample_weight
+
+        Parameters
+        ----------
+        prediction : pointer to a np.ndarray[double] of shape (n_classes,)
+            Prediction of the multinomial classifier, for current sample.
+ + y : double, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded). + + n_classes : integer + Total number of classes. + + sample_weight : double + Weight of current sample. + + Returns + ------- + loss : double + Multinomial loss for current sample. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) + cdef double loss + + # y is the indice of the correct class of current sample. + loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight + return loss + + cdef void _dloss(self, double* prediction, double y, int n_classes, + double sample_weight, double* gradient_ptr) nogil: + r"""Multinomial Logistic regression gradient of the loss. + + The gradient of the multinomial logistic loss with respect to a class c, + and for one sample is: + grad_c = - sw * (p[c] - \delta_{y,c}) + + where: + p[c] = exp(logsumexp(prediction) - prediction[c]) + prediction = dot(sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Note that to obtain the true gradient, this value has to be multiplied + by the sample vector x. + + Parameters + ---------- + prediction : pointer to a np.ndarray[double] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : double, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded) + + n_classes : integer + Total number of classes. + + sample_weight : double + Weight of current sample. + + gradient_ptr : pointer to a np.ndarray[double] of shape (n_classes,) + Gradient vector to be filled. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) + cdef int class_ind + + for class_ind in range(n_classes): + gradient_ptr[class_ind] = exp(prediction[class_ind] - + logsumexp_prediction) + + # y is the indice of the correct class of current sample. + if class_ind == y: + gradient_ptr[class_ind] -= 1.0 + + gradient_ptr[class_ind] *= sample_weight + + def __reduce__(self): + return MultinomialLogLoss64, () + +cdef class MultinomialLogLoss32: + cdef float _loss(self, float* prediction, float y, int n_classes, + float sample_weight) nogil: + r"""Multinomial Logistic regression loss. + + The multinomial logistic loss for one sample is: + loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) + = sw (logsumexp(prediction) - prediction[y]) + + where: + prediction = dot(x_sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Parameters + ---------- + prediction : pointer to a np.ndarray[float] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : float, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded). + + n_classes : integer + Total number of classes. + + sample_weight : float + Weight of current sample. + + Returns + ------- + loss : float + Multinomial loss for current sample. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) + cdef float loss + + # y is the indice of the correct class of current sample. 
+ loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight + return loss + + cdef void _dloss(self, float* prediction, float y, int n_classes, + float sample_weight, float* gradient_ptr) nogil: + r"""Multinomial Logistic regression gradient of the loss. + + The gradient of the multinomial logistic loss with respect to a class c, + and for one sample is: + grad_c = - sw * (p[c] - \delta_{y,c}) + + where: + p[c] = exp(logsumexp(prediction) - prediction[c]) + prediction = dot(sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Note that to obtain the true gradient, this value has to be multiplied + by the sample vector x. + + Parameters + ---------- + prediction : pointer to a np.ndarray[float] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : float, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded) + + n_classes : integer + Total number of classes. + + sample_weight : float + Weight of current sample. + + gradient_ptr : pointer to a np.ndarray[float] of shape (n_classes,) + Gradient vector to be filled. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) + cdef int class_ind + + for class_ind in range(n_classes): + gradient_ptr[class_ind] = exp(prediction[class_ind] - + logsumexp_prediction) + + # y is the indice of the correct class of current sample. + if class_ind == y: + gradient_ptr[class_ind] -= 1.0 + + gradient_ptr[class_ind] *= sample_weight + + def __reduce__(self): + return MultinomialLogLoss32, () + +cdef inline double _soft_thresholding64(double x, double shrinkage) nogil: + return fmax64(x - shrinkage, 0) - fmax64(- x - shrinkage, 0) + +cdef inline float _soft_thresholding32(float x, float shrinkage) nogil: + return fmax32(x - shrinkage, 0) - fmax32(- x - shrinkage, 0) + +def sag64(SequentialDataset64 dataset, + np.ndarray[double, ndim=2, mode='c'] weights_array, + np.ndarray[double, ndim=1, mode='c'] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + np.ndarray[double, ndim=2, mode='c'] sum_gradient_init, + np.ndarray[double, ndim=2, mode='c'] gradient_memory_init, + np.ndarray[bint, ndim=1, mode='c'] seen_init, + int num_seen, + bint fit_intercept, + np.ndarray[double, ndim=1, mode='c'] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + Defazio, A., Bach, F., Lacoste-Julien, S. 
(2014), + SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives + https://arxiv.org/abs/1407.0202 + + """ + # the data pointer for x, the current sample + cdef double *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for curent sample + cdef double y + # the sample weight + cdef double sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef double max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef double max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef double wscale_update = 1.0 - step_size * alpha + + # vector of booleans indicating whether this sample has been seen + cdef bint* seen = seen_init.data + + # helper for cumulative sum + cdef double cum_sum + + # the pointer to the coef_ or weights + cdef double* weights = weights_array.data + # the pointer to the intercept_array + cdef double* intercept = intercept_array.data + + # the pointer to the intercept_sum_gradient + cdef double* intercept_sum_gradient = \ + intercept_sum_gradient_init.data + + # the sum of gradients for each feature + cdef double* sum_gradient = sum_gradient_init.data + # the previously seen gradient for each sample + cdef double* gradient_memory = gradient_memory_init.data + + # the cumulative sums needed for JIT params + cdef np.ndarray[double, ndim=1] cumulative_sums_array = \ + np.empty(n_samples, dtype=np.float64, order="c") + cdef double* cumulative_sums = cumulative_sums_array.data + + # the index for the last time this feature was updated + cdef np.ndarray[int, ndim=1] feature_hist_array = \ + np.zeros(n_features, dtype=np.int32, order="c") + cdef int* feature_hist = feature_hist_array.data + + # the previous weights to use to compute stopping criteria + cdef np.ndarray[double, ndim=2] previous_weights_array = \ + np.zeros((n_features, n_classes), dtype=np.float64, order="c") + cdef double* previous_weights = previous_weights_array.data + + cdef np.ndarray[double, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.float64, order="c") + cdef double* prediction = prediction_array.data + + cdef np.ndarray[double, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.float64, order="c") + cdef double* gradient = gradient_array.data + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef double val + + # Bias correction term in saga + cdef double gradient_correction + + # the scalar used for multiplying z + cdef double wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef np.ndarray[double, ndim=1] cumulative_sums_prox_array + cdef double* cumulative_sums_prox + + cdef bint prox = beta > 0 and saga + + # Loss function to 
optimize + cdef LossFunction loss + # Wether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef MultinomialLogLoss64 multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = MultinomialLogLoss64() + elif loss_function == "log": + loss = Log() + elif loss_function == "squared": + loss = SquaredLoss() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox_array = np.empty(n_samples, + dtype=np.float64, order="c") + cumulative_sums_prox = cumulative_sums_prox_array.data + else: + cumulative_sums_prox = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen[sample_ind] == 0: + num_seen += 1 + seen[sample_ind] = 1 + + # make the weight updates + if sample_itr > 0: + status = lagged_update64(weights, wscale, xnnz, + n_samples, n_classes, + sample_itr, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + x_ind_ptr, + False, + n_iter) + if status == -1: + break + + # find the current prediction + predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss._dloss(prediction, y, n_classes, sample_weight, + gradient) + else: + gradient[0] = loss._dloss(prediction[0], y) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. / num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. 
/ num_seen) + if saga: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not skl_isfinite64(intercept[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights64( + weights, &wscale, n_features, n_samples, n_classes, + sample_itr, cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # we scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + status = scale_weights64(weights, &wscale, n_features, + n_samples, + n_classes, n_samples - 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + + if status == -1: + break + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax64(max_weight, fabs(weights[idx])) + max_change = fmax64(max_change, + fabs(weights[idx] - + previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8f\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +def sag32(SequentialDataset32 dataset, + np.ndarray[float, ndim=2, mode='c'] weights_array, + np.ndarray[float, ndim=1, mode='c'] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + np.ndarray[float, ndim=2, mode='c'] sum_gradient_init, + np.ndarray[float, ndim=2, mode='c'] gradient_memory_init, + np.ndarray[bint, ndim=1, mode='c'] seen_init, + int num_seen, + bint fit_intercept, + np.ndarray[float, ndim=1, mode='c'] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + Defazio, A., Bach, F., Lacoste-Julien, S. (2014), + SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives + https://arxiv.org/abs/1407.0202 + + """ + # the data pointer for x, the current sample + cdef float *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for curent sample + cdef float y + # the sample weight + cdef float sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef float max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef float max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef float wscale_update = 1.0 - step_size * alpha + + # vector of booleans indicating whether this sample has been seen + cdef bint* seen = seen_init.data + + # helper for cumulative sum + cdef float cum_sum + + # the pointer to the coef_ or weights + cdef float* weights = weights_array.data + # the pointer to the intercept_array + cdef float* intercept = intercept_array.data + + # the pointer to the intercept_sum_gradient + cdef float* intercept_sum_gradient = \ + intercept_sum_gradient_init.data + + # the sum of gradients for each feature + cdef float* sum_gradient = sum_gradient_init.data + # the previously seen gradient for each sample + cdef float* gradient_memory = gradient_memory_init.data + + # the cumulative sums needed for JIT params + cdef np.ndarray[float, ndim=1] cumulative_sums_array = \ + np.empty(n_samples, dtype=np.float32, order="c") + cdef float* cumulative_sums = cumulative_sums_array.data + + # the index for the last time this feature was updated + cdef np.ndarray[int, ndim=1] feature_hist_array = \ + np.zeros(n_features, dtype=np.int32, order="c") + cdef int* feature_hist = 
feature_hist_array.data + + # the previous weights to use to compute stopping criteria + cdef np.ndarray[float, ndim=2] previous_weights_array = \ + np.zeros((n_features, n_classes), dtype=np.float32, order="c") + cdef float* previous_weights = previous_weights_array.data + + cdef np.ndarray[float, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.float32, order="c") + cdef float* prediction = prediction_array.data + + cdef np.ndarray[float, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.float32, order="c") + cdef float* gradient = gradient_array.data + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef float val + + # Bias correction term in saga + cdef float gradient_correction + + # the scalar used for multiplying z + cdef float wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef np.ndarray[float, ndim=1] cumulative_sums_prox_array + cdef float* cumulative_sums_prox + + cdef bint prox = beta > 0 and saga + + # Loss function to optimize + cdef LossFunction loss + # Wether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef MultinomialLogLoss32 multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = MultinomialLogLoss32() + elif loss_function == "log": + loss = Log() + elif loss_function == "squared": + loss = SquaredLoss() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox_array = np.empty(n_samples, + dtype=np.float32, order="c") + cumulative_sums_prox = cumulative_sums_prox_array.data + else: + cumulative_sums_prox = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen[sample_ind] == 0: + num_seen += 1 + seen[sample_ind] = 1 + + # make the weight updates + if sample_itr > 0: + status = lagged_update32(weights, wscale, xnnz, + n_samples, n_classes, + sample_itr, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + x_ind_ptr, + False, + n_iter) + if status == -1: + break + + # find the current prediction + predict_sample32(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss._dloss(prediction, y, n_classes, sample_weight, + gradient) + else: + gradient[0] = loss._dloss(prediction[0], y) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. 
/ num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. / num_seen) + if saga: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not skl_isfinite32(intercept[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights32( + weights, &wscale, n_features, n_samples, n_classes, + sample_itr, cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # we scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + status = scale_weights32(weights, &wscale, n_features, + n_samples, + n_classes, n_samples - 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + + if status == -1: + break + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax32(max_weight, fabs(weights[idx])) + max_change = fmax32(max_change, + fabs(weights[idx] - + previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8f\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +cdef int scale_weights64(double* weights, double* wscale, + int n_features, + int n_samples, int n_classes, int sample_itr, + double* cumulative_sums, + double* cumulative_sums_prox, + int* feature_hist, + bint prox, + double* sum_gradient, + int n_iter) nogil: + """Scale the weights with wscale for numerical stability. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update64(weights, wscale[0], n_features, + n_samples, n_classes, sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +cdef int scale_weights32(float* weights, float* wscale, + int n_features, + int n_samples, int n_classes, int sample_itr, + float* cumulative_sums, + float* cumulative_sums_prox, + int* feature_hist, + bint prox, + float* sum_gradient, + int n_iter) nogil: + """Scale the weights with wscale for numerical stability. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update32(weights, wscale[0], n_features, + n_samples, n_classes, sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +cdef int lagged_update64(double* weights, double wscale, int xnnz, + int n_samples, int n_classes, int sample_itr, + double* cumulative_sums, + double* cumulative_sums_prox, + int* feature_hist, + bint prox, + double* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter) nogil: + """Hard perform the JIT updates for non-zero features of present sample. + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). 
+ """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef double cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not skl_isfinite64(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. + weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding64(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding64(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not skl_isfinite64(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +cdef int lagged_update32(float* weights, float wscale, int xnnz, + int n_samples, int n_classes, int sample_itr, + float* cumulative_sums, + float* cumulative_sums_prox, + int* feature_hist, + bint prox, + float* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter) nogil: + """Hard perform the JIT updates for non-zero features of present sample. + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). 
+ """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef float cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not skl_isfinite32(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. + weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding32(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding32(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not skl_isfinite32(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +cdef void predict_sample64(double* x_data_ptr, int* x_ind_ptr, int xnnz, + double* w_data_ptr, double wscale, + double* intercept, double* prediction, + int n_classes) nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : double + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. 
+ + """ + cdef int feature_ind, class_ind, j + cdef double innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + +cdef void predict_sample32(float* x_data_ptr, int* x_ind_ptr, int xnnz, + float* w_data_ptr, float wscale, + float* intercept, float* prediction, + int n_classes) nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : float + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. + + """ + cdef int feature_ind, class_ind, j + cdef float innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + + +def _multinomial_grad_loss_all_samples( + SequentialDataset64 dataset, + np.ndarray[double, ndim=2, mode='c'] weights_array, + np.ndarray[double, ndim=1, mode='c'] intercept_array, + int n_samples, int n_features, int n_classes): + """Compute multinomial gradient and loss across all samples. + + Used for testing purpose only. 
+ """ + cdef double* weights = weights_array.data + cdef double* intercept = intercept_array.data + + cdef double *x_data_ptr = NULL + cdef int *x_ind_ptr = NULL + cdef int xnnz = -1 + cdef double y + cdef double sample_weight + + cdef double wscale = 1.0 + cdef int i, j, class_ind, feature_ind + cdef double val + cdef double sum_loss = 0.0 + + cdef MultinomialLogLoss64 multiloss = MultinomialLogLoss64() + + cdef np.ndarray[double, ndim=2] sum_gradient_array = \ + np.zeros((n_features, n_classes), dtype=np.double, order="c") + cdef double* sum_gradient = sum_gradient_array.data + + cdef np.ndarray[double, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.double, order="c") + cdef double* prediction = prediction_array.data + + cdef np.ndarray[double, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.double, order="c") + cdef double* gradient = gradient_array.data + + with nogil: + for i in range(n_samples): + # get next sample on the dataset + dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # prediction of the multinomial classifier for the sample + predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + multiloss._dloss(prediction, y, n_classes, sample_weight, gradient) + + # compute the loss for this sample, given the prediction + sum_loss += multiloss._loss(prediction, y, n_classes, sample_weight) + + # update the sum of the gradient + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + for class_ind in range(n_classes): + sum_gradient[feature_ind * n_classes + class_ind] += \ + gradient[class_ind] * val + + return sum_loss, sum_gradient_array diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index e4339229c5b64..20cb6ed9404b1 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -16,6 +16,7 @@ from ._ranking import roc_auc_score from ._ranking import roc_curve from ._ranking import top_k_accuracy_score +from ._ranking import cumulative_gain_curve from ._classification import accuracy_score from ._classification import balanced_accuracy_score diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py new file mode 100644 index 0000000000000..73dabf40ab5e5 --- /dev/null +++ b/sklearn/metrics/ranking.py @@ -0,0 +1,1461 @@ +"""Metrics to assess performance on classification task given scores + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better +""" + +# Authors: Alexandre Gramfort +# Mathieu Blondel +# Olivier Grisel +# Arnaud Joly +# Jochen Wersdorfer +# Lars Buitinck +# Joel Nothman +# Noel Dawe +# License: BSD 3 clause + + +import warnings +from functools import partial + +import numpy as np +from scipy.sparse import csr_matrix +from scipy.stats import rankdata + +from ..utils import assert_all_finite +from ..utils import check_consistent_length +from ..utils import column_or_1d, check_array +from ..utils.multiclass import type_of_target +from ..utils.extmath import stable_cumsum +from ..utils.sparsefuncs import count_nonzero +from ..exceptions import UndefinedMetricWarning +from ..preprocessing import label_binarize +from ..preprocessing.label import _encode + +from .base import _average_binary_score, _average_multiclass_ovo_score + + +def auc(x, y): + """Compute Area Under the Curve (AUC) using the trapezoidal rule + + 
This is a general function, given points on a curve. For computing the + area under the ROC-curve, see :func:`roc_auc_score`. For an alternative + way to summarize a precision-recall curve, see + :func:`average_precision_score`. + + Parameters + ---------- + x : array, shape = [n] + x coordinates. These must be either monotonic increasing or monotonic + decreasing. + y : array, shape = [n] + y coordinates. + + Returns + ------- + auc : float + + Examples + -------- + >>> import numpy as np + >>> from sklearn import metrics + >>> y = np.array([1, 1, 2, 2]) + >>> pred = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2) + >>> metrics.auc(fpr, tpr) + 0.75 + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + average_precision_score : Compute average precision from prediction scores + precision_recall_curve : + Compute precision-recall pairs for different probability thresholds + """ + check_consistent_length(x, y) + x = column_or_1d(x) + y = column_or_1d(y) + + if x.shape[0] < 2: + raise ValueError('At least 2 points are needed to compute' + ' area under curve, but x.shape = %s' % x.shape) + + direction = 1 + dx = np.diff(x) + if np.any(dx < 0): + if np.all(dx <= 0): + direction = -1 + else: + raise ValueError("x is neither increasing nor decreasing " + ": {}.".format(x)) + + area = direction * np.trapz(y, x) + if isinstance(area, np.memmap): + # Reductions such as .sum used internally in np.trapz do not return a + # scalar by default for numpy.memmap instances contrary to + # regular numpy.ndarray instances. + area = area.dtype.type(area) + return area + + +def average_precision_score(y_true, y_score, average="macro", pos_label=1, + sample_weight=None): + """Compute average precision (AP) from prediction scores + + AP summarizes a precision-recall curve as the weighted mean of precisions + achieved at each threshold, with the increase in recall from the previous + threshold used as the weight: + + .. math:: + \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n + + where :math:`P_n` and :math:`R_n` are the precision and recall at the nth + threshold [1]_. This implementation is not interpolated and is different + from computing the area under the precision-recall curve with the + trapezoidal rule, which uses linear interpolation and can be too + optimistic. + + Note: this implementation is restricted to the binary classification task + or multilabel classification task. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True binary labels or binary label indicators. + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). 
+ ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + pos_label : int or str (default=1) + The label of the positive class. Only applied to binary ``y_true``. + For multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + average_precision : float + + References + ---------- + .. [1] `Wikipedia entry for the Average precision + `_ + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + + precision_recall_curve : + Compute precision-recall pairs for different probability thresholds + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import average_precision_score + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> average_precision_score(y_true, y_scores) + 0.83... + + Notes + ----- + .. versionchanged:: 0.19 + Instead of linearly interpolating between operating points, precisions + are weighted by the change in recall since the last operating point. + """ + def _binary_uninterpolated_average_precision( + y_true, y_score, pos_label=1, sample_weight=None): + precision, recall, _ = precision_recall_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) + # Return the step function integral + # The following works because the last entry of precision is + # guaranteed to be 1, as returned by precision_recall_curve + return -np.sum(np.diff(recall) * np.array(precision)[:-1]) + + y_type = type_of_target(y_true) + if y_type == "multilabel-indicator" and pos_label != 1: + raise ValueError("Parameter pos_label is fixed to 1 for " + "multilabel-indicator y_true. Do not set " + "pos_label or set pos_label to 1.") + elif y_type == "binary": + present_labels = np.unique(y_true) + if len(present_labels) == 2 and pos_label not in present_labels: + raise ValueError("pos_label=%r is invalid. Set it to a label in " + "y_true." % pos_label) + average_precision = partial(_binary_uninterpolated_average_precision, + pos_label=pos_label) + return _average_binary_score(average_precision, y_true, y_score, + average, sample_weight=sample_weight) + + +def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): + """Binary roc auc score""" + if len(np.unique(y_true)) != 2: + raise ValueError("Only one class present in y_true. ROC AUC score " + "is not defined in that case.") + + fpr, tpr, _ = roc_curve(y_true, y_score, + sample_weight=sample_weight) + if max_fpr is None or max_fpr == 1: + return auc(fpr, tpr) + if max_fpr <= 0 or max_fpr > 1: + raise ValueError("Expected max_fpr in range (0, 1], got: %r" % max_fpr) + + # Add a single point at max_fpr by linear interpolation + stop = np.searchsorted(fpr, max_fpr, 'right') + x_interp = [fpr[stop - 1], fpr[stop]] + y_interp = [tpr[stop - 1], tpr[stop]] + tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp)) + fpr = np.append(fpr[:stop], max_fpr) + partial_auc = auc(fpr, tpr) + + # McClish correction: standardize result to be 0.5 if non-discriminant + # and 1 if maximal + min_area = 0.5 * max_fpr**2 + max_area = max_fpr + return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) + + +def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, + max_fpr=None, multi_class="raise", labels=None): + """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) + from prediction scores. 
+ + Note: this implementation is restricted to the binary classification task + or multilabel classification task in label indicator format. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True binary labels or binary label indicators. + The multiclass case expects shape = [n_samples] and labels + with values in ``range(n_classes)``. + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). For binary + y_true, y_score is supposed to be the score of the class with greater + label. The multiclass case expects shape = [n_samples, n_classes] + where the scores correspond to probability estimates. + + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + max_fpr : float > 0 and <= 1, optional + If not ``None``, the standardized partial AUC [3]_ over the range + [0, max_fpr] is returned. For the multiclass case, ``max_fpr``, + should be either equal to ``None`` or ``1.0`` as AUC ROC partial + computation currently is not supported for multiclass. + + multi_class : string, 'ovr' or 'ovo', optional(default='raise') + Determines the type of multiclass configuration to use. + ``multi_class`` must be provided when ``y_true`` is multiclass. + + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + labels : array, shape = [n_classes] or None, optional (default=None) + List of labels to index ``y_score`` used for multiclass. If ``None``, + the lexicon order of ``y_true`` is used to index ``y_score``. + + Returns + ------- + auc : float + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. + + .. [3] `Analyzing a portion of the ROC curve. 
McClish, 1989 + `_ + + See also + -------- + average_precision_score : Area under the precision-recall curve + + roc_curve : Compute Receiver operating characteristic (ROC) curve + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import roc_auc_score + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> roc_auc_score(y_true, y_scores) + 0.75 + + """ + + y_type = type_of_target(y_true) + y_true = check_array(y_true, ensure_2d=False, dtype=None) + y_score = check_array(y_score, ensure_2d=False) + + if y_type == "multiclass" or (y_type == "binary" and + y_score.ndim == 2 and + y_score.shape[1] > 2): + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.: + raise ValueError("Partial AUC computation not available in " + "multiclass setting, 'max_fpr' must be" + " set to `None`, received `max_fpr={0}` " + "instead".format(max_fpr)) + if multi_class == 'raise': + raise ValueError("multi_class must be in ('ovo', 'ovr')") + return _multiclass_roc_auc_score(y_true, y_score, labels, + multi_class, average, sample_weight) + elif y_type == "binary": + labels = np.unique(y_true) + y_true = label_binarize(y_true, labels)[:, 0] + return _average_binary_score(partial(_binary_roc_auc_score, + max_fpr=max_fpr), + y_true, y_score, average, + sample_weight=sample_weight) + else: # multilabel-indicator + return _average_binary_score(partial(_binary_roc_auc_score, + max_fpr=max_fpr), + y_true, y_score, average, + sample_weight=sample_weight) + + +def _multiclass_roc_auc_score(y_true, y_score, labels, + multi_class, average, sample_weight): + """Multiclass roc auc score + + Parameters + ---------- + y_true : array-like, shape = (n_samples, ) + True multiclass labels. + + y_score : array-like, shape = (n_samples, n_classes) + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + labels : array, shape = [n_classes] or None, optional (default=None) + List of labels to index ``y_score`` used for multiclass. If ``None``, + the lexical order of ``y_true`` is used to index ``y_score``. + + multi_class : string, 'ovr' or 'ovo' + Determines the type of multiclass configuration to use. + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + average : 'macro' or 'weighted', optional (default='macro') + Determines the type of averaging performed on the pairwise binary + metric scores + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the + prevalence of the classes. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + """ + # validation of the input y_score + if not np.allclose(1, y_score.sum(axis=1)): + raise ValueError( + "Target scores need to be probabilities for multiclass " + "roc_auc, i.e. 
they should sum up to 1.0 over classes") + + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") + if average not in average_options: + raise ValueError("average must be one of {0} for " + "multiclass problems".format(average_options)) + + multiclass_options = ("ovo", "ovr") + if multi_class not in multiclass_options: + raise ValueError("multi_class='{0}' is not supported " + "for multiclass ROC AUC, multi_class must be " + "in {1}".format( + multi_class, multiclass_options)) + + if labels is not None: + labels = column_or_1d(labels) + classes = _encode(labels) + if len(classes) != len(labels): + raise ValueError("Parameter 'labels' must be unique") + if not np.array_equal(classes, labels): + raise ValueError("Parameter 'labels' must be ordered") + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of given labels, {0}, not equal to the number " + "of columns in 'y_score', {1}".format( + len(classes), y_score.shape[1])) + if len(np.setdiff1d(y_true, classes)): + raise ValueError( + "'y_true' contains labels not in parameter 'labels'") + else: + classes = _encode(y_true) + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of classes in y_true not equal to the number of " + "columns in 'y_score'") + + if multi_class == "ovo": + if sample_weight is not None: + raise ValueError("sample_weight is not supported " + "for multiclass one-vs-one ROC AUC, " + "'sample_weight' must be None in this case.") + _, y_true_encoded = _encode(y_true, uniques=classes, encode=True) + # Hand & Till (2001) implementation (ovo) + return _average_multiclass_ovo_score(_binary_roc_auc_score, + y_true_encoded, + y_score, average=average) + else: + # ovr is same as multi-label + y_true_multilabel = label_binarize(y_true, classes) + return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, + y_score, average, + sample_weight=sample_weight) + + +def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): + """Calculate true and false positives per binary classification threshold. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True targets of binary classification + + y_score : array, shape = [n_samples] + Estimated probabilities or decision function + + pos_label : int or str, default=None + The label of the positive class + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + fps : array, shape = [n_thresholds] + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + tps : array, shape = [n_thresholds <= len(np.unique(y_score))] + An increasing count of true positives, at index i being the number + of positive samples assigned a score >= thresholds[i]. The total + number of positive samples is equal to tps[-1] (thus false negatives + are given by tps[-1] - tps). + + thresholds : array, shape = [n_thresholds] + Decreasing score values. 
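+
+ For illustration (a sketch, not a doctest; tied scores would merge
+ rows): with ``y_true = [0, 0, 1, 1]`` and
+ ``y_score = [0.1, 0.4, 0.35, 0.8]``, ranking by decreasing score gives
+ ``fps = [0, 1, 1, 2]``, ``tps = [1, 1, 2, 2]`` and
+ ``thresholds = [0.8, 0.4, 0.35, 0.1]``.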
+ """ + # Check to make sure y_true is valid + y_type = type_of_target(y_true) + if not (y_type == "binary" or + (y_type == "multiclass" and pos_label is not None)): + raise ValueError("{0} format is not supported".format(y_type)) + + check_consistent_length(y_true, y_score, sample_weight) + y_true = column_or_1d(y_true) + y_score = column_or_1d(y_score) + assert_all_finite(y_true) + assert_all_finite(y_score) + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + + # ensure binary classification if pos_label is not specified + classes = np.unique(y_true) + if (pos_label is None and + not (np.array_equal(classes, [0, 1]) or + np.array_equal(classes, [-1, 1]) or + np.array_equal(classes, [0]) or + np.array_equal(classes, [-1]) or + np.array_equal(classes, [1]))): + raise ValueError("Data is not binary and pos_label is not specified") + elif pos_label is None: + pos_label = 1. + + # make y_true a boolean vector + y_true = (y_true == pos_label) + + # sort scores and corresponding truth values + desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + if sample_weight is not None: + weight = sample_weight[desc_score_indices] + else: + weight = 1. + + # y_score typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = np.where(np.diff(y_score))[0] + threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] + + # accumulate the true positives with decreasing threshold + tps = stable_cumsum(y_true * weight)[threshold_idxs] + if sample_weight is not None: + # express fps as a cumsum to ensure fps is increasing even in + # the presence of floating point errors + fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs] + else: + fps = 1 + threshold_idxs - tps + return fps, tps, y_score[threshold_idxs] + + +def precision_recall_curve(y_true, probas_pred, pos_label=None, + sample_weight=None): + """Compute precision-recall pairs for different probability thresholds + + Note: this implementation is restricted to the binary classification task. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The last precision and recall values are 1. and 0. respectively and do not + have a corresponding threshold. This ensures that the graph starts on the + y axis. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + probas_pred : array, shape = [n_samples] + Estimated probabilities or decision function. + + pos_label : int or str, default=None + The label of the positive class. + When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. 
+ + Returns + ------- + precision : array, shape = [n_thresholds + 1] + Precision values such that element i is the precision of + predictions with score >= thresholds[i] and the last element is 1. + + recall : array, shape = [n_thresholds + 1] + Decreasing recall values such that element i is the recall of + predictions with score >= thresholds[i] and the last element is 0. + + thresholds : array, shape = [n_thresholds <= len(np.unique(probas_pred))] + Increasing thresholds on the decision function used to compute + precision and recall. + + See also + -------- + average_precision_score : Compute average precision from prediction scores + + roc_curve : Compute Receiver operating characteristic (ROC) curve + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import precision_recall_curve + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> precision, recall, thresholds = precision_recall_curve( + ... y_true, y_scores) + >>> precision + array([0.66666667, 0.5 , 1. , 1. ]) + >>> recall + array([1. , 0.5, 0.5, 0. ]) + >>> thresholds + array([0.35, 0.4 , 0.8 ]) + + """ + fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred, + pos_label=pos_label, + sample_weight=sample_weight) + + precision = tps / (tps + fps) + precision[np.isnan(precision)] = 0 + recall = tps / tps[-1] + + # stop when full recall attained + # and reverse the outputs so recall is decreasing + last_ind = tps.searchsorted(tps[-1]) + sl = slice(last_ind, None, -1) + return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] + + +def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, + drop_intermediate=True): + """Compute Receiver operating characteristic (ROC) + + Note: this implementation is restricted to the binary classification task. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + y_true : array, shape = [n_samples] + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : array, shape = [n_samples] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + pos_label : int or str, default=None + The label of the positive class. + When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + drop_intermediate : boolean, optional (default=True) + Whether to drop some suboptimal thresholds which would not appear + on a plotted ROC curve. This is useful in order to create lighter + ROC curves. + + .. versionadded:: 0.17 + parameter *drop_intermediate*. + + Returns + ------- + fpr : array, shape = [>2] + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= thresholds[i]. + + tpr : array, shape = [>2] + Increasing true positive rates such that element i is the true + positive rate of predictions with score >= thresholds[i]. + + thresholds : array, shape = [n_thresholds] + Decreasing thresholds on the decision function used to compute + fpr and tpr. `thresholds[0]` represents no instances being predicted + and is arbitrarily set to `max(y_score) + 1`. 
+
+ See also
+ --------
+ roc_auc_score : Compute the area under the ROC curve
+
+ Notes
+ -----
+ Since the thresholds are sorted from low to high values, they
+ are reversed upon returning them to ensure they correspond to both ``fpr``
+ and ``tpr``, which are sorted in reversed order during their calculation.
+
+ References
+ ----------
+ .. [1] `Wikipedia entry for the Receiver operating characteristic
+ `_
+
+ .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
+ Letters, 2006, 27(8):861-874.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn import metrics
+ >>> y = np.array([1, 1, 2, 2])
+ >>> scores = np.array([0.1, 0.4, 0.35, 0.8])
+ >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
+ >>> fpr
+ array([0. , 0. , 0.5, 0.5, 1. ])
+ >>> tpr
+ array([0. , 0.5, 0.5, 1. , 1. ])
+ >>> thresholds
+ array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ])
+
+ """
+ fps, tps, thresholds = _binary_clf_curve(
+ y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)
+
+ # Attempt to drop thresholds corresponding to points in between and
+ # collinear with other points. These are always suboptimal and do not
+ # appear on a plotted ROC curve (and thus do not affect the AUC).
+ # Here np.diff(_, 2) is used as a "second derivative" to tell if there
+ # is a corner at the point. Both fps and tps must be tested to handle
+ # thresholds with multiple data points (which are combined in
+ # _binary_clf_curve). This keeps all cases where the point should be kept,
+ # but does not drop more complicated cases like fps = [1, 3, 7],
+ # tps = [1, 2, 4]; there is no harm in keeping too many thresholds.
+ if drop_intermediate and len(fps) > 2:
+ optimal_idxs = np.where(np.r_[True,
+ np.logical_or(np.diff(fps, 2),
+ np.diff(tps, 2)),
+ True])[0]
+ fps = fps[optimal_idxs]
+ tps = tps[optimal_idxs]
+ thresholds = thresholds[optimal_idxs]
+
+ # Add an extra threshold position
+ # to make sure that the curve starts at (0, 0)
+ tps = np.r_[0, tps]
+ fps = np.r_[0, fps]
+ thresholds = np.r_[thresholds[0] + 1, thresholds]
+
+ if fps[-1] <= 0:
+ warnings.warn("No negative samples in y_true, "
+ "false positive value should be meaningless",
+ UndefinedMetricWarning)
+ fpr = np.repeat(np.nan, fps.shape)
+ else:
+ fpr = fps / fps[-1]
+
+ if tps[-1] <= 0:
+ warnings.warn("No positive samples in y_true, "
+ "true positive value should be meaningless",
+ UndefinedMetricWarning)
+ tpr = np.repeat(np.nan, tps.shape)
+ else:
+ tpr = tps / tps[-1]
+
+ return fpr, tpr, thresholds
+
+
+def cumulative_gain_curve(y_true, y_score, pos_label=None):
+ """Compute the cumulative gain curve.
+ Note: This implementation is restricted to the binary classification task.
+
+ Parameters
+ ----------
+ y_true : array-like, shape = [n_samples]
+ True labels of the data.
+
+ y_score : array-like, shape = [n_samples]
+ Target scores, can either be probability estimates of the positive
+ class, confidence values, or non-thresholded measure of decisions
+ (as returned by "decision_function" on some classifiers).
+
+ pos_label : int or str, default=None
+ Label considered as positive; all other samples are considered
+ negative.
+
+ Returns
+ -------
+ percentages : numpy.ndarray
+ An array containing the X-axis values for plotting the Cumulative
+ Gains chart, i.e. the increasing fractions of samples considered.
+
+ gains : numpy.ndarray
+ An array containing the Y-axis values for one curve of the
+ Cumulative Gains chart, i.e. the fraction of positive samples
+ recovered at each percentage.
+
+ Raises
+ ------
+ ValueError
+ If `y_true` is not composed of 2 classes. The Cumulative Gain
+ Chart is only relevant in binary classification.
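+
+ Notes
+ -----
+ Samples are sorted by decreasing ``y_score``; ``gains[i]`` is the
+ fraction of all positive samples found among the ``percentages[i]``
+ highest-scored samples. The point ``(0, 0)`` is prepended so that the
+ curve starts at the origin.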
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn import metrics
+ >>> y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
+ >>> y_pred = [0.1, 0.8, 0.9, 0, 3, 0.4, 0.6, 0.6, 0.6, 0.44]
+ >>> percentages, gains = metrics.cumulative_gain_curve(y_true, y_pred, pos_label=1)
+ >>> percentages
+ array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
+ >>> gains
+ array([0. , 0. , 0.25, 0.5 , 0.5 , 0.75, 1. , 1. , 1. , 1. , 1. ])
+
+ """
+ y_true, y_score = np.asarray(y_true), np.asarray(y_score)
+
+ # ensure binary classification if pos_label is not specified
+ classes = np.unique(y_true)
+ if (pos_label is None and
+ not (np.array_equal(classes, [0, 1]) or
+ np.array_equal(classes, [-1, 1]) or
+ np.array_equal(classes, [0]) or
+ np.array_equal(classes, [-1]) or
+ np.array_equal(classes, [1]))):
+ raise ValueError("Data is not binary and pos_label is not specified")
+ elif pos_label is None:
+ pos_label = 1.
+
+ # make y_true a boolean vector
+ y_true = (y_true == pos_label)
+
+ # sort by decreasing score; a stable sort keeps the tie handling, and
+ # hence the documented example output, deterministic
+ sorted_indices = np.argsort(y_score, kind="mergesort")[::-1]
+ y_true = y_true[sorted_indices]
+ gains = np.cumsum(y_true)
+
+ percentages = np.arange(start=1, stop=len(y_true) + 1)
+
+ gains = gains / float(np.sum(y_true))
+ percentages = percentages / float(len(y_true))
+
+ gains = np.insert(gains, 0, [0])
+ percentages = np.insert(percentages, 0, [0])
+
+ return percentages, gains
+
+
+def label_ranking_average_precision_score(y_true, y_score, sample_weight=None):
+ """Compute ranking-based average precision
+
+ Label ranking average precision (LRAP) is the average over each ground
+ truth label assigned to each sample, of the ratio of true vs. total
+ labels with lower score.
+
+ This metric is used in multilabel ranking problems, where the goal
+ is to give a better rank to the labels associated with each sample.
+
+ The obtained score is always strictly greater than 0 and
+ the best value is 1.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ y_true : array or sparse matrix, shape = [n_samples, n_labels]
+ True binary labels in binary indicator format.
+
+ y_score : array, shape = [n_samples, n_labels]
+ Target scores, can either be probability estimates of the positive
+ class, confidence values, or non-thresholded measure of decisions
+ (as returned by "decision_function" on some classifiers).
+
+ sample_weight : array-like of shape = [n_samples], optional
+ Sample weights.
+
+ Returns
+ -------
+ score : float
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.metrics import label_ranking_average_precision_score
+ >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
+ >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
+ >>> label_ranking_average_precision_score(y_true, y_score)
+ 0.416...
+
+ """
+ check_consistent_length(y_true, y_score, sample_weight)
+ y_true = check_array(y_true, ensure_2d=False)
+ y_score = check_array(y_score, ensure_2d=False)
+
+ if y_true.shape != y_score.shape:
+ raise ValueError("y_true and y_score have different shape")
+
+ # Handle badly formatted array and the degenerate case with one label
+ y_type = type_of_target(y_true)
+ if (y_type != "multilabel-indicator" and
+ not (y_type == "binary" and y_true.ndim == 2)):
+ raise ValueError("{0} format is not supported".format(y_type))
+
+ y_true = csr_matrix(y_true)
+ y_score = -y_score
+
+ n_samples, n_labels = y_true.shape
+
+ out = 0.
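+ # For each sample, average over its relevant labels the ratio
+ # (number of relevant labels ranked at least as high) /
+ # (number of labels ranked at least as high), i.e. the precision at
+ # each relevant label; ``out`` accumulates the (weighted) sample scores.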
+
+ for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):
+ relevant = y_true.indices[start:stop]
+
+ if (relevant.size == 0 or relevant.size == n_labels):
+ # If all labels are relevant or irrelevant, the score is also
+ # equal to 1. The label ranking has no meaning.
+ aux = 1.
+ else:
+ scores_i = y_score[i]
+ rank = rankdata(scores_i, 'max')[relevant]
+ L = rankdata(scores_i[relevant], 'max')
+ aux = (L / rank).mean()
+
+ if sample_weight is not None:
+ aux = aux * sample_weight[i]
+ out += aux
+
+ if sample_weight is None:
+ out /= n_samples
+ else:
+ out /= np.sum(sample_weight)
+
+ return out
+
+
+def coverage_error(y_true, y_score, sample_weight=None):
+ """Coverage error measure
+
+ Compute how far we need to go through the ranked scores to cover all
+ true labels. The best value is equal to the average number
+ of labels in ``y_true`` per sample.
+
+ Ties in ``y_score`` are broken by giving the maximal rank that would have
+ been assigned to all tied values.
+
+ Note: Our implementation's score is 1 greater than the one given in
+ Tsoumakas et al., 2010. This extends it to handle the degenerate case
+ in which an instance has 0 true labels.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ y_true : array, shape = [n_samples, n_labels]
+ True binary labels in binary indicator format.
+
+ y_score : array, shape = [n_samples, n_labels]
+ Target scores, can either be probability estimates of the positive
+ class, confidence values, or non-thresholded measure of decisions
+ (as returned by "decision_function" on some classifiers).
+
+ sample_weight : array-like of shape = [n_samples], optional
+ Sample weights.
+
+ Returns
+ -------
+ coverage_error : float
+
+ References
+ ----------
+ .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
+ Mining multi-label data. In Data mining and knowledge discovery
+ handbook (pp. 667-685). Springer US.
+
+ """
+ y_true = check_array(y_true, ensure_2d=False)
+ y_score = check_array(y_score, ensure_2d=False)
+ check_consistent_length(y_true, y_score, sample_weight)
+
+ y_type = type_of_target(y_true)
+ if y_type != "multilabel-indicator":
+ raise ValueError("{0} format is not supported".format(y_type))
+
+ if y_true.shape != y_score.shape:
+ raise ValueError("y_true and y_score have different shape")
+
+ y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true))
+ y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1))
+ coverage = (y_score >= y_min_relevant).sum(axis=1)
+ coverage = coverage.filled(0)
+
+ return np.average(coverage, weights=sample_weight)
+
+
+def label_ranking_loss(y_true, y_score, sample_weight=None):
+ """Compute Ranking loss measure
+
+ Compute the average number of label pairs that are incorrectly ordered
+ given y_score weighted by the size of the label set and the number of
+ labels not in the label set.
+
+ This is similar to the error set size, but weighted by the number of
+ relevant and irrelevant labels. The best performance is achieved with
+ a ranking loss of zero.
+
+ Read more in the :ref:`User Guide `.
+
+ .. versionadded:: 0.17
+ A function *label_ranking_loss*
+
+ Parameters
+ ----------
+ y_true : array or sparse matrix, shape = [n_samples, n_labels]
+ True binary labels in binary indicator format.
+
+ y_score : array, shape = [n_samples, n_labels]
+ Target scores, can either be probability estimates of the positive
+ class, confidence values, or non-thresholded measure of decisions
+ (as returned by "decision_function" on some classifiers).
+
+ sample_weight : array-like of shape = [n_samples], optional
+ Sample weights.
+
+ Returns
+ -------
+ loss : float
+
+ References
+ ----------
+ .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
+ Mining multi-label data. In Data mining and knowledge discovery
+ handbook (pp. 667-685). Springer US.
+
+ """
+ y_true = check_array(y_true, ensure_2d=False, accept_sparse='csr')
+ y_score = check_array(y_score, ensure_2d=False)
+ check_consistent_length(y_true, y_score, sample_weight)
+
+ y_type = type_of_target(y_true)
+ if y_type not in ("multilabel-indicator",):
+ raise ValueError("{0} format is not supported".format(y_type))
+
+ if y_true.shape != y_score.shape:
+ raise ValueError("y_true and y_score have different shape")
+
+ n_samples, n_labels = y_true.shape
+
+ y_true = csr_matrix(y_true)
+
+ loss = np.zeros(n_samples)
+ for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):
+ # Sort and bin the label scores
+ unique_scores, unique_inverse = np.unique(y_score[i],
+ return_inverse=True)
+ true_at_reversed_rank = np.bincount(
+ unique_inverse[y_true.indices[start:stop]],
+ minlength=len(unique_scores))
+ all_at_reversed_rank = np.bincount(unique_inverse,
+ minlength=len(unique_scores))
+ false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank
+
+ # if the scores are ordered, it's possible to count the number of
+ # incorrectly ordered pairs in linear time by cumulatively counting
+ # how many false labels of a given score have a score higher than the
+ # accumulated true labels with lower score.
+ loss[i] = np.dot(true_at_reversed_rank.cumsum(),
+ false_at_reversed_rank)
+
+ n_positives = count_nonzero(y_true, axis=1)
+ with np.errstate(divide="ignore", invalid="ignore"):
+ loss /= ((n_labels - n_positives) * n_positives)
+
+ # When there are no positive or no negative labels, those values should
+ # be considered as correct, i.e. the ranking doesn't matter.
+ loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.
+
+ return np.average(loss, weights=sample_weight)
+
+
+def _dcg_sample_scores(y_true, y_score, k=None,
+ log_base=2, ignore_ties=False):
+ """Compute Discounted Cumulative Gain.
+
+ Sum the true scores ranked in the order induced by the predicted scores,
+ after applying a logarithmic discount.
+
+ This ranking metric yields a high value if true labels are ranked high by
+ ``y_score``.
+
+ Parameters
+ ----------
+ y_true : ndarray, shape (n_samples, n_labels)
+ True targets of multilabel classification, or true scores of entities
+ to be ranked.
+
+ y_score : ndarray, shape (n_samples, n_labels)
+ Target scores, can either be probability estimates, confidence values,
+ or non-thresholded measure of decisions (as returned by
+ "decision_function" on some classifiers).
+
+ k : int, optional (default=None)
+ Only consider the highest k scores in the ranking. If None, use all
+ outputs.
+
+ log_base : float, optional (default=2)
+ Base of the logarithm used for the discount. A low value means a
+ sharper discount (top results are more important).
+
+ ignore_ties : bool, optional (default=False)
+ Assume that there are no ties in y_score (which is likely to be the
+ case if y_score is continuous) for efficiency gains.
+
+ Returns
+ -------
+ discounted_cumulative_gain : ndarray, shape (n_samples,)
+ The DCG score for each sample.
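+
+ With ``r_i`` the true gain of the item ranked at (1-based) position
+ ``i`` by ``y_score``, each score is ``sum_i r_i / log_b(i + 1)`` with
+ ``b = log_base``, the sum running over the top ``k`` positions when
+ ``k`` is given (ties averaged as described in ``_tie_averaged_dcg``).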
+ + See also + -------- + ndcg_score : + The Discounted Cumulative Gain divided by the Ideal Discounted + Cumulative Gain (the DCG obtained for a perfect ranking), in order to + have a score between 0 and 1. + + """ + discount = 1 / (np.log(np.arange(y_true.shape[1]) + 2) / np.log(log_base)) + if k is not None: + discount[k:] = 0 + if ignore_ties: + ranking = np.argsort(y_score)[:, ::-1] + ranked = y_true[np.arange(ranking.shape[0])[:, np.newaxis], ranking] + cumulative_gains = discount.dot(ranked.T) + else: + discount_cumsum = np.cumsum(discount) + cumulative_gains = [_tie_averaged_dcg(y_t, y_s, discount_cumsum) + for y_t, y_s in zip(y_true, y_score)] + cumulative_gains = np.asarray(cumulative_gains) + return cumulative_gains + + +def _tie_averaged_dcg(y_true, y_score, discount_cumsum): + """ + Compute DCG by averaging over possible permutations of ties. + + The gain (`y_true`) of an index falling inside a tied group (in the order + induced by `y_score`) is replaced by the average gain within this group. + The discounted gain for a tied group is then the average `y_true` within + this group times the sum of discounts of the corresponding ranks. + + This amounts to averaging scores for all possible orderings of the tied + groups. + + (note in the case of dcg@k the discount is 0 after index k) + + Parameters + ---------- + y_true : ndarray + The true relevance scores + + y_score : ndarray + Predicted scores + + discount_cumsum : ndarray + Precomputed cumulative sum of the discounts. + + Returns + ------- + The discounted cumulative gain. + + References + ---------- + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. + + """ + _, inv, counts = np.unique( + - y_score, return_inverse=True, return_counts=True) + ranked = np.zeros(len(counts)) + np.add.at(ranked, inv, y_true) + ranked /= counts + groups = np.cumsum(counts) - 1 + discount_sums = np.empty(len(counts)) + discount_sums[0] = discount_cumsum[groups[0]] + discount_sums[1:] = np.diff(discount_cumsum[groups]) + return (ranked * discount_sums).sum() + + +def _check_dcg_target_type(y_true): + y_type = type_of_target(y_true) + supported_fmt = ("multilabel-indicator", "continuous-multioutput", + "multiclass-multioutput") + if y_type not in supported_fmt: + raise ValueError( + "Only {} formats are supported. Got {} instead".format( + supported_fmt, y_type)) + + +def dcg_score(y_true, y_score, k=None, + log_base=2, sample_weight=None, ignore_ties=False): + """Compute Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. + + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Usually the Normalized Discounted Cumulative Gain (NDCG, computed by + ndcg_score) is preferred. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. + + y_score : ndarray, shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, optional (default=None) + Only consider the highest k scores in the ranking. If None, use all + outputs. 
+
+ log_base : float, optional (default=2)
+ Base of the logarithm used for the discount. A low value means a
+ sharper discount (top results are more important).
+
+ sample_weight : ndarray, shape (n_samples,), optional (default=None)
+ Sample weights. If None, all samples are given the same weight.
+
+ ignore_ties : bool, optional (default=False)
+ Assume that there are no ties in y_score (which is likely to be the
+ case if y_score is continuous) for efficiency gains.
+
+ Returns
+ -------
+ discounted_cumulative_gain : float
+ The averaged sample DCG scores.
+
+ See also
+ --------
+ ndcg_score :
+ The Discounted Cumulative Gain divided by the Ideal Discounted
+ Cumulative Gain (the DCG obtained for a perfect ranking), in order to
+ have a score between 0 and 1.
+
+ References
+ ----------
+ `Wikipedia entry for Discounted Cumulative Gain
+ `_
+
+ Jarvelin, K., & Kekalainen, J. (2002).
+ Cumulated gain-based evaluation of IR techniques. ACM Transactions on
+ Information Systems (TOIS), 20(4), 422-446.
+
+ Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).
+ A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th
+ Annual Conference on Learning Theory (COLT 2013)
+
+ McSherry, F., & Najork, M. (2008, March). Computing information retrieval
+ performance measures efficiently in the presence of tied scores. In
+ European conference on information retrieval (pp. 414-421). Springer,
+ Berlin, Heidelberg.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.metrics import dcg_score
+ >>> # we have ground-truth relevance of some answers to a query:
+ >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])
+ >>> # we predict scores for the answers
+ >>> scores = np.asarray([[.1, .2, .3, 4, 70]])
+ >>> dcg_score(true_relevance, scores) # doctest: +ELLIPSIS
+ 9.49...
+ >>> # we can set k to truncate the sum; only top k answers contribute
+ >>> dcg_score(true_relevance, scores, k=2) # doctest: +ELLIPSIS
+ 5.63...
+ >>> # now we have some ties in our prediction
+ >>> scores = np.asarray([[1, 0, 0, 0, 1]])
+ >>> # by default ties are averaged, so here we get the average true
+ >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5
+ >>> dcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS
+ 7.5
+ >>> # we can choose to ignore ties for faster results, but only
+ >>> # if we know there aren't ties in our scores, otherwise we get
+ >>> # wrong results:
+ >>> dcg_score(true_relevance,
+ ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS
+ 5.0
+
+ """
+ y_true = check_array(y_true, ensure_2d=False)
+ y_score = check_array(y_score, ensure_2d=False)
+ check_consistent_length(y_true, y_score, sample_weight)
+ _check_dcg_target_type(y_true)
+ return np.average(
+ _dcg_sample_scores(
+ y_true, y_score, k=k, log_base=log_base,
+ ignore_ties=ignore_ties),
+ weights=sample_weight)
+
+
+def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False):
+ """Compute Normalized Discounted Cumulative Gain.
+
+ Sum the true scores ranked in the order induced by the predicted scores,
+ after applying a logarithmic discount. Then divide by the best possible
+ score (Ideal DCG, obtained for a perfect ranking) to obtain a score between
+ 0 and 1.
+
+ This ranking metric yields a high value if true labels are ranked high by
+ ``y_score``.
+
+ Parameters
+ ----------
+ y_true : ndarray, shape (n_samples, n_labels)
+ True targets of multilabel classification, or true scores of entities
+ to be ranked.
+ + y_score : ndarray, shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, optional (default=None) + Only consider the highest k scores in the ranking. If None, use all + outputs. + + ignore_ties : bool, optional (default=False) + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + normalized_discounted_cumulative_gain : ndarray, shape (n_samples,) + The NDCG score for each sample (float in [0., 1.]). + + See also + -------- + dcg_score : Discounted Cumulative Gain (not normalized). + + """ + gain = _dcg_sample_scores(y_true, y_score, k, ignore_ties=ignore_ties) + # Here we use the order induced by y_true so we can ignore ties since + # the gain associated to tied indices is the same (permuting ties doesn't + # change the value of the re-ordered y_true) + normalizing_gain = _dcg_sample_scores(y_true, y_true, k, ignore_ties=True) + all_irrelevant = normalizing_gain == 0 + gain[all_irrelevant] = 0 + gain[~all_irrelevant] /= normalizing_gain[~all_irrelevant] + return gain + + +def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): + """Compute Normalized Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. Then divide by the best possible + score (Ideal DCG, obtained for a perfect ranking) to obtain a score between + 0 and 1. + + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. + + y_score : ndarray, shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, optional (default=None) + Only consider the highest k scores in the ranking. If None, use all + outputs. + + sample_weight : ndarray, shape (n_samples,), optional (default=None) + Sample weights. If None, all samples are given the same weight. + + ignore_ties : bool, optional (default=False) + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + normalized_discounted_cumulative_gain : float in [0., 1.] + The averaged NDCG scores for all samples. + + See also + -------- + dcg_score : Discounted Cumulative Gain (not normalized). + + References + ---------- + `Wikipedia entry for Discounted Cumulative Gain + `_ + + Jarvelin, K., & Kekalainen, J. (2002). + Cumulated gain-based evaluation of IR techniques. ACM Transactions on + Information Systems (TOIS), 20(4), 422-446. + + Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May). + A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th + Annual Conference on Learning Theory (COLT 2013) + + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. 
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.metrics import ndcg_score
+ >>> # we have ground-truth relevance of some answers to a query:
+ >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])
+ >>> # we predict some scores (relevance) for the answers
+ >>> scores = np.asarray([[.1, .2, .3, 4, 70]])
+ >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS
+ 0.69...
+ >>> scores = np.asarray([[.05, 1.1, 1., .5, .0]])
+ >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS
+ 0.49...
+ >>> # we can set k to truncate the sum; only top k answers contribute.
+ >>> ndcg_score(true_relevance, scores, k=4) # doctest: +ELLIPSIS
+ 0.35...
+ >>> # the normalization takes k into account so a perfect answer
+ >>> # would still get 1.0
+ >>> ndcg_score(true_relevance, true_relevance, k=4) # doctest: +ELLIPSIS
+ 1.0
+ >>> # now we have some ties in our prediction
+ >>> scores = np.asarray([[1, 0, 0, 0, 1]])
+ >>> # by default ties are averaged, so here we get the average (normalized)
+ >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75
+ >>> ndcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS
+ 0.75
+ >>> # we can choose to ignore ties for faster results, but only
+ >>> # if we know there aren't ties in our scores, otherwise we get
+ >>> # wrong results:
+ >>> ndcg_score(true_relevance,
+ ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS
+ 0.5
+
+ """
+ y_true = check_array(y_true, ensure_2d=False)
+ y_score = check_array(y_score, ensure_2d=False)
+ check_consistent_length(y_true, y_score, sample_weight)
+ _check_dcg_target_type(y_true)
+ gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties)
+ return np.average(gain, weights=sample_weight)
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index 519187ac162c7..8b46e10db683b 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -769,7 +769,11 @@ def test_binary_clf_curve_multiclass_error(curve_func):
 curve_func(y_true, y_pred)
 
 
-@pytest.mark.parametrize("curve_func", CURVE_FUNCS)
+@pytest.mark.parametrize("curve_func", [
+ precision_recall_curve,
+ roc_curve,
+ cumulative_gain_curve
+])
 def test_binary_clf_curve_implicit_pos_label(curve_func):
 # Check that using string class labels raises an informative
 # error for any supported string dtype:
diff --git a/sklearn/utils/seq_dataset.pxd b/sklearn/utils/seq_dataset.pxd
new file mode 100644
index 0000000000000..67ce3b68b4474
--- /dev/null
+++ b/sklearn/utils/seq_dataset.pxd
@@ -0,0 +1,116 @@
+
+#------------------------------------------------------------------------------
+
+"""
+Dataset abstractions for sequential data access.
+WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp
+"""
+
+cimport numpy as np
+
+# SequentialDataset and its two concrete subclasses are (optionally randomized)
+# iterators over the rows of a matrix X and corresponding target values y.
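+#
+# The 64-bit and 32-bit variants below differ only in the floating-point
+# dtype used for data, targets and sample weights (double vs. float); both
+# are generated from the same template, per the warning above.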
+ + +cdef class SequentialDataset64: + cdef int current_index + cdef np.ndarray index + cdef int *index_data_ptr + cdef Py_ssize_t n_samples + cdef np.uint32_t seed + + cdef void shuffle(self, np.uint32_t seed) nogil + cdef int _get_next_index(self) nogil + cdef int _get_random_index(self) nogil + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil + cdef void next(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil + cdef int random(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil + + +cdef class ArrayDataset64(SequentialDataset64): + cdef np.ndarray X + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef Py_ssize_t n_features + cdef np.npy_intp X_stride + cdef double *X_data_ptr + cdef double *Y_data_ptr + cdef np.ndarray feature_indices + cdef int *feature_indices_ptr + cdef double *sample_weight_data + + +cdef class CSRDataset64(SequentialDataset64): + cdef np.ndarray X_data + cdef np.ndarray X_indptr + cdef np.ndarray X_indices + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef double *X_data_ptr + cdef int *X_indptr_ptr + cdef int *X_indices_ptr + cdef double *Y_data_ptr + cdef double *sample_weight_data + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp +""" + +cimport numpy as np + +# SequentialDataset and its two concrete subclasses are (optionally randomized) +# iterators over the rows of a matrix X and corresponding target values y. + + +cdef class SequentialDataset32: + cdef int current_index + cdef np.ndarray index + cdef int *index_data_ptr + cdef Py_ssize_t n_samples + cdef np.uint32_t seed + + cdef void shuffle(self, np.uint32_t seed) nogil + cdef int _get_next_index(self) nogil + cdef int _get_random_index(self) nogil + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil + cdef void next(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil + cdef int random(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil + + +cdef class ArrayDataset32(SequentialDataset32): + cdef np.ndarray X + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef Py_ssize_t n_features + cdef np.npy_intp X_stride + cdef float *X_data_ptr + cdef float *Y_data_ptr + cdef np.ndarray feature_indices + cdef int *feature_indices_ptr + cdef float *sample_weight_data + + +cdef class CSRDataset32(SequentialDataset32): + cdef np.ndarray X_data + cdef np.ndarray X_indptr + cdef np.ndarray X_indices + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef float *X_data_ptr + cdef int *X_indptr_ptr + cdef int *X_indices_ptr + cdef float *Y_data_ptr + cdef float *sample_weight_data diff --git a/sklearn/utils/seq_dataset.pyx b/sklearn/utils/seq_dataset.pyx new file mode 100644 index 0000000000000..6fa274771defe --- /dev/null +++ b/sklearn/utils/seq_dataset.pyx @@ -0,0 +1,653 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. 
+WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +""" + +cimport cython +from libc.limits cimport INT_MAX +cimport numpy as np +import numpy as np + +np.import_array() + +from ._random cimport our_rand_r + +cdef class SequentialDataset64: + """Base class for datasets with sequential data access. + + SequentialDataset is used to iterate over the rows of a matrix X and + corresponding target values y, i.e. to iterate over samples. + There are two methods to get the next sample: + - next : Iterate sequentially (optionally randomized) + - random : Iterate randomly (with replacement) + + Attributes + ---------- + index : np.ndarray + Index array for fast shuffling. + + index_data_ptr : int + Pointer to the index array. + + current_index : int + Index of current sample in ``index``. + The index of current sample in the data is given by + index_data_ptr[current_index]. + + n_samples : Py_ssize_t + Number of samples in the dataset. + + seed : np.uint32_t + Seed used for random sampling. + + """ + + cdef void next(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil: + """Get the next example ``x`` from the dataset. + + This method gets the next sample looping sequentially over all samples. + The order can be shuffled with the method ``shuffle``. + Shuffling once before iterating over all samples corresponds to a + random draw without replacement. It is used for instance in SGD solver. + + Parameters + ---------- + x_data_ptr : double** + A pointer to the double array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : double* + The target value of the next example. + + sample_weight : double* + The weight of the next example. + """ + cdef int current_index = self._get_next_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + + cdef int random(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil: + """Get a random example ``x`` from the dataset. + + This method gets next sample chosen randomly over a uniform + distribution. It corresponds to a random draw with replacement. + It is used for instance in SAG solver. + + Parameters + ---------- + x_data_ptr : double** + A pointer to the double array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : double* + The target value of the next example. + + sample_weight : double* + The weight of the next example. + + Returns + ------- + current_index : int + Index of current sample. 
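+
+ Unlike ``next``, which loops over all samples in (possibly shuffled)
+ order, ``random`` draws with replacement, so the same sample may be
+ returned several times within one pass.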
+ """ + cdef int current_index = self._get_random_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + return current_index + + cdef void shuffle(self, np.uint32_t seed) nogil: + """Permutes the ordering of examples.""" + # Fisher-Yates shuffle + cdef int *ind = self.index_data_ptr + cdef int n = self.n_samples + cdef unsigned i, j + for i in range(n - 1): + j = i + our_rand_r(&seed) % (n - i) + ind[i], ind[j] = ind[j], ind[i] + + cdef int _get_next_index(self) nogil: + cdef int current_index = self.current_index + if current_index >= (self.n_samples - 1): + current_index = -1 + + current_index += 1 + self.current_index = current_index + return self.current_index + + cdef int _get_random_index(self) nogil: + cdef int n = self.n_samples + cdef int current_index = our_rand_r(&self.seed) % n + self.current_index = current_index + return current_index + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + pass + + def _shuffle_py(self, np.uint32_t seed): + """python function used for easy testing""" + self.shuffle(seed) + + def _next_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_next_index() + return self._sample_py(current_index) + + def _random_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_random_index() + return self._sample_py(current_index) + + def _sample_py(self, int current_index): + """python function used for easy testing""" + cdef double* x_data_ptr + cdef int* x_indices_ptr + cdef int nnz, j + cdef double y, sample_weight + + # call _sample in cython + self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, + current_index) + + # transform the pointed data in numpy CSR array + cdef np.ndarray[double, ndim=1] x_data = np.empty(nnz, + dtype=np.float64) + cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) + cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], + dtype=np.int32) + + for j in range(nnz): + x_data[j] = x_data_ptr[j] + x_indices[j] = x_indices_ptr[j] + + cdef int sample_idx = self.index_data_ptr[current_index] + + return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + + +cdef class ArrayDataset64(SequentialDataset64): + """Dataset backed by a two-dimensional numpy array. + + The dtype of the numpy array is expected to be ``np.float64`` (double) + and C-style memory layout. + """ + + def __cinit__(self, np.ndarray[double, ndim=2, mode='c'] X, + np.ndarray[double, ndim=1, mode='c'] Y, + np.ndarray[double, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """A ``SequentialDataset`` backed by a two-dimensional numpy array. + + Parameters + ---------- + X : ndarray, dtype=double, ndim=2, mode='c' + The sample array, of shape(n_samples, n_features) + + Y : ndarray, dtype=double, ndim=1, mode='c' + The target array, of shape(n_samples, ) + + sample_weights : ndarray, dtype=double, ndim=1, mode='c' + The weight of each sample, of shape(n_samples,) + """ + if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: + raise ValueError("More than %d samples or features not supported;" + " got (%d, %d)." 
+ % (INT_MAX, X.shape[0], X.shape[1])) + + # keep a reference to the data to prevent garbage collection + self.X = X + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = X.shape[0] + self.n_features = X.shape[1] + + cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ + np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices = feature_indices + self.feature_indices_ptr = feature_indices.data + + self.current_index = -1 + self.X_stride = X.strides[0] / X.itemsize + self.X_data_ptr = X.data + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] index = \ + np.arange(0, self.n_samples, dtype=np.intc) + self.index = index + self.index_data_ptr = index.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = sample_idx * self.X_stride + + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.feature_indices_ptr + nnz[0] = self.n_features + sample_weight[0] = self.sample_weight_data[sample_idx] + + +cdef class CSRDataset64(SequentialDataset64): + """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ + + def __cinit__(self, np.ndarray[double, ndim=1, mode='c'] X_data, + np.ndarray[int, ndim=1, mode='c'] X_indptr, + np.ndarray[int, ndim=1, mode='c'] X_indices, + np.ndarray[double, ndim=1, mode='c'] Y, + np.ndarray[double, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """Dataset backed by a scipy sparse CSR matrix. + + The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. + The corresponding feature values are given by + x_data_ptr[0:nnz]. + + Parameters + ---------- + X_data : ndarray, dtype=double, ndim=1, mode='c' + The data array of the CSR features matrix. + + X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' + The index pointer array of the CSR features matrix. + + X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' + The column indices array of the CSR features matrix. + + Y : ndarray, dtype=double, ndim=1, mode='c' + The target values. + + sample_weights : ndarray, dtype=double, ndim=1, mode='c' + The weight of each sample. 
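+
+ A typical construction (sketch): for a scipy.sparse CSR matrix
+ ``X_csr``, pass ``X_csr.data``, ``X_csr.indptr`` and ``X_csr.indices``
+ (the integer arrays as ``np.intc``) together with the targets and
+ per-sample weights, all as C-contiguous arrays of the dtypes above.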
+ """ + # keep a reference to the data to prevent garbage collection + self.X_data = X_data + self.X_indptr = X_indptr + self.X_indices = X_indices + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = Y.shape[0] + self.current_index = -1 + self.X_data_ptr = X_data.data + self.X_indptr_ptr = X_indptr.data + self.X_indices_ptr = X_indices.data + + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, + dtype=np.intc) + self.index = idx + self.index_data_ptr = idx.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = self.X_indptr_ptr[sample_idx] + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.X_indices_ptr + offset + nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset + sample_weight[0] = self.sample_weight_data[sample_idx] + + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +""" + +cimport cython +from libc.limits cimport INT_MAX +cimport numpy as np +import numpy as np + +np.import_array() + +from ._random cimport our_rand_r + +cdef class SequentialDataset32: + """Base class for datasets with sequential data access. + + SequentialDataset is used to iterate over the rows of a matrix X and + corresponding target values y, i.e. to iterate over samples. + There are two methods to get the next sample: + - next : Iterate sequentially (optionally randomized) + - random : Iterate randomly (with replacement) + + Attributes + ---------- + index : np.ndarray + Index array for fast shuffling. + + index_data_ptr : int + Pointer to the index array. + + current_index : int + Index of current sample in ``index``. + The index of current sample in the data is given by + index_data_ptr[current_index]. + + n_samples : Py_ssize_t + Number of samples in the dataset. + + seed : np.uint32_t + Seed used for random sampling. + + """ + + cdef void next(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil: + """Get the next example ``x`` from the dataset. + + This method gets the next sample looping sequentially over all samples. + The order can be shuffled with the method ``shuffle``. + Shuffling once before iterating over all samples corresponds to a + random draw without replacement. It is used for instance in SGD solver. + + Parameters + ---------- + x_data_ptr : float** + A pointer to the float array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : float* + The target value of the next example. + + sample_weight : float* + The weight of the next example. + """ + cdef int current_index = self._get_next_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + + cdef int random(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil: + """Get a random example ``x`` from the dataset. 
+ + This method gets next sample chosen randomly over a uniform + distribution. It corresponds to a random draw with replacement. + It is used for instance in SAG solver. + + Parameters + ---------- + x_data_ptr : float** + A pointer to the float array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : float* + The target value of the next example. + + sample_weight : float* + The weight of the next example. + + Returns + ------- + current_index : int + Index of current sample. + """ + cdef int current_index = self._get_random_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + return current_index + + cdef void shuffle(self, np.uint32_t seed) nogil: + """Permutes the ordering of examples.""" + # Fisher-Yates shuffle + cdef int *ind = self.index_data_ptr + cdef int n = self.n_samples + cdef unsigned i, j + for i in range(n - 1): + j = i + our_rand_r(&seed) % (n - i) + ind[i], ind[j] = ind[j], ind[i] + + cdef int _get_next_index(self) nogil: + cdef int current_index = self.current_index + if current_index >= (self.n_samples - 1): + current_index = -1 + + current_index += 1 + self.current_index = current_index + return self.current_index + + cdef int _get_random_index(self) nogil: + cdef int n = self.n_samples + cdef int current_index = our_rand_r(&self.seed) % n + self.current_index = current_index + return current_index + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + pass + + def _shuffle_py(self, np.uint32_t seed): + """python function used for easy testing""" + self.shuffle(seed) + + def _next_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_next_index() + return self._sample_py(current_index) + + def _random_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_random_index() + return self._sample_py(current_index) + + def _sample_py(self, int current_index): + """python function used for easy testing""" + cdef float* x_data_ptr + cdef int* x_indices_ptr + cdef int nnz, j + cdef float y, sample_weight + + # call _sample in cython + self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, + current_index) + + # transform the pointed data in numpy CSR array + cdef np.ndarray[float, ndim=1] x_data = np.empty(nnz, + dtype=np.float32) + cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) + cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], + dtype=np.int32) + + for j in range(nnz): + x_data[j] = x_data_ptr[j] + x_indices[j] = x_indices_ptr[j] + + cdef int sample_idx = self.index_data_ptr[current_index] + + return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + + +cdef class ArrayDataset32(SequentialDataset32): + """Dataset backed by a two-dimensional numpy array. + + The dtype of the numpy array is expected to be ``np.float32`` (float) + and C-style memory layout. + """ + + def __cinit__(self, np.ndarray[float, ndim=2, mode='c'] X, + np.ndarray[float, ndim=1, mode='c'] Y, + np.ndarray[float, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """A ``SequentialDataset`` backed by a two-dimensional numpy array. 
+ + Parameters + ---------- + X : ndarray, dtype=float, ndim=2, mode='c' + The sample array, of shape(n_samples, n_features) + + Y : ndarray, dtype=float, ndim=1, mode='c' + The target array, of shape(n_samples, ) + + sample_weights : ndarray, dtype=float, ndim=1, mode='c' + The weight of each sample, of shape(n_samples,) + """ + if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: + raise ValueError("More than %d samples or features not supported;" + " got (%d, %d)." + % (INT_MAX, X.shape[0], X.shape[1])) + + # keep a reference to the data to prevent garbage collection + self.X = X + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = X.shape[0] + self.n_features = X.shape[1] + + cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ + np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices = feature_indices + self.feature_indices_ptr = feature_indices.data + + self.current_index = -1 + self.X_stride = X.strides[0] / X.itemsize + self.X_data_ptr = X.data + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] index = \ + np.arange(0, self.n_samples, dtype=np.intc) + self.index = index + self.index_data_ptr = index.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = sample_idx * self.X_stride + + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.feature_indices_ptr + nnz[0] = self.n_features + sample_weight[0] = self.sample_weight_data[sample_idx] + + +cdef class CSRDataset32(SequentialDataset32): + """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ + + def __cinit__(self, np.ndarray[float, ndim=1, mode='c'] X_data, + np.ndarray[int, ndim=1, mode='c'] X_indptr, + np.ndarray[int, ndim=1, mode='c'] X_indices, + np.ndarray[float, ndim=1, mode='c'] Y, + np.ndarray[float, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """Dataset backed by a scipy sparse CSR matrix. + + The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. + The corresponding feature values are given by + x_data_ptr[0:nnz]. + + Parameters + ---------- + X_data : ndarray, dtype=float, ndim=1, mode='c' + The data array of the CSR features matrix. + + X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' + The index pointer array of the CSR features matrix. + + X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' + The column indices array of the CSR features matrix. + + Y : ndarray, dtype=float, ndim=1, mode='c' + The target values. + + sample_weights : ndarray, dtype=float, ndim=1, mode='c' + The weight of each sample. 
+ """ + # keep a reference to the data to prevent garbage collection + self.X_data = X_data + self.X_indptr = X_indptr + self.X_indices = X_indices + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = Y.shape[0] + self.current_index = -1 + self.X_data_ptr = X_data.data + self.X_indptr_ptr = X_indptr.data + self.X_indices_ptr = X_indices.data + + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, + dtype=np.intc) + self.index = idx + self.index_data_ptr = idx.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = self.X_indptr_ptr[sample_idx] + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.X_indices_ptr + offset + nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset + sample_weight[0] = self.sample_weight_data[sample_idx] + From 6e344a2be1d5ecd1ff6dc1a313dfc65453f8ea76 Mon Sep 17 00:00:00 2001 From: tber16_atl Date: Wed, 22 Dec 2021 10:49:59 +0100 Subject: [PATCH 3/6] Update upstream to the forked repo --- sklearn/metrics/__init__.py | 82 +++++++++++++++++++++++++++++++++++++ sklearn/metrics/_ranking.py | 5 ++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 20cb6ed9404b1..f6295550770e1 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -97,6 +97,7 @@ __all__ = [ +<<<<<<< HEAD "accuracy_score", "adjusted_mutual_info_score", "adjusted_rand_score", @@ -177,4 +178,85 @@ "v_measure_score", "zero_one_loss", "brier_score_loss", +======= + 'accuracy_score', + 'adjusted_mutual_info_score', + 'adjusted_rand_score', + 'auc', + 'average_precision_score', + 'balanced_accuracy_score', + 'calinski_harabasz_score', + 'check_scoring', + 'classification_report', + 'cluster', + 'cohen_kappa_score', + 'completeness_score', + 'ConfusionMatrixDisplay', + 'confusion_matrix', + 'consensus_score', + 'coverage_error', + 'dcg_score', + 'davies_bouldin_score', + 'DetCurveDisplay', + 'det_curve', + 'euclidean_distances', + 'explained_variance_score', + 'f1_score', + 'fbeta_score', + 'fowlkes_mallows_score', + 'get_scorer', + 'hamming_loss', + 'hinge_loss', + 'homogeneity_completeness_v_measure', + 'homogeneity_score', + 'jaccard_score', + 'label_ranking_average_precision_score', + 'label_ranking_loss', + 'log_loss', + 'make_scorer', + 'nan_euclidean_distances', + 'matthews_corrcoef', + 'max_error', + 'mean_absolute_error', + 'mean_squared_error', + 'mean_squared_log_error', + 'mean_pinball_loss', + 'mean_poisson_deviance', + 'mean_gamma_deviance', + 'mean_tweedie_deviance', + 'median_absolute_error', + 'mean_absolute_percentage_error', + 'multilabel_confusion_matrix', + 'mutual_info_score', + 'ndcg_score', + 'normalized_mutual_info_score', + 'pair_confusion_matrix', + 'pairwise_distances', + 'pairwise_distances_argmin', + 'pairwise_distances_argmin_min', + 'pairwise_distances_chunked', + 'pairwise_kernels', + 'plot_confusion_matrix', + 'plot_det_curve', + 'plot_precision_recall_curve', + 'plot_roc_curve', + 'PrecisionRecallDisplay', + 'precision_recall_curve', + 'precision_recall_fscore_support', + 'precision_score', + 'r2_score', + 'rand_score', + 'recall_score', + 
'RocCurveDisplay',
+    'roc_auc_score',
+    'roc_curve',
+    'cumulative_gain_curve',
+    'SCORERS',
+    'silhouette_samples',
+    'silhouette_score',
+    'top_k_accuracy_score',
+    'v_measure_score',
+    'zero_one_loss',
+    'brier_score_loss',
+>>>>>>> fa66ae8c2 (FEA cumulative_gain_curve correct syntax)
 ]
diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index f3734b85ad261..cd2b4c704e701 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1005,9 +1005,11 @@ def roc_curve(
     tpr = tps / tps[-1]

     return fpr, tpr, thresholds

+
+@_deprecate_positional_args
 def cumulative_gain_curve(y_true, y_score, pos_label=None):
-    """This function generates the points necessary to plot the Cumulative Gain for each ten percent of the samples
+    """Compute Cumulative Gain for each ten percent of the samples
     Note: This implementation is restricted to the binary classification task.

     Parameters
@@ -1073,7 +1075,6 @@ def cumulative_gain_curve(y_true, y_score, pos_label=None):
     percentages = np.insert(percentages, 0, [0])

     return percentages, gains
-

 def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None):

From d538225633a54d9f52f68fc7f6aee0463f41d7ac Mon Sep 17 00:00:00 2001
From: Ali TBER
Date: Tue, 2 Mar 2021 09:11:12 +0100
Subject: [PATCH 4/6] Another syntax correction

---
 sklearn/metrics/_ranking.py | 10 +++++-----
 sklearn/metrics/ranking.py  | 13 +++++++------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index cd2b4c704e701..e60f0ab0d2a2e 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1005,13 +1005,13 @@ def roc_curve(
     tpr = tps / tps[-1]

     return fpr, tpr, thresholds
-
+
 @_deprecate_positional_args
 def cumulative_gain_curve(y_true, y_score, pos_label=None):
     """Compute Cumulative Gain for each ten percent of the samples
     Note: This implementation is restricted to the binary classification task.
-
+
     Parameters
     ----------
@@ -1035,10 +1035,10 @@ def cumulative_gain_curve(y_true, y_score, pos_label=None):
     Examples
     --------
     >>> import numpy as np
-    >>> from sklearn import metrics
+    >>> from sklearn.metrics import cumulative_gain_curve
     >>> y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
-    >>> y_pred = [0.1, 0.8, 0.9, 0,3, 0.4, 0.6, 0.6, 0.6, 0.44]
-    >>> percentages, gains = metrics.cumulative_gain_curve(y_true, y_pred, pos_label=1)
+    >>> y_pred = [0.1, 0.8, 0.9, 0.95, 0.4, 0.7, 0.6, 0.55, 0.44, 0.2]
+    >>> percentages, gains = cumulative_gain_curve(y_true, y_pred, pos_label=1)
     >>> percentages
     array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
     >>> gains
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index 73dabf40ab5e5..4889bfc4f3b86 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -787,7 +787,7 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
 def cumulative_gain_curve(y_true, y_score, pos_label=None):
     """Compute Cumulative Gain for each ten percent of the samples
     Note: This implementation is restricted to the binary classification task.
-
+
     Parameters
     ----------
@@ -811,10 +811,10 @@ def cumulative_gain_curve(y_true, y_score, pos_label=None):
     Examples
     --------
     >>> import numpy as np
-    >>> from sklearn import metrics
+    >>> from sklearn.metrics import cumulative_gain_curve
     >>> y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
-    >>> y_pred = [0.1, 0.8, 0.9, 0,3, 0.4, 0.6, 0.6, 0.6, 0.44]
-    >>> percentages, gains = metrics.cumulative_gain_curve(y_true, y_pred, pos_label=1)
+    >>> y_pred = [0.1, 0.8, 0.9, 0.95, 0.4, 0.7, 0.6, 0.55, 0.44, 0.2]
+    >>> percentages, gains = cumulative_gain_curve(y_true, y_pred, pos_label=1)
     >>> percentages
     array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
     >>> gains
@@ -1061,8 +1061,9 @@ def label_ranking_loss(y_true, y_score, sample_weight=None):
     true_at_reversed_rank = np.bincount(
         unique_inverse[y_true.indices[start:stop]],
         minlength=len(unique_scores))
-    all_at_reversed_rank = np.bincount(unique_inverse,
-                                       minlength=len(unique_scores))
+    all_at_reversed_rank = np.bincount(
+        unique_inverse,
+        minlength=len(unique_scores))
     false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank

     # if the scores are ordered, it's possible to count the number of

From 772919fd6cc4ffe45f9521fa45cd162af4f82dea Mon Sep 17 00:00:00 2001
From: tber16_atl
Date: Wed, 22 Dec 2021 11:04:43 +0100
Subject: [PATCH 5/6] Fix linting error

---
 sklearn/metrics/__init__.py | 85 +------------------------------------
 1 file changed, 1 insertion(+), 84 deletions(-)

diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index f6295550770e1..a92f38151b71a 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -97,88 +97,6 @@

 __all__ = [
-<<<<<<< HEAD
-    "accuracy_score",
-    "adjusted_mutual_info_score",
-    "adjusted_rand_score",
-    "auc",
-    "average_precision_score",
-    "balanced_accuracy_score",
-    "calinski_harabasz_score",
-    "check_scoring",
-    "classification_report",
-    "cluster",
-    "cohen_kappa_score",
-    "completeness_score",
-    "ConfusionMatrixDisplay",
-    "confusion_matrix",
-    "consensus_score",
-    "coverage_error",
-    "d2_tweedie_score",
-    "dcg_score",
-    "davies_bouldin_score",
-    "DetCurveDisplay",
-    "det_curve",
-    "DistanceMetric",
-    "euclidean_distances",
-    "explained_variance_score",
-    "f1_score",
-    "fbeta_score",
-    "fowlkes_mallows_score",
-    "get_scorer",
-    "hamming_loss",
-    "hinge_loss",
-    "homogeneity_completeness_v_measure",
-    "homogeneity_score",
-    "jaccard_score",
-    "label_ranking_average_precision_score",
-    "label_ranking_loss",
-    "log_loss",
-    "make_scorer",
-    "nan_euclidean_distances",
-    "matthews_corrcoef",
-    "max_error",
-    "mean_absolute_error",
-    "mean_squared_error",
-    "mean_squared_log_error",
-    "mean_pinball_loss",
-    "mean_poisson_deviance",
-    "mean_gamma_deviance",
-    "mean_tweedie_deviance",
-    "median_absolute_error",
-    "mean_absolute_percentage_error",
-    "multilabel_confusion_matrix",
-    "mutual_info_score",
-    "ndcg_score",
-    "normalized_mutual_info_score",
-    "pair_confusion_matrix",
-    "pairwise_distances",
-    "pairwise_distances_argmin",
-    "pairwise_distances_argmin_min",
-    "pairwise_distances_chunked",
-    "pairwise_kernels",
-    "plot_confusion_matrix",
-    "plot_det_curve",
-    "plot_precision_recall_curve",
-    "plot_roc_curve",
-    "PrecisionRecallDisplay",
-    "precision_recall_curve",
-    "precision_recall_fscore_support",
-    "precision_score",
-    "r2_score",
-    "rand_score",
-    "recall_score",
-    "RocCurveDisplay",
-    "roc_auc_score",
-    "roc_curve",
-    "SCORERS",
-    "silhouette_samples",
-    "silhouette_score",
-    "top_k_accuracy_score",
-    "v_measure_score",
-    "zero_one_loss",
-    "brier_score_loss",
-=======
     'accuracy_score',
     'adjusted_mutual_info_score',
     'adjusted_rand_score',
@@ -195,6 +113,7 @@
     'confusion_matrix',
     'consensus_score',
     'coverage_error',
+    'cumulative_gain_curve',
     'dcg_score',
     'davies_bouldin_score',
     'DetCurveDisplay',
@@ -250,7 +169,6 @@
     'RocCurveDisplay',
     'roc_auc_score',
     'roc_curve',
-    'cumulative_gain_curve',
     'SCORERS',
     'silhouette_samples',
     'silhouette_score',
@@ -258,5 +176,4 @@
     'v_measure_score',
     'zero_one_loss',
     'brier_score_loss',
->>>>>>> fa66ae8c2 (FEA cumulative_gain_curve correct syntax)
 ]

From a9912df17b42eb003a7e41c40423061c864090d4 Mon Sep 17 00:00:00 2001
From: Alit10
Date: Wed, 22 Dec 2021 11:14:10 +0100
Subject: [PATCH 6/6] Fix linting error2

---
 sklearn/metrics/_ranking.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index e60f0ab0d2a2e..27263790b90e4 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1007,7 +1007,6 @@ def roc_curve(
     return fpr, tpr, thresholds

-@_deprecate_positional_args
 def cumulative_gain_curve(y_true, y_score, pos_label=None):
     """Compute Cumulative Gain for each ten percent of the samples
     Note: This implementation is restricted to the binary classification task.
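
Usage sketch (not part of the patch series): assuming the final state of the
series, where cumulative_gain_curve is exported from sklearn.metrics, the
corrected docstring example corresponds to the session below. The data values
are the docstring's; the printed comments are illustrative.

from sklearn.metrics import cumulative_gain_curve

# Four positives among ten samples, as in the docstring example.
y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
y_pred = [0.1, 0.8, 0.9, 0.95, 0.4, 0.7, 0.6, 0.55, 0.44, 0.2]

percentages, gains = cumulative_gain_curve(y_true, y_pred, pos_label=1)

# percentages[i] is the fraction of samples inspected (highest scores first);
# gains[i] is the fraction of all positives captured within that fraction.
print(percentages)  # [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
print(gains)        # [0.   0.   0.25 0.5  0.5  0.75 1.   1.   1.   1.   1.  ]

A random ranking would track the diagonal gains == percentages; the lift of
the curve above that diagonal is what the cumulative gains chart visualizes.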
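
For readers of the seq_dataset.pyx file reintroduced by patch 2: the shuffle
method of SequentialDataset32 shown earlier is a textbook Fisher-Yates pass.
A pure-Python equivalent is sketched below, with random.Random standing in
for the internal our_rand_r generator (the substitution is illustrative only;
the two generators produce different streams).

import random

def fisher_yates_shuffle(index, seed):
    # For each position i, draw j uniformly from [i, n) and swap, which
    # makes every permutation of the index array equally likely.
    rng = random.Random(seed)           # stands in for our_rand_r(&seed)
    n = len(index)
    for i in range(n - 1):
        j = i + rng.randrange(n - i)    # j = i + our_rand_r(&seed) % (n - i)
        index[i], index[j] = index[j], index[i]
    return index

print(fisher_yates_shuffle(list(range(10)), seed=42))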
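
Likewise, the _sample implementations in that file reduce to index arithmetic
over the backing arrays: ArrayDataset32 computes a dense row offset as
sample_idx * X_stride, while CSRDataset32 reads the row extent from X_indptr.
A NumPy sketch of the CSR case follows; the helper name is hypothetical.

import numpy as np
from scipy.sparse import csr_matrix

def sample_row(X_data, X_indptr, X_indices, sample_idx):
    # Mirrors CSRDataset32._sample: offset is where the row starts in
    # X_data/X_indices, and nnz is its number of stored values.  The Cython
    # version hands back raw pointers into the same buffers instead of
    # copying, which is what lets it run inside a nogil solver loop.
    offset = X_indptr[sample_idx]
    nnz = X_indptr[sample_idx + 1] - offset
    return X_data[offset:offset + nnz], X_indices[offset:offset + nnz], nnz

X = csr_matrix(np.array([[1., 0., 2., 0.],
                         [0., 0., 0., 3.],
                         [4., 5., 0., 0.]], dtype=np.float32))
data, indices, nnz = sample_row(X.data, X.indptr, X.indices, sample_idx=2)
print(data, indices, nnz)   # [4. 5.] [0 1] 2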