From 02fecc7a54521a4e493d0f948152c8ecd41b6e00 Mon Sep 17 00:00:00 2001
From: Ali TBER
Date: Mon, 28 Sep 2020 16:16:43 +0200
Subject: [PATCH 1/6] FEA cumulative_gain_curve

---
 sklearn/metrics/_ranking.py           | 69 +++++++++++++++++++++++
 sklearn/metrics/tests/test_ranking.py |  1 +
 2 files changed, 70 insertions(+)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index 36feb4e91a5db..f3734b85ad261 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1006,6 +1006,75 @@ def roc_curve(
     return fpr, tpr, thresholds


+def cumulative_gain_curve(y_true, y_score, pos_label=None):
+    """Compute the points needed to plot a Cumulative Gains chart.
+
+    For each fraction of the highest-scored samples, the curve gives the
+    fraction of all positive samples that fall within that subset.
+
+    Note: this implementation is restricted to the binary classification
+    task.
+
+    Parameters
+    ----------
+    y_true : array-like of shape (n_samples,)
+        True labels of the data.
+
+    y_score : array-like of shape (n_samples,)
+        Target scores, can either be probability estimates of the positive
+        class, confidence values, or non-thresholded measure of decisions
+        (as returned by decision_function on some classifiers).
+
+    pos_label : int or str, default=None
+        The label considered as positive; all other labels are considered
+        negative.
+
+    Returns
+    -------
+    percentages : ndarray of shape (n_samples + 1,)
+        The X-axis values for plotting the Cumulative Gains chart: the
+        fraction of samples considered, starting at 0.
+
+    gains : ndarray of shape (n_samples + 1,)
+        The Y-axis values for one curve of the Cumulative Gains chart: the
+        fraction of positive samples recovered at each percentage.
+
+    Raises
+    ------
+    ValueError
+        If ``y_true`` is not composed of 2 classes and ``pos_label`` is not
+        specified. The Cumulative Gains chart is only relevant in binary
+        classification.
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> from sklearn import metrics
+    >>> y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
+    >>> y_pred = [0.95, 0.9, 0.8, 0.7, 0.4, 0.3, 0.65, 0.6, 0.2, 0.1]
+    >>> percentages, gains = metrics.cumulative_gain_curve(y_true, y_pred, pos_label=1)
+    >>> percentages
+    array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
+    >>> gains
+    array([0.  , 0.  , 0.25, 0.5 , 0.5 , 0.75, 1.  , 1.  , 1.  , 1.  , 1.  ])
+    """
+    y_true, y_score = np.asarray(y_true), np.asarray(y_score)
+
+    # ensure binary classification if pos_label is not specified
+    classes = np.unique(y_true)
+    if (pos_label is None and
+            not (np.array_equal(classes, [0, 1]) or
+                 np.array_equal(classes, [-1, 1]) or
+                 np.array_equal(classes, [0]) or
+                 np.array_equal(classes, [-1]) or
+                 np.array_equal(classes, [1]))):
+        raise ValueError("Data is not binary and pos_label is not specified")
+    elif pos_label is None:
+        pos_label = 1.
+
+    # make y_true a boolean vector
+    y_true = (y_true == pos_label)
+
+    # sort samples by decreasing score and accumulate the positives
+    sorted_indices = np.argsort(y_score)[::-1]
+    y_true = y_true[sorted_indices]
+    gains = np.cumsum(y_true)
+
+    percentages = np.arange(start=1, stop=len(y_true) + 1)
+
+    gains = gains / float(np.sum(y_true))
+    percentages = percentages / float(len(y_true))
+
+    # prepend the origin so the curve starts at (0, 0)
+    gains = np.insert(gains, 0, [0])
+    percentages = np.insert(percentages, 0, [0])
+
+    return percentages, gains
+
+
 def label_ranking_average_precision_score(y_true, y_score, *,
                                           sample_weight=None):
     """Compute ranking-based average precision.
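For reviewers who want to try the new API, here is a minimal usage sketch (not part of the patch): it assumes a build of this branch is installed, and the matplotlib calls are purely illustrative. The diagonal is the expected gain when samples are ranked at random, so the area between the two curves gives a quick sense of ranking quality.

    import matplotlib.pyplot as plt
    from sklearn.metrics import cumulative_gain_curve

    y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
    y_score = [0.95, 0.9, 0.8, 0.7, 0.4, 0.3, 0.65, 0.6, 0.2, 0.1]

    # x: fraction of samples inspected (ranked by decreasing score)
    # y: fraction of all positives captured within that top fraction
    percentages, gains = cumulative_gain_curve(y_true, y_score, pos_label=1)

    plt.plot(percentages, gains, label="model")
    plt.plot([0, 1], [0, 1], "--", label="random baseline")
    plt.xlabel("Fraction of samples")
    plt.ylabel("Gain (fraction of positives)")
    plt.legend()
    plt.show()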
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index 01de37b189733..519187ac162c7 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -25,6 +25,7 @@ from sklearn.metrics import det_curve
 from sklearn.metrics import label_ranking_average_precision_score
 from sklearn.metrics import precision_recall_curve
+from sklearn.metrics import cumulative_gain_curve
 from sklearn.metrics import label_ranking_loss
 from sklearn.metrics import roc_auc_score
 from sklearn.metrics import roc_curve

From 468ad22044d344e79cefe5cc0db45745502ba1c7 Mon Sep 17 00:00:00 2001
From: Ali TBER
Date: Mon, 1 Mar 2021 20:31:45 +0100
Subject: [PATCH 2/6] FEA cumulative_gain_curve

---
 sklearn/linear_model/sag_fast.pyx     | 1357 +++++++++++++++++++++++
 sklearn/metrics/__init__.py           |    1 +
 sklearn/metrics/ranking.py            | 1461 +++++++++++++++++++++++++
 sklearn/metrics/tests/test_ranking.py |    6 +-
 sklearn/utils/seq_dataset.pxd         |  116 ++
 sklearn/utils/seq_dataset.pyx         |  653 +++++++++++
 6 files changed, 3593 insertions(+), 1 deletion(-)
 create mode 100644 sklearn/linear_model/sag_fast.pyx
 create mode 100644 sklearn/metrics/ranking.py
 create mode 100644 sklearn/utils/seq_dataset.pxd
 create mode 100644 sklearn/utils/seq_dataset.pyx

diff --git a/sklearn/linear_model/sag_fast.pyx b/sklearn/linear_model/sag_fast.pyx
new file mode 100644
index 0000000000000..6d48b65bda560
--- /dev/null
+++ b/sklearn/linear_model/sag_fast.pyx
@@ -0,0 +1,1357 @@
+
+#------------------------------------------------------------------------------
+
+# cython: cdivision=True
+# cython: boundscheck=False
+# cython: wraparound=False
+#
+# Authors: Danny Sullivan
+#          Tom Dupre la Tour
+#          Arthur Mensch
+#
+# License: BSD 3 clause
+
+cimport numpy as np
+import numpy as np
+from libc.math cimport fabs, exp, log
+from libc.time cimport time, time_t
+
+from .sgd_fast cimport LossFunction
+from .sgd_fast cimport Log, SquaredLoss
+
+from ..utils.seq_dataset cimport SequentialDataset32, SequentialDataset64
+
+from libc.stdio cimport printf
+
+cdef extern from "sgd_fast_helpers.h":
+    bint skl_isfinite32(float) nogil
+    bint skl_isfinite64(double) nogil
+
+cdef inline double fmax64(double x, double y) nogil:
+    if x > y:
+        return x
+    return y
+
+cdef inline float fmax32(float x, float y) nogil:
+    if x > y:
+        return x
+    return y
+
+cdef double _logsumexp64(double* arr, int n_classes) nogil:
+    """Computes the sum of arr assuming arr is in the log domain.
+
+    Returns log(sum(exp(arr))) while minimizing the possibility of
+    over/underflow.
+    """
+    # Use the max to normalize, as with the log this is what accumulates
+    # the less errors
+    cdef double vmax = arr[0]
+    cdef double out = 0.0
+    cdef int i
+
+    for i in range(1, n_classes):
+        if vmax < arr[i]:
+            vmax = arr[i]
+
+    for i in range(n_classes):
+        out += exp(arr[i] - vmax)
+
+    return log(out) + vmax
+
+cdef float _logsumexp32(float* arr, int n_classes) nogil:
+    """Computes the sum of arr assuming arr is in the log domain.
+
+    Returns log(sum(exp(arr))) while minimizing the possibility of
+    over/underflow.
+    """
+    # Use the max to normalize, as with the log this is what accumulates
+    # the less errors
+    cdef float vmax = arr[0]
+    cdef float out = 0.0
+    cdef int i
+
+    for i in range(1, n_classes):
+        if vmax < arr[i]:
+            vmax = arr[i]
+
+    for i in range(n_classes):
+        out += exp(arr[i] - vmax)
+
+    return log(out) + vmax
+
+cdef class MultinomialLogLoss64:
+    cdef double _loss(self, double* prediction, double y, int n_classes,
+                      double sample_weight) nogil:
+        r"""Multinomial Logistic regression loss.
+
+        The multinomial logistic loss for one sample is:
+        loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction))
+             = sw (logsumexp(prediction) - prediction[y])
+
+        where:
+            prediction = dot(x_sample, weights) + intercept
+            \delta_{y,c} = 1 if (y == c) else 0
+            sw = sample_weight
+
+        Parameters
+        ----------
+        prediction : pointer to a np.ndarray[double] of shape (n_classes,)
+            Prediction of the multinomial classifier, for current sample.
+ + y : double, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded). + + n_classes : integer + Total number of classes. + + sample_weight : double + Weight of current sample. + + Returns + ------- + loss : double + Multinomial loss for current sample. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) + cdef double loss + + # y is the indice of the correct class of current sample. + loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight + return loss + + cdef void _dloss(self, double* prediction, double y, int n_classes, + double sample_weight, double* gradient_ptr) nogil: + r"""Multinomial Logistic regression gradient of the loss. + + The gradient of the multinomial logistic loss with respect to a class c, + and for one sample is: + grad_c = - sw * (p[c] - \delta_{y,c}) + + where: + p[c] = exp(logsumexp(prediction) - prediction[c]) + prediction = dot(sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Note that to obtain the true gradient, this value has to be multiplied + by the sample vector x. + + Parameters + ---------- + prediction : pointer to a np.ndarray[double] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : double, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded) + + n_classes : integer + Total number of classes. + + sample_weight : double + Weight of current sample. + + gradient_ptr : pointer to a np.ndarray[double] of shape (n_classes,) + Gradient vector to be filled. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef double logsumexp_prediction = _logsumexp64(prediction, n_classes) + cdef int class_ind + + for class_ind in range(n_classes): + gradient_ptr[class_ind] = exp(prediction[class_ind] - + logsumexp_prediction) + + # y is the indice of the correct class of current sample. + if class_ind == y: + gradient_ptr[class_ind] -= 1.0 + + gradient_ptr[class_ind] *= sample_weight + + def __reduce__(self): + return MultinomialLogLoss64, () + +cdef class MultinomialLogLoss32: + cdef float _loss(self, float* prediction, float y, int n_classes, + float sample_weight) nogil: + r"""Multinomial Logistic regression loss. + + The multinomial logistic loss for one sample is: + loss = - sw \sum_c \delta_{y,c} (prediction[c] - logsumexp(prediction)) + = sw (logsumexp(prediction) - prediction[y]) + + where: + prediction = dot(x_sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Parameters + ---------- + prediction : pointer to a np.ndarray[float] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : float, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded). + + n_classes : integer + Total number of classes. + + sample_weight : float + Weight of current sample. + + Returns + ------- + loss : float + Multinomial loss for current sample. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) + cdef float loss + + # y is the indice of the correct class of current sample. 
+ loss = (logsumexp_prediction - prediction[int(y)]) * sample_weight + return loss + + cdef void _dloss(self, float* prediction, float y, int n_classes, + float sample_weight, float* gradient_ptr) nogil: + r"""Multinomial Logistic regression gradient of the loss. + + The gradient of the multinomial logistic loss with respect to a class c, + and for one sample is: + grad_c = - sw * (p[c] - \delta_{y,c}) + + where: + p[c] = exp(logsumexp(prediction) - prediction[c]) + prediction = dot(sample, weights) + intercept + \delta_{y,c} = 1 if (y == c) else 0 + sw = sample_weight + + Note that to obtain the true gradient, this value has to be multiplied + by the sample vector x. + + Parameters + ---------- + prediction : pointer to a np.ndarray[float] of shape (n_classes,) + Prediction of the multinomial classifier, for current sample. + + y : float, between 0 and n_classes - 1 + Indice of the correct class for current sample (i.e. label encoded) + + n_classes : integer + Total number of classes. + + sample_weight : float + Weight of current sample. + + gradient_ptr : pointer to a np.ndarray[float] of shape (n_classes,) + Gradient vector to be filled. + + Reference + --------- + Bishop, C. M. (2006). Pattern recognition and machine learning. + Springer. (Chapter 4.3.4) + """ + cdef float logsumexp_prediction = _logsumexp32(prediction, n_classes) + cdef int class_ind + + for class_ind in range(n_classes): + gradient_ptr[class_ind] = exp(prediction[class_ind] - + logsumexp_prediction) + + # y is the indice of the correct class of current sample. + if class_ind == y: + gradient_ptr[class_ind] -= 1.0 + + gradient_ptr[class_ind] *= sample_weight + + def __reduce__(self): + return MultinomialLogLoss32, () + +cdef inline double _soft_thresholding64(double x, double shrinkage) nogil: + return fmax64(x - shrinkage, 0) - fmax64(- x - shrinkage, 0) + +cdef inline float _soft_thresholding32(float x, float shrinkage) nogil: + return fmax32(x - shrinkage, 0) - fmax32(- x - shrinkage, 0) + +def sag64(SequentialDataset64 dataset, + np.ndarray[double, ndim=2, mode='c'] weights_array, + np.ndarray[double, ndim=1, mode='c'] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + np.ndarray[double, ndim=2, mode='c'] sum_gradient_init, + np.ndarray[double, ndim=2, mode='c'] gradient_memory_init, + np.ndarray[bint, ndim=1, mode='c'] seen_init, + int num_seen, + bint fit_intercept, + np.ndarray[double, ndim=1, mode='c'] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + Defazio, A., Bach, F., Lacoste-Julien, S. 
(2014), + SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives + https://arxiv.org/abs/1407.0202 + + """ + # the data pointer for x, the current sample + cdef double *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for curent sample + cdef double y + # the sample weight + cdef double sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef double max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef double max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef double wscale_update = 1.0 - step_size * alpha + + # vector of booleans indicating whether this sample has been seen + cdef bint* seen = seen_init.data + + # helper for cumulative sum + cdef double cum_sum + + # the pointer to the coef_ or weights + cdef double* weights = weights_array.data + # the pointer to the intercept_array + cdef double* intercept = intercept_array.data + + # the pointer to the intercept_sum_gradient + cdef double* intercept_sum_gradient = \ + intercept_sum_gradient_init.data + + # the sum of gradients for each feature + cdef double* sum_gradient = sum_gradient_init.data + # the previously seen gradient for each sample + cdef double* gradient_memory = gradient_memory_init.data + + # the cumulative sums needed for JIT params + cdef np.ndarray[double, ndim=1] cumulative_sums_array = \ + np.empty(n_samples, dtype=np.float64, order="c") + cdef double* cumulative_sums = cumulative_sums_array.data + + # the index for the last time this feature was updated + cdef np.ndarray[int, ndim=1] feature_hist_array = \ + np.zeros(n_features, dtype=np.int32, order="c") + cdef int* feature_hist = feature_hist_array.data + + # the previous weights to use to compute stopping criteria + cdef np.ndarray[double, ndim=2] previous_weights_array = \ + np.zeros((n_features, n_classes), dtype=np.float64, order="c") + cdef double* previous_weights = previous_weights_array.data + + cdef np.ndarray[double, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.float64, order="c") + cdef double* prediction = prediction_array.data + + cdef np.ndarray[double, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.float64, order="c") + cdef double* gradient = gradient_array.data + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef double val + + # Bias correction term in saga + cdef double gradient_correction + + # the scalar used for multiplying z + cdef double wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef np.ndarray[double, ndim=1] cumulative_sums_prox_array + cdef double* cumulative_sums_prox + + cdef bint prox = beta > 0 and saga + + # Loss function to 
optimize + cdef LossFunction loss + # Wether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef MultinomialLogLoss64 multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = MultinomialLogLoss64() + elif loss_function == "log": + loss = Log() + elif loss_function == "squared": + loss = SquaredLoss() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox_array = np.empty(n_samples, + dtype=np.float64, order="c") + cumulative_sums_prox = cumulative_sums_prox_array.data + else: + cumulative_sums_prox = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen[sample_ind] == 0: + num_seen += 1 + seen[sample_ind] = 1 + + # make the weight updates + if sample_itr > 0: + status = lagged_update64(weights, wscale, xnnz, + n_samples, n_classes, + sample_itr, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + x_ind_ptr, + False, + n_iter) + if status == -1: + break + + # find the current prediction + predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss._dloss(prediction, y, n_classes, sample_weight, + gradient) + else: + gradient[0] = loss._dloss(prediction[0], y) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. / num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. 
/ num_seen) + if saga: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not skl_isfinite64(intercept[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights64( + weights, &wscale, n_features, n_samples, n_classes, + sample_itr, cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # we scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + status = scale_weights64(weights, &wscale, n_features, + n_samples, + n_classes, n_samples - 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + + if status == -1: + break + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax64(max_weight, fabs(weights[idx])) + max_change = fmax64(max_change, + fabs(weights[idx] - + previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8f\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +def sag32(SequentialDataset32 dataset, + np.ndarray[float, ndim=2, mode='c'] weights_array, + np.ndarray[float, ndim=1, mode='c'] intercept_array, + int n_samples, + int n_features, + int n_classes, + double tol, + int max_iter, + str loss_function, + double step_size, + double alpha, + double beta, + np.ndarray[float, ndim=2, mode='c'] sum_gradient_init, + np.ndarray[float, ndim=2, mode='c'] gradient_memory_init, + np.ndarray[bint, ndim=1, mode='c'] seen_init, + int num_seen, + bint fit_intercept, + np.ndarray[float, ndim=1, mode='c'] intercept_sum_gradient_init, + double intercept_decay, + bint saga, + bint verbose): + """Stochastic Average Gradient (SAG) and SAGA solvers. + + Used in Ridge and LogisticRegression. + + Reference + --------- + Schmidt, M., Roux, N. L., & Bach, F. (2013). + Minimizing finite sums with the stochastic average gradient + https://hal.inria.fr/hal-00860051/document + (section 4.3) + + Defazio, A., Bach, F., Lacoste-Julien, S. (2014), + SAGA: A Fast Incremental Gradient Method With Support + for Non-Strongly Convex Composite Objectives + https://arxiv.org/abs/1407.0202 + + """ + # the data pointer for x, the current sample + cdef float *x_data_ptr = NULL + # the index pointer for the column of the data + cdef int *x_ind_ptr = NULL + # the number of non-zero features for current sample + cdef int xnnz = -1 + # the label value for current sample + # the label value for curent sample + cdef float y + # the sample weight + cdef float sample_weight + + # helper variable for indexes + cdef int f_idx, s_idx, feature_ind, class_ind, j + # the number of pass through all samples + cdef int n_iter = 0 + # helper to track iterations through samples + cdef int sample_itr + # the index (row number) of the current sample + cdef int sample_ind + + # the maximum change in weights, used to compute stopping criteria + cdef float max_change + # a holder variable for the max weight, used to compute stopping criteria + cdef float max_weight + + # the start time of the fit + cdef time_t start_time + # the end time of the fit + cdef time_t end_time + + # precomputation since the step size does not change in this implementation + cdef float wscale_update = 1.0 - step_size * alpha + + # vector of booleans indicating whether this sample has been seen + cdef bint* seen = seen_init.data + + # helper for cumulative sum + cdef float cum_sum + + # the pointer to the coef_ or weights + cdef float* weights = weights_array.data + # the pointer to the intercept_array + cdef float* intercept = intercept_array.data + + # the pointer to the intercept_sum_gradient + cdef float* intercept_sum_gradient = \ + intercept_sum_gradient_init.data + + # the sum of gradients for each feature + cdef float* sum_gradient = sum_gradient_init.data + # the previously seen gradient for each sample + cdef float* gradient_memory = gradient_memory_init.data + + # the cumulative sums needed for JIT params + cdef np.ndarray[float, ndim=1] cumulative_sums_array = \ + np.empty(n_samples, dtype=np.float32, order="c") + cdef float* cumulative_sums = cumulative_sums_array.data + + # the index for the last time this feature was updated + cdef np.ndarray[int, ndim=1] feature_hist_array = \ + np.zeros(n_features, dtype=np.int32, order="c") + cdef int* feature_hist = 
feature_hist_array.data + + # the previous weights to use to compute stopping criteria + cdef np.ndarray[float, ndim=2] previous_weights_array = \ + np.zeros((n_features, n_classes), dtype=np.float32, order="c") + cdef float* previous_weights = previous_weights_array.data + + cdef np.ndarray[float, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.float32, order="c") + cdef float* prediction = prediction_array.data + + cdef np.ndarray[float, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.float32, order="c") + cdef float* gradient = gradient_array.data + + # Intermediate variable that need declaration since cython cannot infer when templating + cdef float val + + # Bias correction term in saga + cdef float gradient_correction + + # the scalar used for multiplying z + cdef float wscale = 1.0 + + # return value (-1 if an error occurred, 0 otherwise) + cdef int status = 0 + + # the cumulative sums for each iteration for the sparse implementation + cumulative_sums[0] = 0.0 + + # the multipliative scale needed for JIT params + cdef np.ndarray[float, ndim=1] cumulative_sums_prox_array + cdef float* cumulative_sums_prox + + cdef bint prox = beta > 0 and saga + + # Loss function to optimize + cdef LossFunction loss + # Wether the loss function is multinomial + cdef bint multinomial = False + # Multinomial loss function + cdef MultinomialLogLoss32 multiloss + + if loss_function == "multinomial": + multinomial = True + multiloss = MultinomialLogLoss32() + elif loss_function == "log": + loss = Log() + elif loss_function == "squared": + loss = SquaredLoss() + else: + raise ValueError("Invalid loss parameter: got %s instead of " + "one of ('log', 'squared', 'multinomial')" + % loss_function) + + if prox: + cumulative_sums_prox_array = np.empty(n_samples, + dtype=np.float32, order="c") + cumulative_sums_prox = cumulative_sums_prox_array.data + else: + cumulative_sums_prox = NULL + + with nogil: + start_time = time(NULL) + for n_iter in range(max_iter): + for sample_itr in range(n_samples): + # extract a random sample + sample_ind = dataset.random(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # cached index for gradient_memory + s_idx = sample_ind * n_classes + + # update the number of samples seen and the seen array + if seen[sample_ind] == 0: + num_seen += 1 + seen[sample_ind] = 1 + + # make the weight updates + if sample_itr > 0: + status = lagged_update32(weights, wscale, xnnz, + n_samples, n_classes, + sample_itr, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + x_ind_ptr, + False, + n_iter) + if status == -1: + break + + # find the current prediction + predict_sample32(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + if multinomial: + multiloss._dloss(prediction, y, n_classes, sample_weight, + gradient) + else: + gradient[0] = loss._dloss(prediction[0], y) * sample_weight + + # L2 regularization by simply rescaling the weights + wscale *= wscale_update + + # make the updates to the sum of gradients + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + f_idx = feature_ind * n_classes + for class_ind in range(n_classes): + gradient_correction = \ + val * (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + if saga: + weights[f_idx + class_ind] -= \ + (gradient_correction * step_size + * (1 - 1. 
/ num_seen) / wscale) + sum_gradient[f_idx + class_ind] += gradient_correction + + # fit the intercept + if fit_intercept: + for class_ind in range(n_classes): + gradient_correction = (gradient[class_ind] - + gradient_memory[s_idx + class_ind]) + intercept_sum_gradient[class_ind] += gradient_correction + gradient_correction *= step_size * (1. - 1. / num_seen) + if saga: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + gradient_correction + else: + intercept[class_ind] -= \ + (step_size * intercept_sum_gradient[class_ind] / + num_seen * intercept_decay) + + # check to see that the intercept is not inf or NaN + if not skl_isfinite32(intercept[class_ind]): + status = -1 + break + # Break from the n_samples outer loop if an error happened + # in the fit_intercept n_classes inner loop + if status == -1: + break + + # update the gradient memory for this sample + for class_ind in range(n_classes): + gradient_memory[s_idx + class_ind] = gradient[class_ind] + + if sample_itr == 0: + cumulative_sums[0] = step_size / (wscale * num_seen) + if prox: + cumulative_sums_prox[0] = step_size * beta / wscale + else: + cumulative_sums[sample_itr] = \ + (cumulative_sums[sample_itr - 1] + + step_size / (wscale * num_seen)) + if prox: + cumulative_sums_prox[sample_itr] = \ + (cumulative_sums_prox[sample_itr - 1] + + step_size * beta / wscale) + # If wscale gets too small, we need to reset the scale. + if wscale < 1e-9: + if verbose: + with gil: + print("rescaling...") + status = scale_weights32( + weights, &wscale, n_features, n_samples, n_classes, + sample_itr, cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + if status == -1: + break + + # Break from the n_iter outer loop if an error happened in the + # n_samples inner loop + if status == -1: + break + + # we scale the weights every n_samples iterations and reset the + # just-in-time update system for numerical stability. + status = scale_weights32(weights, &wscale, n_features, + n_samples, + n_classes, n_samples - 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, sum_gradient, n_iter) + + if status == -1: + break + # check if the stopping criteria is reached + max_change = 0.0 + max_weight = 0.0 + for idx in range(n_features * n_classes): + max_weight = fmax32(max_weight, fabs(weights[idx])) + max_change = fmax32(max_change, + fabs(weights[idx] - + previous_weights[idx])) + previous_weights[idx] = weights[idx] + if ((max_weight != 0 and max_change / max_weight <= tol) + or max_weight == 0 and max_change == 0): + if verbose: + end_time = time(NULL) + with gil: + print("convergence after %d epochs took %d seconds" % + (n_iter + 1, end_time - start_time)) + break + elif verbose: + printf('Epoch %d, change: %.8f\n', n_iter + 1, + max_change / max_weight) + n_iter += 1 + # We do the error treatment here based on error code in status to avoid + # re-acquiring the GIL within the cython code, which slows the computation + # when the sag/saga solver is used concurrently in multiple Python threads. + if status == -1: + raise ValueError(("Floating-point under-/overflow occurred at epoch" + " #%d. 
Scaling input data with StandardScaler or" + " MinMaxScaler might help.") % n_iter) + + if verbose and n_iter >= max_iter: + end_time = time(NULL) + print(("max_iter reached after %d seconds") % + (end_time - start_time)) + + return num_seen, n_iter + +cdef int scale_weights64(double* weights, double* wscale, + int n_features, + int n_samples, int n_classes, int sample_itr, + double* cumulative_sums, + double* cumulative_sums_prox, + int* feature_hist, + bint prox, + double* sum_gradient, + int n_iter) nogil: + """Scale the weights with wscale for numerical stability. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update64(weights, wscale[0], n_features, + n_samples, n_classes, sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +cdef int scale_weights32(float* weights, float* wscale, + int n_features, + int n_samples, int n_classes, int sample_itr, + float* cumulative_sums, + float* cumulative_sums_prox, + int* feature_hist, + bint prox, + float* sum_gradient, + int n_iter) nogil: + """Scale the weights with wscale for numerical stability. + + wscale = (1 - step_size * alpha) ** (n_iter * n_samples + sample_itr) + can become very small, so we reset it every n_samples iterations to 1.0 for + numerical stability. To be able to scale, we first need to update every + coefficients and reset the just-in-time update system. + This also limits the size of `cumulative_sums`. + """ + + cdef int status + status = lagged_update32(weights, wscale[0], n_features, + n_samples, n_classes, sample_itr + 1, + cumulative_sums, + cumulative_sums_prox, + feature_hist, + prox, + sum_gradient, + NULL, + True, + n_iter) + # if lagged update succeeded, reset wscale to 1.0 + if status == 0: + wscale[0] = 1.0 + return status + +cdef int lagged_update64(double* weights, double wscale, int xnnz, + int n_samples, int n_classes, int sample_itr, + double* cumulative_sums, + double* cumulative_sums_prox, + int* feature_hist, + bint prox, + double* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter) nogil: + """Hard perform the JIT updates for non-zero features of present sample. + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). 
+ """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef double cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not skl_isfinite64(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. + weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding64(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding64(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not skl_isfinite64(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +cdef int lagged_update32(float* weights, float wscale, int xnnz, + int n_samples, int n_classes, int sample_itr, + float* cumulative_sums, + float* cumulative_sums_prox, + int* feature_hist, + bint prox, + float* sum_gradient, + int* x_ind_ptr, + bint reset, + int n_iter) nogil: + """Hard perform the JIT updates for non-zero features of present sample. + The updates that awaits are kept in memory using cumulative_sums, + cumulative_sums_prox, wscale and feature_hist. See original SAGA paper + (Defazio et al. 2014) for details. If reset=True, we also reset wscale to + 1 (this is done at the end of each epoch). 
+ """ + cdef int feature_ind, class_ind, idx, f_idx, lagged_ind, last_update_ind + cdef float cum_sum, grad_step, prox_step, cum_sum_prox + for feature_ind in range(xnnz): + if not reset: + feature_ind = x_ind_ptr[feature_ind] + f_idx = feature_ind * n_classes + + cum_sum = cumulative_sums[sample_itr - 1] + if prox: + cum_sum_prox = cumulative_sums_prox[sample_itr - 1] + if feature_hist[feature_ind] != 0: + cum_sum -= cumulative_sums[feature_hist[feature_ind] - 1] + if prox: + cum_sum_prox -= cumulative_sums_prox[feature_hist[feature_ind] - 1] + if not prox: + for class_ind in range(n_classes): + idx = f_idx + class_ind + weights[idx] -= cum_sum * sum_gradient[idx] + if reset: + weights[idx] *= wscale + if not skl_isfinite32(weights[idx]): + # returning here does not require the gil as the return + # type is a C integer + return -1 + else: + for class_ind in range(n_classes): + idx = f_idx + class_ind + if fabs(sum_gradient[idx] * cum_sum) < cum_sum_prox: + # In this case, we can perform all the gradient steps and + # all the proximal steps in this order, which is more + # efficient than unrolling all the lagged updates. + # Idea taken from scikit-learn-contrib/lightning. + weights[idx] -= cum_sum * sum_gradient[idx] + weights[idx] = _soft_thresholding32(weights[idx], + cum_sum_prox) + else: + last_update_ind = feature_hist[feature_ind] + if last_update_ind == -1: + last_update_ind = sample_itr - 1 + for lagged_ind in range(sample_itr - 1, + last_update_ind - 1, -1): + if lagged_ind > 0: + grad_step = (cumulative_sums[lagged_ind] + - cumulative_sums[lagged_ind - 1]) + prox_step = (cumulative_sums_prox[lagged_ind] + - cumulative_sums_prox[lagged_ind - 1]) + else: + grad_step = cumulative_sums[lagged_ind] + prox_step = cumulative_sums_prox[lagged_ind] + weights[idx] -= sum_gradient[idx] * grad_step + weights[idx] = _soft_thresholding32(weights[idx], + prox_step) + + if reset: + weights[idx] *= wscale + # check to see that the weight is not inf or NaN + if not skl_isfinite32(weights[idx]): + return -1 + if reset: + feature_hist[feature_ind] = sample_itr % n_samples + else: + feature_hist[feature_ind] = sample_itr + + if reset: + cumulative_sums[sample_itr - 1] = 0.0 + if prox: + cumulative_sums_prox[sample_itr - 1] = 0.0 + + return 0 + +cdef void predict_sample64(double* x_data_ptr, int* x_ind_ptr, int xnnz, + double* w_data_ptr, double wscale, + double* intercept, double* prediction, + int n_classes) nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : double + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. 
+ + """ + cdef int feature_ind, class_ind, j + cdef double innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + +cdef void predict_sample32(float* x_data_ptr, int* x_ind_ptr, int xnnz, + float* w_data_ptr, float wscale, + float* intercept, float* prediction, + int n_classes) nogil: + """Compute the prediction given sparse sample x and dense weight w. + + Parameters + ---------- + x_data_ptr : pointer + Pointer to the data of the sample x + + x_ind_ptr : pointer + Pointer to the indices of the sample x + + xnnz : int + Number of non-zero element in the sample x + + w_data_ptr : pointer + Pointer to the data of the weights w + + wscale : float + Scale of the weights w + + intercept : pointer + Pointer to the intercept + + prediction : pointer + Pointer to store the resulting prediction + + n_classes : int + Number of classes in multinomial case. Equals 1 in binary case. + + """ + cdef int feature_ind, class_ind, j + cdef float innerprod + + for class_ind in range(n_classes): + innerprod = 0.0 + # Compute the dot product only on non-zero elements of x + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + innerprod += (w_data_ptr[feature_ind * n_classes + class_ind] * + x_data_ptr[j]) + + prediction[class_ind] = wscale * innerprod + intercept[class_ind] + + + +def _multinomial_grad_loss_all_samples( + SequentialDataset64 dataset, + np.ndarray[double, ndim=2, mode='c'] weights_array, + np.ndarray[double, ndim=1, mode='c'] intercept_array, + int n_samples, int n_features, int n_classes): + """Compute multinomial gradient and loss across all samples. + + Used for testing purpose only. 
+ """ + cdef double* weights = weights_array.data + cdef double* intercept = intercept_array.data + + cdef double *x_data_ptr = NULL + cdef int *x_ind_ptr = NULL + cdef int xnnz = -1 + cdef double y + cdef double sample_weight + + cdef double wscale = 1.0 + cdef int i, j, class_ind, feature_ind + cdef double val + cdef double sum_loss = 0.0 + + cdef MultinomialLogLoss64 multiloss = MultinomialLogLoss64() + + cdef np.ndarray[double, ndim=2] sum_gradient_array = \ + np.zeros((n_features, n_classes), dtype=np.double, order="c") + cdef double* sum_gradient = sum_gradient_array.data + + cdef np.ndarray[double, ndim=1] prediction_array = \ + np.zeros(n_classes, dtype=np.double, order="c") + cdef double* prediction = prediction_array.data + + cdef np.ndarray[double, ndim=1] gradient_array = \ + np.zeros(n_classes, dtype=np.double, order="c") + cdef double* gradient = gradient_array.data + + with nogil: + for i in range(n_samples): + # get next sample on the dataset + dataset.next(&x_data_ptr, &x_ind_ptr, &xnnz, + &y, &sample_weight) + + # prediction of the multinomial classifier for the sample + predict_sample64(x_data_ptr, x_ind_ptr, xnnz, weights, wscale, + intercept, prediction, n_classes) + + # compute the gradient for this sample, given the prediction + multiloss._dloss(prediction, y, n_classes, sample_weight, gradient) + + # compute the loss for this sample, given the prediction + sum_loss += multiloss._loss(prediction, y, n_classes, sample_weight) + + # update the sum of the gradient + for j in range(xnnz): + feature_ind = x_ind_ptr[j] + val = x_data_ptr[j] + for class_ind in range(n_classes): + sum_gradient[feature_ind * n_classes + class_ind] += \ + gradient[class_ind] * val + + return sum_loss, sum_gradient_array diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index e4339229c5b64..20cb6ed9404b1 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -16,6 +16,7 @@ from ._ranking import roc_auc_score from ._ranking import roc_curve from ._ranking import top_k_accuracy_score +from ._ranking import cumulative_gain_curve from ._classification import accuracy_score from ._classification import balanced_accuracy_score diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py new file mode 100644 index 0000000000000..73dabf40ab5e5 --- /dev/null +++ b/sklearn/metrics/ranking.py @@ -0,0 +1,1461 @@ +"""Metrics to assess performance on classification task given scores + +Functions named as ``*_score`` return a scalar value to maximize: the higher +the better + +Function named as ``*_error`` or ``*_loss`` return a scalar value to minimize: +the lower the better +""" + +# Authors: Alexandre Gramfort +# Mathieu Blondel +# Olivier Grisel +# Arnaud Joly +# Jochen Wersdorfer +# Lars Buitinck +# Joel Nothman +# Noel Dawe +# License: BSD 3 clause + + +import warnings +from functools import partial + +import numpy as np +from scipy.sparse import csr_matrix +from scipy.stats import rankdata + +from ..utils import assert_all_finite +from ..utils import check_consistent_length +from ..utils import column_or_1d, check_array +from ..utils.multiclass import type_of_target +from ..utils.extmath import stable_cumsum +from ..utils.sparsefuncs import count_nonzero +from ..exceptions import UndefinedMetricWarning +from ..preprocessing import label_binarize +from ..preprocessing.label import _encode + +from .base import _average_binary_score, _average_multiclass_ovo_score + + +def auc(x, y): + """Compute Area Under the Curve (AUC) using the trapezoidal rule + + 
This is a general function, given points on a curve. For computing the + area under the ROC-curve, see :func:`roc_auc_score`. For an alternative + way to summarize a precision-recall curve, see + :func:`average_precision_score`. + + Parameters + ---------- + x : array, shape = [n] + x coordinates. These must be either monotonic increasing or monotonic + decreasing. + y : array, shape = [n] + y coordinates. + + Returns + ------- + auc : float + + Examples + -------- + >>> import numpy as np + >>> from sklearn import metrics + >>> y = np.array([1, 1, 2, 2]) + >>> pred = np.array([0.1, 0.4, 0.35, 0.8]) + >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2) + >>> metrics.auc(fpr, tpr) + 0.75 + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + average_precision_score : Compute average precision from prediction scores + precision_recall_curve : + Compute precision-recall pairs for different probability thresholds + """ + check_consistent_length(x, y) + x = column_or_1d(x) + y = column_or_1d(y) + + if x.shape[0] < 2: + raise ValueError('At least 2 points are needed to compute' + ' area under curve, but x.shape = %s' % x.shape) + + direction = 1 + dx = np.diff(x) + if np.any(dx < 0): + if np.all(dx <= 0): + direction = -1 + else: + raise ValueError("x is neither increasing nor decreasing " + ": {}.".format(x)) + + area = direction * np.trapz(y, x) + if isinstance(area, np.memmap): + # Reductions such as .sum used internally in np.trapz do not return a + # scalar by default for numpy.memmap instances contrary to + # regular numpy.ndarray instances. + area = area.dtype.type(area) + return area + + +def average_precision_score(y_true, y_score, average="macro", pos_label=1, + sample_weight=None): + """Compute average precision (AP) from prediction scores + + AP summarizes a precision-recall curve as the weighted mean of precisions + achieved at each threshold, with the increase in recall from the previous + threshold used as the weight: + + .. math:: + \\text{AP} = \\sum_n (R_n - R_{n-1}) P_n + + where :math:`P_n` and :math:`R_n` are the precision and recall at the nth + threshold [1]_. This implementation is not interpolated and is different + from computing the area under the precision-recall curve with the + trapezoidal rule, which uses linear interpolation and can be too + optimistic. + + Note: this implementation is restricted to the binary classification task + or multilabel classification task. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True binary labels or binary label indicators. + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). 
+ ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + pos_label : int or str (default=1) + The label of the positive class. Only applied to binary ``y_true``. + For multilabel-indicator ``y_true``, ``pos_label`` is fixed to 1. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + average_precision : float + + References + ---------- + .. [1] `Wikipedia entry for the Average precision + `_ + + See also + -------- + roc_auc_score : Compute the area under the ROC curve + + precision_recall_curve : + Compute precision-recall pairs for different probability thresholds + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import average_precision_score + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> average_precision_score(y_true, y_scores) + 0.83... + + Notes + ----- + .. versionchanged:: 0.19 + Instead of linearly interpolating between operating points, precisions + are weighted by the change in recall since the last operating point. + """ + def _binary_uninterpolated_average_precision( + y_true, y_score, pos_label=1, sample_weight=None): + precision, recall, _ = precision_recall_curve( + y_true, y_score, pos_label=pos_label, sample_weight=sample_weight) + # Return the step function integral + # The following works because the last entry of precision is + # guaranteed to be 1, as returned by precision_recall_curve + return -np.sum(np.diff(recall) * np.array(precision)[:-1]) + + y_type = type_of_target(y_true) + if y_type == "multilabel-indicator" and pos_label != 1: + raise ValueError("Parameter pos_label is fixed to 1 for " + "multilabel-indicator y_true. Do not set " + "pos_label or set pos_label to 1.") + elif y_type == "binary": + present_labels = np.unique(y_true) + if len(present_labels) == 2 and pos_label not in present_labels: + raise ValueError("pos_label=%r is invalid. Set it to a label in " + "y_true." % pos_label) + average_precision = partial(_binary_uninterpolated_average_precision, + pos_label=pos_label) + return _average_binary_score(average_precision, y_true, y_score, + average, sample_weight=sample_weight) + + +def _binary_roc_auc_score(y_true, y_score, sample_weight=None, max_fpr=None): + """Binary roc auc score""" + if len(np.unique(y_true)) != 2: + raise ValueError("Only one class present in y_true. ROC AUC score " + "is not defined in that case.") + + fpr, tpr, _ = roc_curve(y_true, y_score, + sample_weight=sample_weight) + if max_fpr is None or max_fpr == 1: + return auc(fpr, tpr) + if max_fpr <= 0 or max_fpr > 1: + raise ValueError("Expected max_fpr in range (0, 1], got: %r" % max_fpr) + + # Add a single point at max_fpr by linear interpolation + stop = np.searchsorted(fpr, max_fpr, 'right') + x_interp = [fpr[stop - 1], fpr[stop]] + y_interp = [tpr[stop - 1], tpr[stop]] + tpr = np.append(tpr[:stop], np.interp(max_fpr, x_interp, y_interp)) + fpr = np.append(fpr[:stop], max_fpr) + partial_auc = auc(fpr, tpr) + + # McClish correction: standardize result to be 0.5 if non-discriminant + # and 1 if maximal + min_area = 0.5 * max_fpr**2 + max_area = max_fpr + return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) + + +def roc_auc_score(y_true, y_score, average="macro", sample_weight=None, + max_fpr=None, multi_class="raise", labels=None): + """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) + from prediction scores. 
+ + Note: this implementation is restricted to the binary classification task + or multilabel classification task in label indicator format. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array, shape = [n_samples] or [n_samples, n_classes] + True binary labels or binary label indicators. + The multiclass case expects shape = [n_samples] and labels + with values in ``range(n_classes)``. + + y_score : array, shape = [n_samples] or [n_samples, n_classes] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). For binary + y_true, y_score is supposed to be the score of the class with greater + label. The multiclass case expects shape = [n_samples, n_classes] + where the scores correspond to probability estimates. + + average : string, [None, 'micro', 'macro' (default), 'samples', 'weighted'] + If ``None``, the scores for each class are returned. Otherwise, + this determines the type of averaging performed on the data: + Note: multiclass ROC AUC currently only handles the 'macro' and + 'weighted' averages. + + ``'micro'``: + Calculate metrics globally by considering each element of the label + indicator matrix as a label. + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. + ``'weighted'``: + Calculate metrics for each label, and find their average, weighted + by support (the number of true instances for each label). + ``'samples'``: + Calculate metrics for each instance, and find their average. + + Will be ignored when ``y_true`` is binary. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + max_fpr : float > 0 and <= 1, optional + If not ``None``, the standardized partial AUC [3]_ over the range + [0, max_fpr] is returned. For the multiclass case, ``max_fpr``, + should be either equal to ``None`` or ``1.0`` as AUC ROC partial + computation currently is not supported for multiclass. + + multi_class : string, 'ovr' or 'ovo', optional(default='raise') + Determines the type of multiclass configuration to use. + ``multi_class`` must be provided when ``y_true`` is multiclass. + + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + labels : array, shape = [n_classes] or None, optional (default=None) + List of labels to index ``y_score`` used for multiclass. If ``None``, + the lexicon order of ``y_true`` is used to index ``y_score``. + + Returns + ------- + auc : float + + References + ---------- + .. [1] `Wikipedia entry for the Receiver operating characteristic + `_ + + .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition + Letters, 2006, 27(8):861-874. + + .. [3] `Analyzing a portion of the ROC curve. 
McClish, 1989 + `_ + + See also + -------- + average_precision_score : Area under the precision-recall curve + + roc_curve : Compute Receiver operating characteristic (ROC) curve + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import roc_auc_score + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> roc_auc_score(y_true, y_scores) + 0.75 + + """ + + y_type = type_of_target(y_true) + y_true = check_array(y_true, ensure_2d=False, dtype=None) + y_score = check_array(y_score, ensure_2d=False) + + if y_type == "multiclass" or (y_type == "binary" and + y_score.ndim == 2 and + y_score.shape[1] > 2): + # do not support partial ROC computation for multiclass + if max_fpr is not None and max_fpr != 1.: + raise ValueError("Partial AUC computation not available in " + "multiclass setting, 'max_fpr' must be" + " set to `None`, received `max_fpr={0}` " + "instead".format(max_fpr)) + if multi_class == 'raise': + raise ValueError("multi_class must be in ('ovo', 'ovr')") + return _multiclass_roc_auc_score(y_true, y_score, labels, + multi_class, average, sample_weight) + elif y_type == "binary": + labels = np.unique(y_true) + y_true = label_binarize(y_true, labels)[:, 0] + return _average_binary_score(partial(_binary_roc_auc_score, + max_fpr=max_fpr), + y_true, y_score, average, + sample_weight=sample_weight) + else: # multilabel-indicator + return _average_binary_score(partial(_binary_roc_auc_score, + max_fpr=max_fpr), + y_true, y_score, average, + sample_weight=sample_weight) + + +def _multiclass_roc_auc_score(y_true, y_score, labels, + multi_class, average, sample_weight): + """Multiclass roc auc score + + Parameters + ---------- + y_true : array-like, shape = (n_samples, ) + True multiclass labels. + + y_score : array-like, shape = (n_samples, n_classes) + Target scores corresponding to probability estimates of a sample + belonging to a particular class + + labels : array, shape = [n_classes] or None, optional (default=None) + List of labels to index ``y_score`` used for multiclass. If ``None``, + the lexical order of ``y_true`` is used to index ``y_score``. + + multi_class : string, 'ovr' or 'ovo' + Determines the type of multiclass configuration to use. + ``'ovr'``: + Calculate metrics for the multiclass case using the one-vs-rest + approach. + ``'ovo'``: + Calculate metrics for the multiclass case using the one-vs-one + approach. + + average : 'macro' or 'weighted', optional (default='macro') + Determines the type of averaging performed on the pairwise binary + metric scores + ``'macro'``: + Calculate metrics for each label, and find their unweighted + mean. This does not take label imbalance into account. Classes + are assumed to be uniformly distributed. + ``'weighted'``: + Calculate metrics for each label, taking into account the + prevalence of the classes. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + """ + # validation of the input y_score + if not np.allclose(1, y_score.sum(axis=1)): + raise ValueError( + "Target scores need to be probabilities for multiclass " + "roc_auc, i.e. 
they should sum up to 1.0 over classes") + + # validation for multiclass parameter specifications + average_options = ("macro", "weighted") + if average not in average_options: + raise ValueError("average must be one of {0} for " + "multiclass problems".format(average_options)) + + multiclass_options = ("ovo", "ovr") + if multi_class not in multiclass_options: + raise ValueError("multi_class='{0}' is not supported " + "for multiclass ROC AUC, multi_class must be " + "in {1}".format( + multi_class, multiclass_options)) + + if labels is not None: + labels = column_or_1d(labels) + classes = _encode(labels) + if len(classes) != len(labels): + raise ValueError("Parameter 'labels' must be unique") + if not np.array_equal(classes, labels): + raise ValueError("Parameter 'labels' must be ordered") + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of given labels, {0}, not equal to the number " + "of columns in 'y_score', {1}".format( + len(classes), y_score.shape[1])) + if len(np.setdiff1d(y_true, classes)): + raise ValueError( + "'y_true' contains labels not in parameter 'labels'") + else: + classes = _encode(y_true) + if len(classes) != y_score.shape[1]: + raise ValueError( + "Number of classes in y_true not equal to the number of " + "columns in 'y_score'") + + if multi_class == "ovo": + if sample_weight is not None: + raise ValueError("sample_weight is not supported " + "for multiclass one-vs-one ROC AUC, " + "'sample_weight' must be None in this case.") + _, y_true_encoded = _encode(y_true, uniques=classes, encode=True) + # Hand & Till (2001) implementation (ovo) + return _average_multiclass_ovo_score(_binary_roc_auc_score, + y_true_encoded, + y_score, average=average) + else: + # ovr is same as multi-label + y_true_multilabel = label_binarize(y_true, classes) + return _average_binary_score(_binary_roc_auc_score, y_true_multilabel, + y_score, average, + sample_weight=sample_weight) + + +def _binary_clf_curve(y_true, y_score, pos_label=None, sample_weight=None): + """Calculate true and false positives per binary classification threshold. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True targets of binary classification + + y_score : array, shape = [n_samples] + Estimated probabilities or decision function + + pos_label : int or str, default=None + The label of the positive class + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + Returns + ------- + fps : array, shape = [n_thresholds] + A count of false positives, at index i being the number of negative + samples assigned a score >= thresholds[i]. The total number of + negative samples is equal to fps[-1] (thus true negatives are given by + fps[-1] - fps). + + tps : array, shape = [n_thresholds <= len(np.unique(y_score))] + An increasing count of true positives, at index i being the number + of positive samples assigned a score >= thresholds[i]. The total + number of positive samples is equal to tps[-1] (thus false negatives + are given by tps[-1] - tps). + + thresholds : array, shape = [n_thresholds] + Decreasing score values. 
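+
+ For illustration (a sketch, not a doctest; tied scores would merge
+ rows): with ``y_true = [0, 0, 1, 1]`` and
+ ``y_score = [0.1, 0.4, 0.35, 0.8]``, ranking by decreasing score gives
+ ``fps = [0, 1, 1, 2]``, ``tps = [1, 1, 2, 2]`` and
+ ``thresholds = [0.8, 0.4, 0.35, 0.1]``.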
+ """ + # Check to make sure y_true is valid + y_type = type_of_target(y_true) + if not (y_type == "binary" or + (y_type == "multiclass" and pos_label is not None)): + raise ValueError("{0} format is not supported".format(y_type)) + + check_consistent_length(y_true, y_score, sample_weight) + y_true = column_or_1d(y_true) + y_score = column_or_1d(y_score) + assert_all_finite(y_true) + assert_all_finite(y_score) + + if sample_weight is not None: + sample_weight = column_or_1d(sample_weight) + + # ensure binary classification if pos_label is not specified + classes = np.unique(y_true) + if (pos_label is None and + not (np.array_equal(classes, [0, 1]) or + np.array_equal(classes, [-1, 1]) or + np.array_equal(classes, [0]) or + np.array_equal(classes, [-1]) or + np.array_equal(classes, [1]))): + raise ValueError("Data is not binary and pos_label is not specified") + elif pos_label is None: + pos_label = 1. + + # make y_true a boolean vector + y_true = (y_true == pos_label) + + # sort scores and corresponding truth values + desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1] + y_score = y_score[desc_score_indices] + y_true = y_true[desc_score_indices] + if sample_weight is not None: + weight = sample_weight[desc_score_indices] + else: + weight = 1. + + # y_score typically has many tied values. Here we extract + # the indices associated with the distinct values. We also + # concatenate a value for the end of the curve. + distinct_value_indices = np.where(np.diff(y_score))[0] + threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1] + + # accumulate the true positives with decreasing threshold + tps = stable_cumsum(y_true * weight)[threshold_idxs] + if sample_weight is not None: + # express fps as a cumsum to ensure fps is increasing even in + # the presence of floating point errors + fps = stable_cumsum((1 - y_true) * weight)[threshold_idxs] + else: + fps = 1 + threshold_idxs - tps + return fps, tps, y_score[threshold_idxs] + + +def precision_recall_curve(y_true, probas_pred, pos_label=None, + sample_weight=None): + """Compute precision-recall pairs for different probability thresholds + + Note: this implementation is restricted to the binary classification task. + + The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of + true positives and ``fp`` the number of false positives. The precision is + intuitively the ability of the classifier not to label as positive a sample + that is negative. + + The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of + true positives and ``fn`` the number of false negatives. The recall is + intuitively the ability of the classifier to find all the positive samples. + + The last precision and recall values are 1. and 0. respectively and do not + have a corresponding threshold. This ensures that the graph starts on the + y axis. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + y_true : array, shape = [n_samples] + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + probas_pred : array, shape = [n_samples] + Estimated probabilities or decision function. + + pos_label : int or str, default=None + The label of the positive class. + When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. 
+ + Returns + ------- + precision : array, shape = [n_thresholds + 1] + Precision values such that element i is the precision of + predictions with score >= thresholds[i] and the last element is 1. + + recall : array, shape = [n_thresholds + 1] + Decreasing recall values such that element i is the recall of + predictions with score >= thresholds[i] and the last element is 0. + + thresholds : array, shape = [n_thresholds <= len(np.unique(probas_pred))] + Increasing thresholds on the decision function used to compute + precision and recall. + + See also + -------- + average_precision_score : Compute average precision from prediction scores + + roc_curve : Compute Receiver operating characteristic (ROC) curve + + Examples + -------- + >>> import numpy as np + >>> from sklearn.metrics import precision_recall_curve + >>> y_true = np.array([0, 0, 1, 1]) + >>> y_scores = np.array([0.1, 0.4, 0.35, 0.8]) + >>> precision, recall, thresholds = precision_recall_curve( + ... y_true, y_scores) + >>> precision + array([0.66666667, 0.5 , 1. , 1. ]) + >>> recall + array([1. , 0.5, 0.5, 0. ]) + >>> thresholds + array([0.35, 0.4 , 0.8 ]) + + """ + fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred, + pos_label=pos_label, + sample_weight=sample_weight) + + precision = tps / (tps + fps) + precision[np.isnan(precision)] = 0 + recall = tps / tps[-1] + + # stop when full recall attained + # and reverse the outputs so recall is decreasing + last_ind = tps.searchsorted(tps[-1]) + sl = slice(last_ind, None, -1) + return np.r_[precision[sl], 1], np.r_[recall[sl], 0], thresholds[sl] + + +def roc_curve(y_true, y_score, pos_label=None, sample_weight=None, + drop_intermediate=True): + """Compute Receiver operating characteristic (ROC) + + Note: this implementation is restricted to the binary classification task. + + Read more in the :ref:`User Guide `. + + Parameters + ---------- + + y_true : array, shape = [n_samples] + True binary labels. If labels are not either {-1, 1} or {0, 1}, then + pos_label should be explicitly given. + + y_score : array, shape = [n_samples] + Target scores, can either be probability estimates of the positive + class, confidence values, or non-thresholded measure of decisions + (as returned by "decision_function" on some classifiers). + + pos_label : int or str, default=None + The label of the positive class. + When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1}, + ``pos_label`` is set to 1, otherwise an error will be raised. + + sample_weight : array-like of shape = [n_samples], optional + Sample weights. + + drop_intermediate : boolean, optional (default=True) + Whether to drop some suboptimal thresholds which would not appear + on a plotted ROC curve. This is useful in order to create lighter + ROC curves. + + .. versionadded:: 0.17 + parameter *drop_intermediate*. + + Returns + ------- + fpr : array, shape = [>2] + Increasing false positive rates such that element i is the false + positive rate of predictions with score >= thresholds[i]. + + tpr : array, shape = [>2] + Increasing true positive rates such that element i is the true + positive rate of predictions with score >= thresholds[i]. + + thresholds : array, shape = [n_thresholds] + Decreasing thresholds on the decision function used to compute + fpr and tpr. `thresholds[0]` represents no instances being predicted + and is arbitrarily set to `max(y_score) + 1`. 
+
+ See also
+ --------
+ roc_auc_score : Compute the area under the ROC curve
+
+ Notes
+ -----
+ Since the thresholds are sorted from low to high values, they
+ are reversed upon returning them to ensure they correspond to both ``fpr``
+ and ``tpr``, which are sorted in reversed order during their calculation.
+
+ References
+ ----------
+ .. [1] `Wikipedia entry for the Receiver operating characteristic
+ `_
+
+ .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
+ Letters, 2006, 27(8):861-874.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn import metrics
+ >>> y = np.array([1, 1, 2, 2])
+ >>> scores = np.array([0.1, 0.4, 0.35, 0.8])
+ >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
+ >>> fpr
+ array([0. , 0. , 0.5, 0.5, 1. ])
+ >>> tpr
+ array([0. , 0.5, 0.5, 1. , 1. ])
+ >>> thresholds
+ array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ])
+
+ """
+ fps, tps, thresholds = _binary_clf_curve(
+ y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)
+
+ # Attempt to drop thresholds corresponding to points in between and
+ # collinear with other points. These are always suboptimal and do not
+ # appear on a plotted ROC curve (and thus do not affect the AUC).
+ # Here np.diff(_, 2) is used as a "second derivative" to tell if there
+ # is a corner at the point. Both fps and tps must be tested to handle
+ # thresholds with multiple data points (which are combined in
+ # _binary_clf_curve). This keeps all cases where the point should be kept,
+ # but does not drop more complicated cases like fps = [1, 3, 7],
+ # tps = [1, 2, 4]; there is no harm in keeping too many thresholds.
+ if drop_intermediate and len(fps) > 2:
+ optimal_idxs = np.where(np.r_[True,
+ np.logical_or(np.diff(fps, 2),
+ np.diff(tps, 2)),
+ True])[0]
+ fps = fps[optimal_idxs]
+ tps = tps[optimal_idxs]
+ thresholds = thresholds[optimal_idxs]
+
+ # Add an extra threshold position
+ # to make sure that the curve starts at (0, 0)
+ tps = np.r_[0, tps]
+ fps = np.r_[0, fps]
+ thresholds = np.r_[thresholds[0] + 1, thresholds]
+
+ if fps[-1] <= 0:
+ warnings.warn("No negative samples in y_true, "
+ "false positive value should be meaningless",
+ UndefinedMetricWarning)
+ fpr = np.repeat(np.nan, fps.shape)
+ else:
+ fpr = fps / fps[-1]
+
+ if tps[-1] <= 0:
+ warnings.warn("No positive samples in y_true, "
+ "true positive value should be meaningless",
+ UndefinedMetricWarning)
+ tpr = np.repeat(np.nan, tps.shape)
+ else:
+ tpr = tps / tps[-1]
+
+ return fpr, tpr, thresholds
+
+
+def cumulative_gain_curve(y_true, y_score, pos_label=None):
+ """Compute the cumulative gain curve.
+ Note: This implementation is restricted to the binary classification task.
+
+ Parameters
+ ----------
+ y_true : array-like, shape = [n_samples]
+ True labels of the data.
+
+ y_score : array-like, shape = [n_samples]
+ Target scores, can either be probability estimates of the positive
+ class, confidence values, or non-thresholded measure of decisions
+ (as returned by "decision_function" on some classifiers).
+
+ pos_label : int or str, default=None
+ Label considered as positive; all other samples are considered
+ negative.
+
+ Returns
+ -------
+ percentages : numpy.ndarray
+ An array containing the X-axis values for plotting the Cumulative
+ Gains chart, i.e. the increasing fractions of samples considered.
+
+ gains : numpy.ndarray
+ An array containing the Y-axis values for one curve of the
+ Cumulative Gains chart, i.e. the fraction of positive samples
+ recovered at each percentage.
+
+ Raises
+ ------
+ ValueError
+ If `y_true` is not composed of 2 classes. The Cumulative Gain
+ Chart is only relevant in binary classification.
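+
+ Notes
+ -----
+ Samples are sorted by decreasing ``y_score``; ``gains[i]`` is the
+ fraction of all positive samples found among the ``percentages[i]``
+ highest-scored samples. The point ``(0, 0)`` is prepended so that the
+ curve starts at the origin.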
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn import metrics
+ >>> y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
+ >>> y_pred = [0.1, 0.8, 0.9, 0, 3, 0.4, 0.6, 0.6, 0.6, 0.44]
+ >>> percentages, gains = metrics.cumulative_gain_curve(y_true, y_pred, pos_label=1)
+ >>> percentages
+ array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
+ >>> gains
+ array([0. , 0. , 0.25, 0.5 , 0.5 , 0.75, 1. , 1. , 1. , 1. , 1. ])
+
+ """
+ y_true, y_score = np.asarray(y_true), np.asarray(y_score)
+
+ # ensure binary classification if pos_label is not specified
+ classes = np.unique(y_true)
+ if (pos_label is None and
+ not (np.array_equal(classes, [0, 1]) or
+ np.array_equal(classes, [-1, 1]) or
+ np.array_equal(classes, [0]) or
+ np.array_equal(classes, [-1]) or
+ np.array_equal(classes, [1]))):
+ raise ValueError("Data is not binary and pos_label is not specified")
+ elif pos_label is None:
+ pos_label = 1.
+
+ # make y_true a boolean vector
+ y_true = (y_true == pos_label)
+
+ # sort by decreasing score; a stable sort keeps the tie handling, and
+ # hence the documented example output, deterministic
+ sorted_indices = np.argsort(y_score, kind="mergesort")[::-1]
+ y_true = y_true[sorted_indices]
+ gains = np.cumsum(y_true)
+
+ percentages = np.arange(start=1, stop=len(y_true) + 1)
+
+ gains = gains / float(np.sum(y_true))
+ percentages = percentages / float(len(y_true))
+
+ gains = np.insert(gains, 0, [0])
+ percentages = np.insert(percentages, 0, [0])
+
+ return percentages, gains
+
+
+def label_ranking_average_precision_score(y_true, y_score, sample_weight=None):
+ """Compute ranking-based average precision
+
+ Label ranking average precision (LRAP) is the average over each ground
+ truth label assigned to each sample, of the ratio of true vs. total
+ labels with lower score.
+
+ This metric is used in multilabel ranking problems, where the goal
+ is to give a better rank to the labels associated with each sample.
+
+ The obtained score is always strictly greater than 0 and
+ the best value is 1.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ y_true : array or sparse matrix, shape = [n_samples, n_labels]
+ True binary labels in binary indicator format.
+
+ y_score : array, shape = [n_samples, n_labels]
+ Target scores, can either be probability estimates of the positive
+ class, confidence values, or non-thresholded measure of decisions
+ (as returned by "decision_function" on some classifiers).
+
+ sample_weight : array-like of shape = [n_samples], optional
+ Sample weights.
+
+ Returns
+ -------
+ score : float
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.metrics import label_ranking_average_precision_score
+ >>> y_true = np.array([[1, 0, 0], [0, 0, 1]])
+ >>> y_score = np.array([[0.75, 0.5, 1], [1, 0.2, 0.1]])
+ >>> label_ranking_average_precision_score(y_true, y_score)
+ 0.416...
+
+ """
+ check_consistent_length(y_true, y_score, sample_weight)
+ y_true = check_array(y_true, ensure_2d=False)
+ y_score = check_array(y_score, ensure_2d=False)
+
+ if y_true.shape != y_score.shape:
+ raise ValueError("y_true and y_score have different shape")
+
+ # Handle badly formatted array and the degenerate case with one label
+ y_type = type_of_target(y_true)
+ if (y_type != "multilabel-indicator" and
+ not (y_type == "binary" and y_true.ndim == 2)):
+ raise ValueError("{0} format is not supported".format(y_type))
+
+ y_true = csr_matrix(y_true)
+ y_score = -y_score
+
+ n_samples, n_labels = y_true.shape
+
+ out = 0.
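+ # For each sample, average over its relevant labels the ratio
+ # (number of relevant labels ranked at least as high) /
+ # (number of labels ranked at least as high), i.e. the precision at
+ # each relevant label; ``out`` accumulates the (weighted) sample scores.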
+
+ for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):
+ relevant = y_true.indices[start:stop]
+
+ if (relevant.size == 0 or relevant.size == n_labels):
+ # If all labels are relevant or irrelevant, the score is also
+ # equal to 1. The label ranking has no meaning.
+ aux = 1.
+ else:
+ scores_i = y_score[i]
+ rank = rankdata(scores_i, 'max')[relevant]
+ L = rankdata(scores_i[relevant], 'max')
+ aux = (L / rank).mean()
+
+ if sample_weight is not None:
+ aux = aux * sample_weight[i]
+ out += aux
+
+ if sample_weight is None:
+ out /= n_samples
+ else:
+ out /= np.sum(sample_weight)
+
+ return out
+
+
+def coverage_error(y_true, y_score, sample_weight=None):
+ """Coverage error measure
+
+ Compute how far we need to go through the ranked scores to cover all
+ true labels. The best value is equal to the average number
+ of labels in ``y_true`` per sample.
+
+ Ties in ``y_score`` are broken by giving the maximal rank that would have
+ been assigned to all tied values.
+
+ Note: Our implementation's score is 1 greater than the one given in
+ Tsoumakas et al., 2010. This extends it to handle the degenerate case
+ in which an instance has 0 true labels.
+
+ Read more in the :ref:`User Guide `.
+
+ Parameters
+ ----------
+ y_true : array, shape = [n_samples, n_labels]
+ True binary labels in binary indicator format.
+
+ y_score : array, shape = [n_samples, n_labels]
+ Target scores, can either be probability estimates of the positive
+ class, confidence values, or non-thresholded measure of decisions
+ (as returned by "decision_function" on some classifiers).
+
+ sample_weight : array-like of shape = [n_samples], optional
+ Sample weights.
+
+ Returns
+ -------
+ coverage_error : float
+
+ References
+ ----------
+ .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
+ Mining multi-label data. In Data mining and knowledge discovery
+ handbook (pp. 667-685). Springer US.
+
+ """
+ y_true = check_array(y_true, ensure_2d=False)
+ y_score = check_array(y_score, ensure_2d=False)
+ check_consistent_length(y_true, y_score, sample_weight)
+
+ y_type = type_of_target(y_true)
+ if y_type != "multilabel-indicator":
+ raise ValueError("{0} format is not supported".format(y_type))
+
+ if y_true.shape != y_score.shape:
+ raise ValueError("y_true and y_score have different shape")
+
+ y_score_mask = np.ma.masked_array(y_score, mask=np.logical_not(y_true))
+ y_min_relevant = y_score_mask.min(axis=1).reshape((-1, 1))
+ coverage = (y_score >= y_min_relevant).sum(axis=1)
+ coverage = coverage.filled(0)
+
+ return np.average(coverage, weights=sample_weight)
+
+
+def label_ranking_loss(y_true, y_score, sample_weight=None):
+ """Compute Ranking loss measure
+
+ Compute the average number of label pairs that are incorrectly ordered
+ given y_score weighted by the size of the label set and the number of
+ labels not in the label set.
+
+ This is similar to the error set size, but weighted by the number of
+ relevant and irrelevant labels. The best performance is achieved with
+ a ranking loss of zero.
+
+ Read more in the :ref:`User Guide `.
+
+ .. versionadded:: 0.17
+ A function *label_ranking_loss*
+
+ Parameters
+ ----------
+ y_true : array or sparse matrix, shape = [n_samples, n_labels]
+ True binary labels in binary indicator format.
+
+ y_score : array, shape = [n_samples, n_labels]
+ Target scores, can either be probability estimates of the positive
+ class, confidence values, or non-thresholded measure of decisions
+ (as returned by "decision_function" on some classifiers).
+
+ sample_weight : array-like of shape = [n_samples], optional
+ Sample weights.
+
+ Returns
+ -------
+ loss : float
+
+ References
+ ----------
+ .. [1] Tsoumakas, G., Katakis, I., & Vlahavas, I. (2010).
+ Mining multi-label data. In Data mining and knowledge discovery
+ handbook (pp. 667-685). Springer US.
+
+ """
+ y_true = check_array(y_true, ensure_2d=False, accept_sparse='csr')
+ y_score = check_array(y_score, ensure_2d=False)
+ check_consistent_length(y_true, y_score, sample_weight)
+
+ y_type = type_of_target(y_true)
+ if y_type not in ("multilabel-indicator",):
+ raise ValueError("{0} format is not supported".format(y_type))
+
+ if y_true.shape != y_score.shape:
+ raise ValueError("y_true and y_score have different shape")
+
+ n_samples, n_labels = y_true.shape
+
+ y_true = csr_matrix(y_true)
+
+ loss = np.zeros(n_samples)
+ for i, (start, stop) in enumerate(zip(y_true.indptr, y_true.indptr[1:])):
+ # Sort and bin the label scores
+ unique_scores, unique_inverse = np.unique(y_score[i],
+ return_inverse=True)
+ true_at_reversed_rank = np.bincount(
+ unique_inverse[y_true.indices[start:stop]],
+ minlength=len(unique_scores))
+ all_at_reversed_rank = np.bincount(unique_inverse,
+ minlength=len(unique_scores))
+ false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank
+
+ # if the scores are ordered, it's possible to count the number of
+ # incorrectly ordered pairs in linear time by cumulatively counting
+ # how many false labels of a given score have a score higher than the
+ # accumulated true labels with lower score.
+ loss[i] = np.dot(true_at_reversed_rank.cumsum(),
+ false_at_reversed_rank)
+
+ n_positives = count_nonzero(y_true, axis=1)
+ with np.errstate(divide="ignore", invalid="ignore"):
+ loss /= ((n_labels - n_positives) * n_positives)
+
+ # When there are no positive or no negative labels, those values should
+ # be considered as correct, i.e. the ranking doesn't matter.
+ loss[np.logical_or(n_positives == 0, n_positives == n_labels)] = 0.
+
+ return np.average(loss, weights=sample_weight)
+
+
+def _dcg_sample_scores(y_true, y_score, k=None,
+ log_base=2, ignore_ties=False):
+ """Compute Discounted Cumulative Gain.
+
+ Sum the true scores ranked in the order induced by the predicted scores,
+ after applying a logarithmic discount.
+
+ This ranking metric yields a high value if true labels are ranked high by
+ ``y_score``.
+
+ Parameters
+ ----------
+ y_true : ndarray, shape (n_samples, n_labels)
+ True targets of multilabel classification, or true scores of entities
+ to be ranked.
+
+ y_score : ndarray, shape (n_samples, n_labels)
+ Target scores, can either be probability estimates, confidence values,
+ or non-thresholded measure of decisions (as returned by
+ "decision_function" on some classifiers).
+
+ k : int, optional (default=None)
+ Only consider the highest k scores in the ranking. If None, use all
+ outputs.
+
+ log_base : float, optional (default=2)
+ Base of the logarithm used for the discount. A low value means a
+ sharper discount (top results are more important).
+
+ ignore_ties : bool, optional (default=False)
+ Assume that there are no ties in y_score (which is likely to be the
+ case if y_score is continuous) for efficiency gains.
+
+ Returns
+ -------
+ discounted_cumulative_gain : ndarray, shape (n_samples,)
+ The DCG score for each sample.
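+
+ With ``r_i`` the true gain of the item ranked at (1-based) position
+ ``i`` by ``y_score``, each score is ``sum_i r_i / log_b(i + 1)`` with
+ ``b = log_base``, the sum running over the top ``k`` positions when
+ ``k`` is given (ties averaged as described in ``_tie_averaged_dcg``).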
+ + See also + -------- + ndcg_score : + The Discounted Cumulative Gain divided by the Ideal Discounted + Cumulative Gain (the DCG obtained for a perfect ranking), in order to + have a score between 0 and 1. + + """ + discount = 1 / (np.log(np.arange(y_true.shape[1]) + 2) / np.log(log_base)) + if k is not None: + discount[k:] = 0 + if ignore_ties: + ranking = np.argsort(y_score)[:, ::-1] + ranked = y_true[np.arange(ranking.shape[0])[:, np.newaxis], ranking] + cumulative_gains = discount.dot(ranked.T) + else: + discount_cumsum = np.cumsum(discount) + cumulative_gains = [_tie_averaged_dcg(y_t, y_s, discount_cumsum) + for y_t, y_s in zip(y_true, y_score)] + cumulative_gains = np.asarray(cumulative_gains) + return cumulative_gains + + +def _tie_averaged_dcg(y_true, y_score, discount_cumsum): + """ + Compute DCG by averaging over possible permutations of ties. + + The gain (`y_true`) of an index falling inside a tied group (in the order + induced by `y_score`) is replaced by the average gain within this group. + The discounted gain for a tied group is then the average `y_true` within + this group times the sum of discounts of the corresponding ranks. + + This amounts to averaging scores for all possible orderings of the tied + groups. + + (note in the case of dcg@k the discount is 0 after index k) + + Parameters + ---------- + y_true : ndarray + The true relevance scores + + y_score : ndarray + Predicted scores + + discount_cumsum : ndarray + Precomputed cumulative sum of the discounts. + + Returns + ------- + The discounted cumulative gain. + + References + ---------- + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. + + """ + _, inv, counts = np.unique( + - y_score, return_inverse=True, return_counts=True) + ranked = np.zeros(len(counts)) + np.add.at(ranked, inv, y_true) + ranked /= counts + groups = np.cumsum(counts) - 1 + discount_sums = np.empty(len(counts)) + discount_sums[0] = discount_cumsum[groups[0]] + discount_sums[1:] = np.diff(discount_cumsum[groups]) + return (ranked * discount_sums).sum() + + +def _check_dcg_target_type(y_true): + y_type = type_of_target(y_true) + supported_fmt = ("multilabel-indicator", "continuous-multioutput", + "multiclass-multioutput") + if y_type not in supported_fmt: + raise ValueError( + "Only {} formats are supported. Got {} instead".format( + supported_fmt, y_type)) + + +def dcg_score(y_true, y_score, k=None, + log_base=2, sample_weight=None, ignore_ties=False): + """Compute Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. + + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Usually the Normalized Discounted Cumulative Gain (NDCG, computed by + ndcg_score) is preferred. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. + + y_score : ndarray, shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, optional (default=None) + Only consider the highest k scores in the ranking. If None, use all + outputs. 
+
+ log_base : float, optional (default=2)
+ Base of the logarithm used for the discount. A low value means a
+ sharper discount (top results are more important).
+
+ sample_weight : ndarray, shape (n_samples,), optional (default=None)
+ Sample weights. If None, all samples are given the same weight.
+
+ ignore_ties : bool, optional (default=False)
+ Assume that there are no ties in y_score (which is likely to be the
+ case if y_score is continuous) for efficiency gains.
+
+ Returns
+ -------
+ discounted_cumulative_gain : float
+ The averaged sample DCG scores.
+
+ See also
+ --------
+ ndcg_score :
+ The Discounted Cumulative Gain divided by the Ideal Discounted
+ Cumulative Gain (the DCG obtained for a perfect ranking), in order to
+ have a score between 0 and 1.
+
+ References
+ ----------
+ `Wikipedia entry for Discounted Cumulative Gain
+ `_
+
+ Jarvelin, K., & Kekalainen, J. (2002).
+ Cumulated gain-based evaluation of IR techniques. ACM Transactions on
+ Information Systems (TOIS), 20(4), 422-446.
+
+ Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May).
+ A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th
+ Annual Conference on Learning Theory (COLT 2013)
+
+ McSherry, F., & Najork, M. (2008, March). Computing information retrieval
+ performance measures efficiently in the presence of tied scores. In
+ European conference on information retrieval (pp. 414-421). Springer,
+ Berlin, Heidelberg.
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.metrics import dcg_score
+ >>> # we have ground-truth relevance of some answers to a query:
+ >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])
+ >>> # we predict scores for the answers
+ >>> scores = np.asarray([[.1, .2, .3, 4, 70]])
+ >>> dcg_score(true_relevance, scores) # doctest: +ELLIPSIS
+ 9.49...
+ >>> # we can set k to truncate the sum; only top k answers contribute
+ >>> dcg_score(true_relevance, scores, k=2) # doctest: +ELLIPSIS
+ 5.63...
+ >>> # now we have some ties in our prediction
+ >>> scores = np.asarray([[1, 0, 0, 0, 1]])
+ >>> # by default ties are averaged, so here we get the average true
+ >>> # relevance of our top predictions: (10 + 5) / 2 = 7.5
+ >>> dcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS
+ 7.5
+ >>> # we can choose to ignore ties for faster results, but only
+ >>> # if we know there aren't ties in our scores, otherwise we get
+ >>> # wrong results:
+ >>> dcg_score(true_relevance,
+ ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS
+ 5.0
+
+ """
+ y_true = check_array(y_true, ensure_2d=False)
+ y_score = check_array(y_score, ensure_2d=False)
+ check_consistent_length(y_true, y_score, sample_weight)
+ _check_dcg_target_type(y_true)
+ return np.average(
+ _dcg_sample_scores(
+ y_true, y_score, k=k, log_base=log_base,
+ ignore_ties=ignore_ties),
+ weights=sample_weight)
+
+
+def _ndcg_sample_scores(y_true, y_score, k=None, ignore_ties=False):
+ """Compute Normalized Discounted Cumulative Gain.
+
+ Sum the true scores ranked in the order induced by the predicted scores,
+ after applying a logarithmic discount. Then divide by the best possible
+ score (Ideal DCG, obtained for a perfect ranking) to obtain a score between
+ 0 and 1.
+
+ This ranking metric yields a high value if true labels are ranked high by
+ ``y_score``.
+
+ Parameters
+ ----------
+ y_true : ndarray, shape (n_samples, n_labels)
+ True targets of multilabel classification, or true scores of entities
+ to be ranked.
+ + y_score : ndarray, shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, optional (default=None) + Only consider the highest k scores in the ranking. If None, use all + outputs. + + ignore_ties : bool, optional (default=False) + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + normalized_discounted_cumulative_gain : ndarray, shape (n_samples,) + The NDCG score for each sample (float in [0., 1.]). + + See also + -------- + dcg_score : Discounted Cumulative Gain (not normalized). + + """ + gain = _dcg_sample_scores(y_true, y_score, k, ignore_ties=ignore_ties) + # Here we use the order induced by y_true so we can ignore ties since + # the gain associated to tied indices is the same (permuting ties doesn't + # change the value of the re-ordered y_true) + normalizing_gain = _dcg_sample_scores(y_true, y_true, k, ignore_ties=True) + all_irrelevant = normalizing_gain == 0 + gain[all_irrelevant] = 0 + gain[~all_irrelevant] /= normalizing_gain[~all_irrelevant] + return gain + + +def ndcg_score(y_true, y_score, k=None, sample_weight=None, ignore_ties=False): + """Compute Normalized Discounted Cumulative Gain. + + Sum the true scores ranked in the order induced by the predicted scores, + after applying a logarithmic discount. Then divide by the best possible + score (Ideal DCG, obtained for a perfect ranking) to obtain a score between + 0 and 1. + + This ranking metric yields a high value if true labels are ranked high by + ``y_score``. + + Parameters + ---------- + y_true : ndarray, shape (n_samples, n_labels) + True targets of multilabel classification, or true scores of entities + to be ranked. + + y_score : ndarray, shape (n_samples, n_labels) + Target scores, can either be probability estimates, confidence values, + or non-thresholded measure of decisions (as returned by + "decision_function" on some classifiers). + + k : int, optional (default=None) + Only consider the highest k scores in the ranking. If None, use all + outputs. + + sample_weight : ndarray, shape (n_samples,), optional (default=None) + Sample weights. If None, all samples are given the same weight. + + ignore_ties : bool, optional (default=False) + Assume that there are no ties in y_score (which is likely to be the + case if y_score is continuous) for efficiency gains. + + Returns + ------- + normalized_discounted_cumulative_gain : float in [0., 1.] + The averaged NDCG scores for all samples. + + See also + -------- + dcg_score : Discounted Cumulative Gain (not normalized). + + References + ---------- + `Wikipedia entry for Discounted Cumulative Gain + `_ + + Jarvelin, K., & Kekalainen, J. (2002). + Cumulated gain-based evaluation of IR techniques. ACM Transactions on + Information Systems (TOIS), 20(4), 422-446. + + Wang, Y., Wang, L., Li, Y., He, D., Chen, W., & Liu, T. Y. (2013, May). + A theoretical analysis of NDCG ranking measures. In Proceedings of the 26th + Annual Conference on Learning Theory (COLT 2013) + + McSherry, F., & Najork, M. (2008, March). Computing information retrieval + performance measures efficiently in the presence of tied scores. In + European conference on information retrieval (pp. 414-421). Springer, + Berlin, Heidelberg. 
+
+ Examples
+ --------
+ >>> import numpy as np
+ >>> from sklearn.metrics import ndcg_score
+ >>> # we have ground-truth relevance of some answers to a query:
+ >>> true_relevance = np.asarray([[10, 0, 0, 1, 5]])
+ >>> # we predict some scores (relevance) for the answers
+ >>> scores = np.asarray([[.1, .2, .3, 4, 70]])
+ >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS
+ 0.69...
+ >>> scores = np.asarray([[.05, 1.1, 1., .5, .0]])
+ >>> ndcg_score(true_relevance, scores) # doctest: +ELLIPSIS
+ 0.49...
+ >>> # we can set k to truncate the sum; only top k answers contribute.
+ >>> ndcg_score(true_relevance, scores, k=4) # doctest: +ELLIPSIS
+ 0.35...
+ >>> # the normalization takes k into account so a perfect answer
+ >>> # would still get 1.0
+ >>> ndcg_score(true_relevance, true_relevance, k=4) # doctest: +ELLIPSIS
+ 1.0
+ >>> # now we have some ties in our prediction
+ >>> scores = np.asarray([[1, 0, 0, 0, 1]])
+ >>> # by default ties are averaged, so here we get the average (normalized)
+ >>> # true relevance of our top predictions: (10 / 10 + 5 / 10) / 2 = .75
+ >>> ndcg_score(true_relevance, scores, k=1) # doctest: +ELLIPSIS
+ 0.75
+ >>> # we can choose to ignore ties for faster results, but only
+ >>> # if we know there aren't ties in our scores, otherwise we get
+ >>> # wrong results:
+ >>> ndcg_score(true_relevance,
+ ... scores, k=1, ignore_ties=True) # doctest: +ELLIPSIS
+ 0.5
+
+ """
+ y_true = check_array(y_true, ensure_2d=False)
+ y_score = check_array(y_score, ensure_2d=False)
+ check_consistent_length(y_true, y_score, sample_weight)
+ _check_dcg_target_type(y_true)
+ gain = _ndcg_sample_scores(y_true, y_score, k=k, ignore_ties=ignore_ties)
+ return np.average(gain, weights=sample_weight)
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py
index 519187ac162c7..8b46e10db683b 100644
--- a/sklearn/metrics/tests/test_ranking.py
+++ b/sklearn/metrics/tests/test_ranking.py
@@ -769,7 +769,11 @@ def test_binary_clf_curve_multiclass_error(curve_func):
 curve_func(y_true, y_pred)
 
 
-@pytest.mark.parametrize("curve_func", CURVE_FUNCS)
+@pytest.mark.parametrize("curve_func", [
+ precision_recall_curve,
+ roc_curve,
+ cumulative_gain_curve
+])
 def test_binary_clf_curve_implicit_pos_label(curve_func):
 # Check that using string class labels raises an informative
 # error for any supported string dtype:
diff --git a/sklearn/utils/seq_dataset.pxd b/sklearn/utils/seq_dataset.pxd
new file mode 100644
index 0000000000000..67ce3b68b4474
--- /dev/null
+++ b/sklearn/utils/seq_dataset.pxd
@@ -0,0 +1,116 @@
+
+#------------------------------------------------------------------------------
+
+"""
+Dataset abstractions for sequential data access.
+WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp
+"""
+
+cimport numpy as np
+
+# SequentialDataset and its two concrete subclasses are (optionally randomized)
+# iterators over the rows of a matrix X and corresponding target values y.
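+#
+# The 64-bit and 32-bit variants below differ only in the floating-point
+# dtype used for data, targets and sample weights (double vs. float); both
+# are generated from the same template, per the warning above.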
+ + +cdef class SequentialDataset64: + cdef int current_index + cdef np.ndarray index + cdef int *index_data_ptr + cdef Py_ssize_t n_samples + cdef np.uint32_t seed + + cdef void shuffle(self, np.uint32_t seed) nogil + cdef int _get_next_index(self) nogil + cdef int _get_random_index(self) nogil + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil + cdef void next(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil + cdef int random(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil + + +cdef class ArrayDataset64(SequentialDataset64): + cdef np.ndarray X + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef Py_ssize_t n_features + cdef np.npy_intp X_stride + cdef double *X_data_ptr + cdef double *Y_data_ptr + cdef np.ndarray feature_indices + cdef int *feature_indices_ptr + cdef double *sample_weight_data + + +cdef class CSRDataset64(SequentialDataset64): + cdef np.ndarray X_data + cdef np.ndarray X_indptr + cdef np.ndarray X_indices + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef double *X_data_ptr + cdef int *X_indptr_ptr + cdef int *X_indices_ptr + cdef double *Y_data_ptr + cdef double *sample_weight_data + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pxd file directly, it is generated from .pxd.tp +""" + +cimport numpy as np + +# SequentialDataset and its two concrete subclasses are (optionally randomized) +# iterators over the rows of a matrix X and corresponding target values y. + + +cdef class SequentialDataset32: + cdef int current_index + cdef np.ndarray index + cdef int *index_data_ptr + cdef Py_ssize_t n_samples + cdef np.uint32_t seed + + cdef void shuffle(self, np.uint32_t seed) nogil + cdef int _get_next_index(self) nogil + cdef int _get_random_index(self) nogil + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil + cdef void next(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil + cdef int random(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil + + +cdef class ArrayDataset32(SequentialDataset32): + cdef np.ndarray X + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef Py_ssize_t n_features + cdef np.npy_intp X_stride + cdef float *X_data_ptr + cdef float *Y_data_ptr + cdef np.ndarray feature_indices + cdef int *feature_indices_ptr + cdef float *sample_weight_data + + +cdef class CSRDataset32(SequentialDataset32): + cdef np.ndarray X_data + cdef np.ndarray X_indptr + cdef np.ndarray X_indices + cdef np.ndarray Y + cdef np.ndarray sample_weights + cdef float *X_data_ptr + cdef int *X_indptr_ptr + cdef int *X_indices_ptr + cdef float *Y_data_ptr + cdef float *sample_weight_data diff --git a/sklearn/utils/seq_dataset.pyx b/sklearn/utils/seq_dataset.pyx new file mode 100644 index 0000000000000..6fa274771defe --- /dev/null +++ b/sklearn/utils/seq_dataset.pyx @@ -0,0 +1,653 @@ +# cython: cdivision=True +# cython: boundscheck=False +# cython: wraparound=False + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. 
+WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +""" + +cimport cython +from libc.limits cimport INT_MAX +cimport numpy as np +import numpy as np + +np.import_array() + +from ._random cimport our_rand_r + +cdef class SequentialDataset64: + """Base class for datasets with sequential data access. + + SequentialDataset is used to iterate over the rows of a matrix X and + corresponding target values y, i.e. to iterate over samples. + There are two methods to get the next sample: + - next : Iterate sequentially (optionally randomized) + - random : Iterate randomly (with replacement) + + Attributes + ---------- + index : np.ndarray + Index array for fast shuffling. + + index_data_ptr : int + Pointer to the index array. + + current_index : int + Index of current sample in ``index``. + The index of current sample in the data is given by + index_data_ptr[current_index]. + + n_samples : Py_ssize_t + Number of samples in the dataset. + + seed : np.uint32_t + Seed used for random sampling. + + """ + + cdef void next(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil: + """Get the next example ``x`` from the dataset. + + This method gets the next sample looping sequentially over all samples. + The order can be shuffled with the method ``shuffle``. + Shuffling once before iterating over all samples corresponds to a + random draw without replacement. It is used for instance in SGD solver. + + Parameters + ---------- + x_data_ptr : double** + A pointer to the double array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : double* + The target value of the next example. + + sample_weight : double* + The weight of the next example. + """ + cdef int current_index = self._get_next_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + + cdef int random(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight) nogil: + """Get a random example ``x`` from the dataset. + + This method gets next sample chosen randomly over a uniform + distribution. It corresponds to a random draw with replacement. + It is used for instance in SAG solver. + + Parameters + ---------- + x_data_ptr : double** + A pointer to the double array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : double* + The target value of the next example. + + sample_weight : double* + The weight of the next example. + + Returns + ------- + current_index : int + Index of current sample. 
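+
+ Unlike ``next``, which loops over all samples in (possibly shuffled)
+ order, ``random`` draws with replacement, so the same sample may be
+ returned several times within one pass.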
+ """ + cdef int current_index = self._get_random_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + return current_index + + cdef void shuffle(self, np.uint32_t seed) nogil: + """Permutes the ordering of examples.""" + # Fisher-Yates shuffle + cdef int *ind = self.index_data_ptr + cdef int n = self.n_samples + cdef unsigned i, j + for i in range(n - 1): + j = i + our_rand_r(&seed) % (n - i) + ind[i], ind[j] = ind[j], ind[i] + + cdef int _get_next_index(self) nogil: + cdef int current_index = self.current_index + if current_index >= (self.n_samples - 1): + current_index = -1 + + current_index += 1 + self.current_index = current_index + return self.current_index + + cdef int _get_random_index(self) nogil: + cdef int n = self.n_samples + cdef int current_index = our_rand_r(&self.seed) % n + self.current_index = current_index + return current_index + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + pass + + def _shuffle_py(self, np.uint32_t seed): + """python function used for easy testing""" + self.shuffle(seed) + + def _next_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_next_index() + return self._sample_py(current_index) + + def _random_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_random_index() + return self._sample_py(current_index) + + def _sample_py(self, int current_index): + """python function used for easy testing""" + cdef double* x_data_ptr + cdef int* x_indices_ptr + cdef int nnz, j + cdef double y, sample_weight + + # call _sample in cython + self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, + current_index) + + # transform the pointed data in numpy CSR array + cdef np.ndarray[double, ndim=1] x_data = np.empty(nnz, + dtype=np.float64) + cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) + cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], + dtype=np.int32) + + for j in range(nnz): + x_data[j] = x_data_ptr[j] + x_indices[j] = x_indices_ptr[j] + + cdef int sample_idx = self.index_data_ptr[current_index] + + return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + + +cdef class ArrayDataset64(SequentialDataset64): + """Dataset backed by a two-dimensional numpy array. + + The dtype of the numpy array is expected to be ``np.float64`` (double) + and C-style memory layout. + """ + + def __cinit__(self, np.ndarray[double, ndim=2, mode='c'] X, + np.ndarray[double, ndim=1, mode='c'] Y, + np.ndarray[double, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """A ``SequentialDataset`` backed by a two-dimensional numpy array. + + Parameters + ---------- + X : ndarray, dtype=double, ndim=2, mode='c' + The sample array, of shape(n_samples, n_features) + + Y : ndarray, dtype=double, ndim=1, mode='c' + The target array, of shape(n_samples, ) + + sample_weights : ndarray, dtype=double, ndim=1, mode='c' + The weight of each sample, of shape(n_samples,) + """ + if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: + raise ValueError("More than %d samples or features not supported;" + " got (%d, %d)." 
+ % (INT_MAX, X.shape[0], X.shape[1])) + + # keep a reference to the data to prevent garbage collection + self.X = X + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = X.shape[0] + self.n_features = X.shape[1] + + cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ + np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices = feature_indices + self.feature_indices_ptr = feature_indices.data + + self.current_index = -1 + self.X_stride = X.strides[0] / X.itemsize + self.X_data_ptr = X.data + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] index = \ + np.arange(0, self.n_samples, dtype=np.intc) + self.index = index + self.index_data_ptr = index.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = sample_idx * self.X_stride + + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.feature_indices_ptr + nnz[0] = self.n_features + sample_weight[0] = self.sample_weight_data[sample_idx] + + +cdef class CSRDataset64(SequentialDataset64): + """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ + + def __cinit__(self, np.ndarray[double, ndim=1, mode='c'] X_data, + np.ndarray[int, ndim=1, mode='c'] X_indptr, + np.ndarray[int, ndim=1, mode='c'] X_indices, + np.ndarray[double, ndim=1, mode='c'] Y, + np.ndarray[double, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """Dataset backed by a scipy sparse CSR matrix. + + The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. + The corresponding feature values are given by + x_data_ptr[0:nnz]. + + Parameters + ---------- + X_data : ndarray, dtype=double, ndim=1, mode='c' + The data array of the CSR features matrix. + + X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' + The index pointer array of the CSR features matrix. + + X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' + The column indices array of the CSR features matrix. + + Y : ndarray, dtype=double, ndim=1, mode='c' + The target values. + + sample_weights : ndarray, dtype=double, ndim=1, mode='c' + The weight of each sample. 
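+
+ A typical construction (sketch): for a scipy.sparse CSR matrix
+ ``X_csr``, pass ``X_csr.data``, ``X_csr.indptr`` and ``X_csr.indices``
+ (the integer arrays as ``np.intc``) together with the targets and
+ per-sample weights, all as C-contiguous arrays of the dtypes above.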
+ """ + # keep a reference to the data to prevent garbage collection + self.X_data = X_data + self.X_indptr = X_indptr + self.X_indices = X_indices + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = Y.shape[0] + self.current_index = -1 + self.X_data_ptr = X_data.data + self.X_indptr_ptr = X_indptr.data + self.X_indices_ptr = X_indices.data + + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, + dtype=np.intc) + self.index = idx + self.index_data_ptr = idx.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, double **x_data_ptr, int **x_ind_ptr, + int *nnz, double *y, double *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = self.X_indptr_ptr[sample_idx] + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.X_indices_ptr + offset + nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset + sample_weight[0] = self.sample_weight_data[sample_idx] + + +#------------------------------------------------------------------------------ + +""" +Dataset abstractions for sequential data access. +WARNING: Do not edit .pyx file directly, it is generated from .pyx.tp +""" + +cimport cython +from libc.limits cimport INT_MAX +cimport numpy as np +import numpy as np + +np.import_array() + +from ._random cimport our_rand_r + +cdef class SequentialDataset32: + """Base class for datasets with sequential data access. + + SequentialDataset is used to iterate over the rows of a matrix X and + corresponding target values y, i.e. to iterate over samples. + There are two methods to get the next sample: + - next : Iterate sequentially (optionally randomized) + - random : Iterate randomly (with replacement) + + Attributes + ---------- + index : np.ndarray + Index array for fast shuffling. + + index_data_ptr : int + Pointer to the index array. + + current_index : int + Index of current sample in ``index``. + The index of current sample in the data is given by + index_data_ptr[current_index]. + + n_samples : Py_ssize_t + Number of samples in the dataset. + + seed : np.uint32_t + Seed used for random sampling. + + """ + + cdef void next(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil: + """Get the next example ``x`` from the dataset. + + This method gets the next sample looping sequentially over all samples. + The order can be shuffled with the method ``shuffle``. + Shuffling once before iterating over all samples corresponds to a + random draw without replacement. It is used for instance in SGD solver. + + Parameters + ---------- + x_data_ptr : float** + A pointer to the float array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : float* + The target value of the next example. + + sample_weight : float* + The weight of the next example. + """ + cdef int current_index = self._get_next_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + + cdef int random(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight) nogil: + """Get a random example ``x`` from the dataset. 
+ + This method gets next sample chosen randomly over a uniform + distribution. It corresponds to a random draw with replacement. + It is used for instance in SAG solver. + + Parameters + ---------- + x_data_ptr : float** + A pointer to the float array which holds the feature + values of the next example. + + x_ind_ptr : np.intc** + A pointer to the int array which holds the feature + indices of the next example. + + nnz : int* + A pointer to an int holding the number of non-zero + values of the next example. + + y : float* + The target value of the next example. + + sample_weight : float* + The weight of the next example. + + Returns + ------- + current_index : int + Index of current sample. + """ + cdef int current_index = self._get_random_index() + self._sample(x_data_ptr, x_ind_ptr, nnz, y, sample_weight, + current_index) + return current_index + + cdef void shuffle(self, np.uint32_t seed) nogil: + """Permutes the ordering of examples.""" + # Fisher-Yates shuffle + cdef int *ind = self.index_data_ptr + cdef int n = self.n_samples + cdef unsigned i, j + for i in range(n - 1): + j = i + our_rand_r(&seed) % (n - i) + ind[i], ind[j] = ind[j], ind[i] + + cdef int _get_next_index(self) nogil: + cdef int current_index = self.current_index + if current_index >= (self.n_samples - 1): + current_index = -1 + + current_index += 1 + self.current_index = current_index + return self.current_index + + cdef int _get_random_index(self) nogil: + cdef int n = self.n_samples + cdef int current_index = our_rand_r(&self.seed) % n + self.current_index = current_index + return current_index + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + pass + + def _shuffle_py(self, np.uint32_t seed): + """python function used for easy testing""" + self.shuffle(seed) + + def _next_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_next_index() + return self._sample_py(current_index) + + def _random_py(self): + """python function used for easy testing""" + cdef int current_index = self._get_random_index() + return self._sample_py(current_index) + + def _sample_py(self, int current_index): + """python function used for easy testing""" + cdef float* x_data_ptr + cdef int* x_indices_ptr + cdef int nnz, j + cdef float y, sample_weight + + # call _sample in cython + self._sample(&x_data_ptr, &x_indices_ptr, &nnz, &y, &sample_weight, + current_index) + + # transform the pointed data in numpy CSR array + cdef np.ndarray[float, ndim=1] x_data = np.empty(nnz, + dtype=np.float32) + cdef np.ndarray[int, ndim=1] x_indices = np.empty(nnz, dtype=np.int32) + cdef np.ndarray[int, ndim=1] x_indptr = np.asarray([0, nnz], + dtype=np.int32) + + for j in range(nnz): + x_data[j] = x_data_ptr[j] + x_indices[j] = x_indices_ptr[j] + + cdef int sample_idx = self.index_data_ptr[current_index] + + return (x_data, x_indices, x_indptr), y, sample_weight, sample_idx + + +cdef class ArrayDataset32(SequentialDataset32): + """Dataset backed by a two-dimensional numpy array. + + The dtype of the numpy array is expected to be ``np.float32`` (float) + and C-style memory layout. + """ + + def __cinit__(self, np.ndarray[float, ndim=2, mode='c'] X, + np.ndarray[float, ndim=1, mode='c'] Y, + np.ndarray[float, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """A ``SequentialDataset`` backed by a two-dimensional numpy array. 
+ + Parameters + ---------- + X : ndarray, dtype=float, ndim=2, mode='c' + The sample array, of shape(n_samples, n_features) + + Y : ndarray, dtype=float, ndim=1, mode='c' + The target array, of shape(n_samples, ) + + sample_weights : ndarray, dtype=float, ndim=1, mode='c' + The weight of each sample, of shape(n_samples,) + """ + if X.shape[0] > INT_MAX or X.shape[1] > INT_MAX: + raise ValueError("More than %d samples or features not supported;" + " got (%d, %d)." + % (INT_MAX, X.shape[0], X.shape[1])) + + # keep a reference to the data to prevent garbage collection + self.X = X + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = X.shape[0] + self.n_features = X.shape[1] + + cdef np.ndarray[int, ndim=1, mode='c'] feature_indices = \ + np.arange(0, self.n_features, dtype=np.intc) + self.feature_indices = feature_indices + self.feature_indices_ptr = feature_indices.data + + self.current_index = -1 + self.X_stride = X.strides[0] / X.itemsize + self.X_data_ptr = X.data + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] index = \ + np.arange(0, self.n_samples, dtype=np.intc) + self.index = index + self.index_data_ptr = index.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = sample_idx * self.X_stride + + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.feature_indices_ptr + nnz[0] = self.n_features + sample_weight[0] = self.sample_weight_data[sample_idx] + + +cdef class CSRDataset32(SequentialDataset32): + """A ``SequentialDataset`` backed by a scipy sparse CSR matrix. """ + + def __cinit__(self, np.ndarray[float, ndim=1, mode='c'] X_data, + np.ndarray[int, ndim=1, mode='c'] X_indptr, + np.ndarray[int, ndim=1, mode='c'] X_indices, + np.ndarray[float, ndim=1, mode='c'] Y, + np.ndarray[float, ndim=1, mode='c'] sample_weights, + np.uint32_t seed=1): + """Dataset backed by a scipy sparse CSR matrix. + + The feature indices of ``x`` are given by x_ind_ptr[0:nnz]. + The corresponding feature values are given by + x_data_ptr[0:nnz]. + + Parameters + ---------- + X_data : ndarray, dtype=float, ndim=1, mode='c' + The data array of the CSR features matrix. + + X_indptr : ndarray, dtype=np.intc, ndim=1, mode='c' + The index pointer array of the CSR features matrix. + + X_indices : ndarray, dtype=np.intc, ndim=1, mode='c' + The column indices array of the CSR features matrix. + + Y : ndarray, dtype=float, ndim=1, mode='c' + The target values. + + sample_weights : ndarray, dtype=float, ndim=1, mode='c' + The weight of each sample. 
+ """ + # keep a reference to the data to prevent garbage collection + self.X_data = X_data + self.X_indptr = X_indptr + self.X_indices = X_indices + self.Y = Y + self.sample_weights = sample_weights + + self.n_samples = Y.shape[0] + self.current_index = -1 + self.X_data_ptr = X_data.data + self.X_indptr_ptr = X_indptr.data + self.X_indices_ptr = X_indices.data + + self.Y_data_ptr = Y.data + self.sample_weight_data = sample_weights.data + + # Use index array for fast shuffling + cdef np.ndarray[int, ndim=1, mode='c'] idx = np.arange(self.n_samples, + dtype=np.intc) + self.index = idx + self.index_data_ptr = idx.data + # seed should not be 0 for our_rand_r + self.seed = max(seed, 1) + + cdef void _sample(self, float **x_data_ptr, int **x_ind_ptr, + int *nnz, float *y, float *sample_weight, + int current_index) nogil: + cdef long long sample_idx = self.index_data_ptr[current_index] + cdef long long offset = self.X_indptr_ptr[sample_idx] + y[0] = self.Y_data_ptr[sample_idx] + x_data_ptr[0] = self.X_data_ptr + offset + x_ind_ptr[0] = self.X_indices_ptr + offset + nnz[0] = self.X_indptr_ptr[sample_idx + 1] - offset + sample_weight[0] = self.sample_weight_data[sample_idx] + From 6e344a2be1d5ecd1ff6dc1a313dfc65453f8ea76 Mon Sep 17 00:00:00 2001 From: tber16_atl Date: Wed, 22 Dec 2021 10:49:59 +0100 Subject: [PATCH 3/6] Update upstream to the forked repo --- sklearn/metrics/__init__.py | 82 +++++++++++++++++++++++++++++++++++++ sklearn/metrics/_ranking.py | 5 ++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py index 20cb6ed9404b1..f6295550770e1 100644 --- a/sklearn/metrics/__init__.py +++ b/sklearn/metrics/__init__.py @@ -97,6 +97,7 @@ __all__ = [ +<<<<<<< HEAD "accuracy_score", "adjusted_mutual_info_score", "adjusted_rand_score", @@ -177,4 +178,85 @@ "v_measure_score", "zero_one_loss", "brier_score_loss", +======= + 'accuracy_score', + 'adjusted_mutual_info_score', + 'adjusted_rand_score', + 'auc', + 'average_precision_score', + 'balanced_accuracy_score', + 'calinski_harabasz_score', + 'check_scoring', + 'classification_report', + 'cluster', + 'cohen_kappa_score', + 'completeness_score', + 'ConfusionMatrixDisplay', + 'confusion_matrix', + 'consensus_score', + 'coverage_error', + 'dcg_score', + 'davies_bouldin_score', + 'DetCurveDisplay', + 'det_curve', + 'euclidean_distances', + 'explained_variance_score', + 'f1_score', + 'fbeta_score', + 'fowlkes_mallows_score', + 'get_scorer', + 'hamming_loss', + 'hinge_loss', + 'homogeneity_completeness_v_measure', + 'homogeneity_score', + 'jaccard_score', + 'label_ranking_average_precision_score', + 'label_ranking_loss', + 'log_loss', + 'make_scorer', + 'nan_euclidean_distances', + 'matthews_corrcoef', + 'max_error', + 'mean_absolute_error', + 'mean_squared_error', + 'mean_squared_log_error', + 'mean_pinball_loss', + 'mean_poisson_deviance', + 'mean_gamma_deviance', + 'mean_tweedie_deviance', + 'median_absolute_error', + 'mean_absolute_percentage_error', + 'multilabel_confusion_matrix', + 'mutual_info_score', + 'ndcg_score', + 'normalized_mutual_info_score', + 'pair_confusion_matrix', + 'pairwise_distances', + 'pairwise_distances_argmin', + 'pairwise_distances_argmin_min', + 'pairwise_distances_chunked', + 'pairwise_kernels', + 'plot_confusion_matrix', + 'plot_det_curve', + 'plot_precision_recall_curve', + 'plot_roc_curve', + 'PrecisionRecallDisplay', + 'precision_recall_curve', + 'precision_recall_fscore_support', + 'precision_score', + 'r2_score', + 'rand_score', + 'recall_score', + 
'RocCurveDisplay',
+    'roc_auc_score',
+    'roc_curve',
+    'cumulative_gain_curve',
+    'SCORERS',
+    'silhouette_samples',
+    'silhouette_score',
+    'top_k_accuracy_score',
+    'v_measure_score',
+    'zero_one_loss',
+    'brier_score_loss',
+>>>>>>> fa66ae8c2 (FEA cumulative_gain_curve correct syntax)
 ]
diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index f3734b85ad261..cd2b4c704e701 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1005,9 +1005,11 @@ def roc_curve(
     tpr = tps / tps[-1]

     return fpr, tpr, thresholds

+
+@_deprecate_positional_args
 def cumulative_gain_curve(y_true, y_score, pos_label=None):
-    """This function generates the points necessary to plot the Cumulative Gain for each ten percent of the samples
+    """Compute Cumulative Gain for each ten percent of the samples
     Note: This implementation is restricted to the binary classification task.

     Parameters
@@ -1073,7 +1075,6 @@ def cumulative_gain_curve(y_true, y_score, pos_label=None):
     percentages = np.insert(percentages, 0, [0])

     return percentages, gains
-

 def label_ranking_average_precision_score(y_true, y_score, *, sample_weight=None):

From d538225633a54d9f52f68fc7f6aee0463f41d7ac Mon Sep 17 00:00:00 2001
From: Ali TBER
Date: Tue, 2 Mar 2021 09:11:12 +0100
Subject: [PATCH 4/6] Another syntax correction

---
 sklearn/metrics/_ranking.py | 10 +++++-----
 sklearn/metrics/ranking.py  | 13 +++++++------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index cd2b4c704e701..e60f0ab0d2a2e 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1005,13 +1005,13 @@ def roc_curve(
     tpr = tps / tps[-1]

     return fpr, tpr, thresholds
-
+
 @_deprecate_positional_args
 def cumulative_gain_curve(y_true, y_score, pos_label=None):
     """Compute Cumulative Gain for each ten percent of the samples
     Note: This implementation is restricted to the binary classification task.
-
+
     Parameters
     ----------
@@ -1035,10 +1035,10 @@ def cumulative_gain_curve(y_true, y_score, pos_label=None):
     Examples
     --------
     >>> import numpy as np
-    >>> from sklearn import metrics
+    >>> from sklearn.metrics import cumulative_gain_curve
     >>> y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
-    >>> y_pred = [0.1, 0.8, 0.9, 0,3, 0.4, 0.6, 0.6, 0.6, 0.44]
-    >>> percentages, gains = metrics.cumulative_gain_curve(y_true, y_pred, pos_label=1)
+    >>> y_pred = [0.1, 0.8, 0.9, 0.95, 0.4, 0.7, 0.6, 0.55, 0.44, 0.2]
+    >>> percentages, gains = cumulative_gain_curve(y_true, y_pred, pos_label=1)
     >>> percentages
     array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
     >>> gains
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
index 73dabf40ab5e5..4889bfc4f3b86 100644
--- a/sklearn/metrics/ranking.py
+++ b/sklearn/metrics/ranking.py
@@ -787,7 +787,7 @@ def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
 def cumulative_gain_curve(y_true, y_score, pos_label=None):
     """Compute Cumulative Gain for each ten percent of the samples
     Note: This implementation is restricted to the binary classification task.
-
+
     Parameters
     ----------
@@ -811,10 +811,10 @@ def cumulative_gain_curve(y_true, y_score, pos_label=None):
     Examples
     --------
     >>> import numpy as np
-    >>> from sklearn import metrics
+    >>> from sklearn.metrics import cumulative_gain_curve
     >>> y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
-    >>> y_pred = [0.1, 0.8, 0.9, 0,3, 0.4, 0.6, 0.6, 0.6, 0.44]
-    >>> percentages, gains = metrics.cumulative_gain_curve(y_true, y_pred, pos_label=1)
+    >>> y_pred = [0.1, 0.8, 0.9, 0.95, 0.4, 0.7, 0.6, 0.55, 0.44, 0.2]
+    >>> percentages, gains = cumulative_gain_curve(y_true, y_pred, pos_label=1)
     >>> percentages
     array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
     >>> gains
@@ -1061,8 +1061,9 @@ def label_ranking_loss(y_true, y_score, sample_weight=None):
     true_at_reversed_rank = np.bincount(
         unique_inverse[y_true.indices[start:stop]],
         minlength=len(unique_scores))
-    all_at_reversed_rank = np.bincount(unique_inverse,
-                                       minlength=len(unique_scores))
+    all_at_reversed_rank = np.bincount(
+        unique_inverse,
+        minlength=len(unique_scores))
     false_at_reversed_rank = all_at_reversed_rank - true_at_reversed_rank

     # if the scores are ordered, it's possible to count the number of

From 772919fd6cc4ffe45f9521fa45cd162af4f82dea Mon Sep 17 00:00:00 2001
From: tber16_atl
Date: Wed, 22 Dec 2021 11:04:43 +0100
Subject: [PATCH 5/6] Fix linting error

---
 sklearn/metrics/__init__.py | 85 +------------------------------------
 1 file changed, 1 insertion(+), 84 deletions(-)

diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
index f6295550770e1..a92f38151b71a 100644
--- a/sklearn/metrics/__init__.py
+++ b/sklearn/metrics/__init__.py
@@ -97,88 +97,6 @@

 __all__ = [
-<<<<<<< HEAD
-    "accuracy_score",
-    "adjusted_mutual_info_score",
-    "adjusted_rand_score",
-    "auc",
-    "average_precision_score",
-    "balanced_accuracy_score",
-    "calinski_harabasz_score",
-    "check_scoring",
-    "classification_report",
-    "cluster",
-    "cohen_kappa_score",
-    "completeness_score",
-    "ConfusionMatrixDisplay",
-    "confusion_matrix",
-    "consensus_score",
-    "coverage_error",
-    "d2_tweedie_score",
-    "dcg_score",
-    "davies_bouldin_score",
-    "DetCurveDisplay",
-    "det_curve",
-    "DistanceMetric",
-    "euclidean_distances",
-    "explained_variance_score",
-    "f1_score",
-    "fbeta_score",
-    "fowlkes_mallows_score",
-    "get_scorer",
-    "hamming_loss",
-    "hinge_loss",
-    "homogeneity_completeness_v_measure",
-    "homogeneity_score",
-    "jaccard_score",
-    "label_ranking_average_precision_score",
-    "label_ranking_loss",
-    "log_loss",
-    "make_scorer",
-    "nan_euclidean_distances",
-    "matthews_corrcoef",
-    "max_error",
-    "mean_absolute_error",
-    "mean_squared_error",
-    "mean_squared_log_error",
-    "mean_pinball_loss",
-    "mean_poisson_deviance",
-    "mean_gamma_deviance",
-    "mean_tweedie_deviance",
-    "median_absolute_error",
-    "mean_absolute_percentage_error",
-    "multilabel_confusion_matrix",
-    "mutual_info_score",
-    "ndcg_score",
-    "normalized_mutual_info_score",
-    "pair_confusion_matrix",
-    "pairwise_distances",
-    "pairwise_distances_argmin",
-    "pairwise_distances_argmin_min",
-    "pairwise_distances_chunked",
-    "pairwise_kernels",
-    "plot_confusion_matrix",
-    "plot_det_curve",
-    "plot_precision_recall_curve",
-    "plot_roc_curve",
-    "PrecisionRecallDisplay",
-    "precision_recall_curve",
-    "precision_recall_fscore_support",
-    "precision_score",
-    "r2_score",
-    "rand_score",
-    "recall_score",
-    "RocCurveDisplay",
-    "roc_auc_score",
-    "roc_curve",
-    "SCORERS",
-    "silhouette_samples",
-    "silhouette_score",
-    "top_k_accuracy_score",
-    "v_measure_score",
-    "zero_one_loss",
-    "brier_score_loss",
-=======
     'accuracy_score',
     'adjusted_mutual_info_score',
     'adjusted_rand_score',
@@ -195,6 +113,7 @@
     'confusion_matrix',
     'consensus_score',
     'coverage_error',
+    'cumulative_gain_curve',
     'dcg_score',
     'davies_bouldin_score',
     'DetCurveDisplay',
@@ -250,7 +169,6 @@
     'RocCurveDisplay',
     'roc_auc_score',
     'roc_curve',
-    'cumulative_gain_curve',
     'SCORERS',
     'silhouette_samples',
     'silhouette_score',
@@ -258,5 +176,4 @@
     'v_measure_score',
     'zero_one_loss',
     'brier_score_loss',
->>>>>>> fa66ae8c2 (FEA cumulative_gain_curve correct syntax)
 ]

From a9912df17b42eb003a7e41c40423061c864090d4 Mon Sep 17 00:00:00 2001
From: Alit10
Date: Wed, 22 Dec 2021 11:14:10 +0100
Subject: [PATCH 6/6] Fix linting error2

---
 sklearn/metrics/_ranking.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sklearn/metrics/_ranking.py b/sklearn/metrics/_ranking.py
index e60f0ab0d2a2e..27263790b90e4 100644
--- a/sklearn/metrics/_ranking.py
+++ b/sklearn/metrics/_ranking.py
@@ -1007,7 +1007,6 @@ def roc_curve(
     return fpr, tpr, thresholds

-@_deprecate_positional_args
 def cumulative_gain_curve(y_true, y_score, pos_label=None):
     """Compute Cumulative Gain for each ten percent of the samples
     Note: This implementation is restricted to the binary classification task.
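
Usage sketch (not part of the patch series): assuming the final state of the
series, where cumulative_gain_curve is exported from sklearn.metrics, the
corrected docstring example corresponds to the session below. The data values
are the docstring's; the printed comments are illustrative.

from sklearn.metrics import cumulative_gain_curve

# Four positives among ten samples, as in the docstring example.
y_true = [0, 1, 1, 0, 0, 0, 1, 1, 0, 0]
y_pred = [0.1, 0.8, 0.9, 0.95, 0.4, 0.7, 0.6, 0.55, 0.44, 0.2]

percentages, gains = cumulative_gain_curve(y_true, y_pred, pos_label=1)

# percentages[i] is the fraction of samples inspected (highest scores first);
# gains[i] is the fraction of all positives captured within that fraction.
print(percentages)  # [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1. ]
print(gains)        # [0.   0.   0.25 0.5  0.5  0.75 1.   1.   1.   1.   1.  ]

A random ranking would track the diagonal gains == percentages; the lift of
the curve above that diagonal is what the cumulative gains chart visualizes.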
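
For readers of the seq_dataset.pyx file reintroduced by patch 2: the shuffle
method of SequentialDataset32 shown earlier is a textbook Fisher-Yates pass.
A pure-Python equivalent is sketched below, with random.Random standing in
for the internal our_rand_r generator (the substitution is illustrative only;
the two generators produce different streams).

import random

def fisher_yates_shuffle(index, seed):
    # For each position i, draw j uniformly from [i, n) and swap, which
    # makes every permutation of the index array equally likely.
    rng = random.Random(seed)           # stands in for our_rand_r(&seed)
    n = len(index)
    for i in range(n - 1):
        j = i + rng.randrange(n - i)    # j = i + our_rand_r(&seed) % (n - i)
        index[i], index[j] = index[j], index[i]
    return index

print(fisher_yates_shuffle(list(range(10)), seed=42))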
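
Likewise, the _sample implementations in that file reduce to index arithmetic
over the backing arrays: ArrayDataset32 computes a dense row offset as
sample_idx * X_stride, while CSRDataset32 reads the row extent from X_indptr.
A NumPy sketch of the CSR case follows; the helper name is hypothetical.

import numpy as np
from scipy.sparse import csr_matrix

def sample_row(X_data, X_indptr, X_indices, sample_idx):
    # Mirrors CSRDataset32._sample: offset is where the row starts in
    # X_data/X_indices, and nnz is its number of stored values.  The Cython
    # version hands back raw pointers into the same buffers instead of
    # copying, which is what lets it run inside a nogil solver loop.
    offset = X_indptr[sample_idx]
    nnz = X_indptr[sample_idx + 1] - offset
    return X_data[offset:offset + nnz], X_indices[offset:offset + nnz], nnz

X = csr_matrix(np.array([[1., 0., 2., 0.],
                         [0., 0., 0., 3.],
                         [4., 5., 0., 0.]], dtype=np.float32))
data, indices, nnz = sample_row(X.data, X.indptr, X.indices, sample_idx=2)
print(data, indices, nnz)   # [4. 5.] [0 1] 2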