diff --git a/sklearn/covariance/graph_lasso_.py b/sklearn/covariance/graph_lasso_.py
index fba26a9ffa5cc..2965f4e18ca60 100644
--- a/sklearn/covariance/graph_lasso_.py
+++ b/sklearn/covariance/graph_lasso_.py
@@ -20,7 +20,8 @@
 from ..utils.extmath import pinvh
 from ..linear_model import lars_path
 from ..linear_model import cd_fast
-from ..cross_validation import _check_cv as check_cv, cross_val_score
+from ..model_selection.partition import _check_cv as check_cv
+from ..model_selection import cross_val_score
 from ..externals.joblib import Parallel, delayed
 import collections
 
@@ -388,7 +389,7 @@ def graph_lasso_path(X, alphas, cov_init=None, X_test=None, mode='cd',
 
 
 class GraphLassoCV(GraphLasso):
-    """Sparse inverse covariance w/ cross-validated choice of the l1 penalty
+    """Sparse inverse covariance w/ cross-validated choice of the l1 penalty
 
     Parameters
     ----------
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index c5206a4fd5239..28168b645b3fa 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -12,22 +12,15 @@
 from __future__ import division
 
 import warnings
-from itertools import chain, combinations
-from math import ceil, floor, factorial
+from math import ceil
 import numbers
-import time
-from abc import ABCMeta, abstractmethod
 
-import numpy as np
-import scipy.sparse as sp
+from .utils import check_random_state
 
-from .base import is_classifier, clone
-from .utils import check_arrays, check_random_state, safe_mask
-from .utils.validation import _num_samples
-from .externals.joblib import Parallel, delayed, logger
-from .externals.six import with_metaclass
-from .externals.six.moves import zip
-from .metrics.scorer import check_scoring
+from .model_selection.partition import LeaveOneOut, LeavePOut, KFold, \
+    StratifiedKFold, LeaveOneLabelOut, LeavePLabelOut, \
+    ShuffleSplit, StratifiedShuffleSplit, train_test_split, check_cv
+from .model_selection.validate import cross_val_score, permutation_test_score
 
 __all__ = ['Bootstrap',
            'KFold',
@@ -44,566 +37,8 @@
            'train_test_split']
 
 
-class _PartitionIterator(with_metaclass(ABCMeta)):
-    """Base class for CV iterators where train_mask = ~test_mask
-
-    Implementations must define `_iter_test_masks` or `_iter_test_indices`.
-
-    Parameters
-    ----------
-    n : int
-        Total number of elements in dataset.
-    """
-
-    def __init__(self, n, indices=None):
-        if indices is None:
-            indices = True
-        else:
-            warnings.warn("The indices parameter is deprecated and will be "
-                          "removed (assumed True) in 0.17", DeprecationWarning,
-                          stacklevel=1)
-        if abs(n - int(n)) >= np.finfo('f').eps:
-            raise ValueError("n must be an integer")
-        self.n = int(n)
-        self._indices = indices
-
-    @property
-    def indices(self):
-        warnings.warn("The indices attribute is deprecated and will be "
-                      "removed (assumed True) in 0.17", DeprecationWarning,
-                      stacklevel=1)
-        return self._indices
-
-    def __iter__(self):
-        indices = self._indices
-        if indices:
-            ind = np.arange(self.n)
-        for test_index in self._iter_test_masks():
-            train_index = np.logical_not(test_index)
-            if indices:
-                train_index = ind[train_index]
-                test_index = ind[test_index]
-            yield train_index, test_index
-
-    # Since subclasses must implement either _iter_test_masks or
-    # _iter_test_indices, neither can be abstract.
-    def _iter_test_masks(self):
-        """Generates boolean masks corresponding to test sets.
- - By default, delegates to _iter_test_indices() - """ - for test_index in self._iter_test_indices(): - test_mask = self._empty_mask() - test_mask[test_index] = True - yield test_mask - - def _iter_test_indices(self): - """Generates integer indices corresponding to test sets.""" - raise NotImplementedError - - def _empty_mask(self): - return np.zeros(self.n, dtype=np.bool) - - -class LeaveOneOut(_PartitionIterator): - """Leave-One-Out cross validation iterator. - - Provides train/test indices to split data in train test sets. Each - sample is used once as a test set (singleton) while the remaining - samples form the training set. - - Note: ``LeaveOneOut(n)`` is equivalent to ``KFold(n, n_folds=n)`` and - ``LeavePOut(n, p=1)``. - - Due to the high number of test sets (which is the same as the - number of samples) this cross validation method can be very costly. - For large datasets one should favor KFold, StratifiedKFold or - ShuffleSplit. - - Parameters - ---------- - n : int - Total number of elements in dataset. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4]]) - >>> y = np.array([1, 2]) - >>> loo = cross_validation.LeaveOneOut(2) - >>> len(loo) - 2 - >>> print(loo) - sklearn.cross_validation.LeaveOneOut(n=2) - >>> for train_index, test_index in loo: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... print(X_train, X_test, y_train, y_test) - TRAIN: [1] TEST: [0] - [[3 4]] [[1 2]] [2] [1] - TRAIN: [0] TEST: [1] - [[1 2]] [[3 4]] [1] [2] - - See also - -------- - LeaveOneLabelOut for splitting the data according to explicit, - domain-specific stratification of the dataset. - """ - - def _iter_test_indices(self): - return range(self.n) - - def __repr__(self): - return '%s.%s(n=%i)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - ) - - def __len__(self): - return self.n - - -class LeavePOut(_PartitionIterator): - """Leave-P-Out cross validation iterator - - Provides train/test indices to split data in train test sets. This results - in testing on all distinct samples of size p, while the remaining n - p - samples form the training set in each iteration. - - Note: ``LeavePOut(n, p)`` is NOT equivalent to ``KFold(n, n_folds=n // p)`` - which creates non-overlapping test sets. - - Due to the high number of iterations which grows combinatorically with the - number of samples this cross validation method can be very costly. For - large datasets one should favor KFold, StratifiedKFold or ShuffleSplit. - - Parameters - ---------- - n : int - Total number of elements in dataset. - - p : int - Size of the test sets. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 3, 4]) - >>> lpo = cross_validation.LeavePOut(4, 2) - >>> len(lpo) - 6 - >>> print(lpo) - sklearn.cross_validation.LeavePOut(n=4, p=2) - >>> for train_index, test_index in lpo: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... 
y_train, y_test = y[train_index], y[test_index] - TRAIN: [2 3] TEST: [0 1] - TRAIN: [1 3] TEST: [0 2] - TRAIN: [1 2] TEST: [0 3] - TRAIN: [0 3] TEST: [1 2] - TRAIN: [0 2] TEST: [1 3] - TRAIN: [0 1] TEST: [2 3] - """ - - def __init__(self, n, p, indices=None): - super(LeavePOut, self).__init__(n, indices) - self.p = p - - def _iter_test_indices(self): - for comb in combinations(range(self.n), self.p): - yield np.array(comb) - - def __repr__(self): - return '%s.%s(n=%i, p=%i)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.p, - ) - - def __len__(self): - return int(factorial(self.n) / factorial(self.n - self.p) - / factorial(self.p)) - - -class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): - """Base class to validate KFold approaches""" - - @abstractmethod - def __init__(self, n, n_folds, indices, shuffle, random_state): - super(_BaseKFold, self).__init__(n, indices) - - if abs(n_folds - int(n_folds)) >= np.finfo('f').eps: - raise ValueError("n_folds must be an integer") - self.n_folds = n_folds = int(n_folds) - - if n_folds <= 1: - raise ValueError( - "k-fold cross validation requires at least one" - " train / test split by setting n_folds=2 or more," - " got n_folds={0}.".format(n_folds)) - if n_folds > self.n: - raise ValueError( - ("Cannot have number of folds n_folds={0} greater" - " than the number of samples: {1}.").format(n_folds, n)) - - if not isinstance(shuffle, bool): - raise TypeError("shuffle must be True or False;" - " got {0}".format(shuffle)) - self.shuffle = shuffle - self.random_state = random_state - - -class KFold(_BaseKFold): - """K-Folds cross validation iterator. - - Provides train/test indices to split data in train test sets. Split - dataset into k consecutive folds (without shuffling). - - Each fold is then used a validation set once while the k - 1 remaining - fold form the training set. - - Parameters - ---------- - n : int - Total number of elements. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - shuffle : boolean, optional - Whether to shuffle the data before splitting into batches. - - random_state : None, int or RandomState - Pseudo-random number generator state used for random - sampling. If None, use default numpy RNG for shuffling - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([1, 2, 3, 4]) - >>> kf = cross_validation.KFold(4, n_folds=2) - >>> len(kf) - 2 - >>> print(kf) # doctest: +NORMALIZE_WHITESPACE - sklearn.cross_validation.KFold(n=4, n_folds=2, shuffle=False, - random_state=None) - >>> for train_index, test_index in kf: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [2 3] TEST: [0 1] - TRAIN: [0 1] TEST: [2 3] - - Notes - ----- - The first n % n_folds folds have size n // n_folds + 1, other folds have - size n // n_folds. - - See also - -------- - StratifiedKFold: take label information into account to avoid building - folds with imbalanced class distributions (for binary or multiclass - classification tasks). 
- """ - - def __init__(self, n, n_folds=3, indices=None, shuffle=False, - random_state=None): - super(KFold, self).__init__(n, n_folds, indices, shuffle, random_state) - self.idxs = np.arange(n) - if shuffle: - rng = check_random_state(self.random_state) - rng.shuffle(self.idxs) - - def _iter_test_indices(self): - n = self.n - n_folds = self.n_folds - fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int) - fold_sizes[:n % n_folds] += 1 - current = 0 - for fold_size in fold_sizes: - start, stop = current, current + fold_size - yield self.idxs[start:stop] - current = stop - - def __repr__(self): - return '%s.%s(n=%i, n_folds=%i, shuffle=%s, random_state=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.n, - self.n_folds, - self.shuffle, - self.random_state, - ) - - def __len__(self): - return self.n_folds - - -class StratifiedKFold(_BaseKFold): - """Stratified K-Folds cross validation iterator - - Provides train/test indices to split data in train test sets. - - This cross-validation object is a variation of KFold that - returns stratified folds. The folds are made by preserving - the percentage of samples for each class. - - Parameters - ---------- - y : array-like, [n_samples] - Samples to split in K folds. - - n_folds : int, default=3 - Number of folds. Must be at least 2. - - shuffle : boolean, optional - Whether to shuffle each stratification of the data before splitting - into batches. - - random_state : None, int or RandomState - Pseudo-random number generator state used for random - sampling. If None, use default numpy RNG for shuffling - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> skf = cross_validation.StratifiedKFold(y, n_folds=2) - >>> len(skf) - 2 - >>> print(skf) # doctest: +NORMALIZE_WHITESPACE - sklearn.cross_validation.StratifiedKFold(labels=[0 0 1 1], n_folds=2, - shuffle=False, random_state=None) - >>> for train_index, test_index in skf: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 3] TEST: [0 2] - TRAIN: [0 2] TEST: [1 3] - - Notes - ----- - All the folds have size trunc(n_samples / n_folds), the last one has the - complementary. - - """ - - def __init__(self, y, n_folds=3, indices=None, shuffle=False, - random_state=None): - super(StratifiedKFold, self).__init__( - len(y), n_folds, indices, shuffle, random_state) - y = np.asarray(y) - n_samples = y.shape[0] - unique_labels, y_inversed = np.unique(y, return_inverse=True) - label_counts = np.bincount(y_inversed) - min_labels = np.min(label_counts) - if self.n_folds > min_labels: - warnings.warn(("The least populated class in y has only %d" - " members, which is too few. The minimum" - " number of labels for any class cannot" - " be less than n_folds=%d." 
- % (min_labels, self.n_folds)), Warning) - - # don't want to use the same seed in each label's shuffle - if self.shuffle: - rng = check_random_state(self.random_state) - else: - rng = self.random_state - - # pre-assign each sample to a test fold index using individual KFold - # splitting strategies for each label so as to respect the - # balance of labels - per_label_cvs = [ - KFold(max(c, self.n_folds), self.n_folds, shuffle=self.shuffle, - random_state=rng) for c in label_counts] - test_folds = np.zeros(n_samples, dtype=np.int) - for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)): - for label, (_, test_split) in zip(unique_labels, per_label_splits): - label_test_folds = test_folds[y == label] - # the test split can be too big because we used - # KFold(max(c, self.n_folds), self.n_folds) instead of - # KFold(c, self.n_folds) to make it possible to not crash even - # if the data is not 100% stratifiable for all the labels - # (we use a warning instead of raising an exception) - # If this is the case, let's trim it: - test_split = test_split[test_split < len(label_test_folds)] - label_test_folds[test_split] = test_fold_idx - test_folds[y == label] = label_test_folds - - self.test_folds = test_folds - self.y = y - - def _iter_test_masks(self): - for i in range(self.n_folds): - yield self.test_folds == i - - def __repr__(self): - return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.y, - self.n_folds, - self.shuffle, - self.random_state, - ) - - def __len__(self): - return self.n_folds - - -class LeaveOneLabelOut(_PartitionIterator): - """Leave-One-Label_Out cross-validation iterator - - Provides train/test indices to split data according to a third-party - provided label. This label information can be used to encode arbitrary - domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - Parameters - ---------- - labels : array-like of int with shape (n_samples,) - Arbitrary domain-specific stratification of the data to be used - to draw the splits. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) - >>> y = np.array([1, 2, 1, 2]) - >>> labels = np.array([1, 1, 2, 2]) - >>> lol = cross_validation.LeaveOneLabelOut(labels) - >>> len(lol) - 2 - >>> print(lol) - sklearn.cross_validation.LeaveOneLabelOut(labels=[1 1 2 2]) - >>> for train_index, test_index in lol: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... 
print(X_train, X_test, y_train, y_test) - TRAIN: [2 3] TEST: [0 1] - [[5 6] - [7 8]] [[1 2] - [3 4]] [1 2] [1 2] - TRAIN: [0 1] TEST: [2 3] - [[1 2] - [3 4]] [[5 6] - [7 8]] [1 2] [1 2] - - """ - - def __init__(self, labels, indices=None): - super(LeaveOneLabelOut, self).__init__(len(labels), indices) - # We make a copy of labels to avoid side-effects during iteration - self.labels = np.array(labels, copy=True) - self.unique_labels = np.unique(labels) - self.n_unique_labels = len(self.unique_labels) - - def _iter_test_masks(self): - for i in self.unique_labels: - yield self.labels == i - - def __repr__(self): - return '%s.%s(labels=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.labels, - ) - - def __len__(self): - return self.n_unique_labels - - -class LeavePLabelOut(_PartitionIterator): - """Leave-P-Label_Out cross-validation iterator - - Provides train/test indices to split data according to a third-party - provided label. This label information can be used to encode arbitrary - domain specific stratifications of the samples as integers. - - For instance the labels could be the year of collection of the samples - and thus allow for cross-validation against time-based splits. - - The difference between LeavePLabelOut and LeaveOneLabelOut is that - the former builds the test sets with all the samples assigned to - ``p`` different values of the labels while the latter uses samples - all assigned the same labels. - - Parameters - ---------- - labels : array-like of int with shape (n_samples,) - Arbitrary domain-specific stratification of the data to be used - to draw the splits. - - p : int - Number of samples to leave out in the test split. - - Examples - -------- - >>> from sklearn import cross_validation - >>> X = np.array([[1, 2], [3, 4], [5, 6]]) - >>> y = np.array([1, 2, 1]) - >>> labels = np.array([1, 2, 3]) - >>> lpl = cross_validation.LeavePLabelOut(labels, p=2) - >>> len(lpl) - 3 - >>> print(lpl) - sklearn.cross_validation.LeavePLabelOut(labels=[1 2 3], p=2) - >>> for train_index, test_index in lpl: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - ... 
print(X_train, X_test, y_train, y_test) - TRAIN: [2] TEST: [0 1] - [[5 6]] [[1 2] - [3 4]] [1] [1 2] - TRAIN: [1] TEST: [0 2] - [[3 4]] [[1 2] - [5 6]] [2] [1 1] - TRAIN: [0] TEST: [1 2] - [[1 2]] [[3 4] - [5 6]] [1] [2 1] - """ - - def __init__(self, labels, p, indices=None): - # We make a copy of labels to avoid side-effects during iteration - super(LeavePLabelOut, self).__init__(len(labels), indices) - self.labels = np.array(labels, copy=True) - self.unique_labels = np.unique(labels) - self.n_unique_labels = len(self.unique_labels) - self.p = p - - def _iter_test_masks(self): - comb = combinations(range(self.n_unique_labels), self.p) - for idx in comb: - test_index = self._empty_mask() - idx = np.array(idx) - for l in self.unique_labels[idx]: - test_index[self.labels == l] = True - yield test_index - - def __repr__(self): - return '%s.%s(labels=%s, p=%s)' % ( - self.__class__.__module__, - self.__class__.__name__, - self.labels, - self.p, - ) - - def __len__(self): - return int(factorial(self.n_unique_labels) / - factorial(self.n_unique_labels - self.p) / - factorial(self.p)) - +#TODO: move Boostrap somewhere else and import it from here +#TODO: issue a DeprecationWarning when this module is imported class Bootstrap(object): """Random sampling with replacement cross-validation iterator @@ -745,817 +180,6 @@ def __len__(self): return self.n_iter -class BaseShuffleSplit(with_metaclass(ABCMeta)): - """Base class for ShuffleSplit and StratifiedShuffleSplit""" - - def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, - indices=None, random_state=None, n_iterations=None): - if indices is None: - indices = True - else: - warnings.warn("The indices parameter is deprecated and will be " - "removed (assumed True) in 0.17", DeprecationWarning) - self.n = n - self.n_iter = n_iter - if n_iterations is not None: # pragma: no cover - warnings.warn("n_iterations was renamed to n_iter for consistency " - " and will be removed in 0.16.") - self.n_iter = n_iterations - self.test_size = test_size - self.train_size = train_size - self.random_state = random_state - self._indices = indices - self.n_train, self.n_test = _validate_shuffle_split(n, - test_size, - train_size) - - @property - def indices(self): - warnings.warn("The indices attribute is deprecated and will be " - "removed (assumed True) in 0.17", DeprecationWarning, - stacklevel=1) - return self._indices - - def __iter__(self): - if self._indices: - for train, test in self._iter_indices(): - yield train, test - return - for train, test in self._iter_indices(): - train_m = np.zeros(self.n, dtype=bool) - test_m = np.zeros(self.n, dtype=bool) - train_m[train] = True - test_m[test] = True - yield train_m, test_m - - @abstractmethod - def _iter_indices(self): - """Generate (train, test) indices""" - - -class ShuffleSplit(BaseShuffleSplit): - """Random permutation cross-validation iterator. - - Yields indices to split data into training and test sets. - - Note: contrary to other cross-validation strategies, random splits - do not guarantee that all folds will be different, although this is - still very likely for sizeable datasets. - - Parameters - ---------- - n : int - Total number of elements in the dataset. - - n_iter : int (default 10) - Number of re-shuffling & splitting iterations. - - test_size : float (default 0.1), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. 
If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int or RandomState - Pseudo-random number generator state used for random sampling. - - Examples - -------- - >>> from sklearn import cross_validation - >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, - ... test_size=.25, random_state=0) - >>> len(rs) - 3 - >>> print(rs) - ... # doctest: +ELLIPSIS - ShuffleSplit(4, n_iter=3, test_size=0.25, ...) - >>> for train_index, test_index in rs: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... - TRAIN: [3 1 0] TEST: [2] - TRAIN: [2 1 3] TEST: [0] - TRAIN: [0 2 1] TEST: [3] - - >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, - ... train_size=0.5, test_size=.25, random_state=0) - >>> for train_index, test_index in rs: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... - TRAIN: [3 1] TEST: [2] - TRAIN: [2 1] TEST: [0] - TRAIN: [0 2] TEST: [3] - - See also - -------- - Bootstrap: cross-validation using re-sampling with replacement. - """ - - def _iter_indices(self): - rng = check_random_state(self.random_state) - for i in range(self.n_iter): - # random partition - permutation = rng.permutation(self.n) - ind_test = permutation[:self.n_test] - ind_train = permutation[self.n_test:self.n_test + self.n_train] - yield ind_train, ind_test - - def __repr__(self): - return ('%s(%d, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.n, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - -def _validate_shuffle_split(n, test_size, train_size): - if test_size is None and train_size is None: - raise ValueError( - 'test_size and train_size can not both be None') - - if test_size is not None: - if np.asarray(test_size).dtype.kind == 'f': - if test_size >= 1.: - raise ValueError( - 'test_size=%f should be smaller ' - 'than 1.0 or be an integer' % test_size) - elif np.asarray(test_size).dtype.kind == 'i': - if test_size >= n: - raise ValueError( - 'test_size=%d should be smaller ' - 'than the number of samples %d' % (test_size, n)) - else: - raise ValueError("Invalid value for test_size: %r" % test_size) - - if train_size is not None: - if np.asarray(train_size).dtype.kind == 'f': - if train_size >= 1.: - raise ValueError("train_size=%f should be smaller " - "than 1.0 or be an integer" % train_size) - elif np.asarray(test_size).dtype.kind == 'f' and \ - train_size + test_size > 1.: - raise ValueError('The sum of test_size and train_size = %f, ' - 'should be smaller than 1.0. Reduce ' - 'test_size and/or train_size.' 
% - (train_size + test_size)) - elif np.asarray(train_size).dtype.kind == 'i': - if train_size >= n: - raise ValueError("train_size=%d should be smaller " - "than the number of samples %d" % - (train_size, n)) - else: - raise ValueError("Invalid value for train_size: %r" % train_size) - - if np.asarray(test_size).dtype.kind == 'f': - n_test = ceil(test_size * n) - elif np.asarray(test_size).dtype.kind == 'i': - n_test = float(test_size) - - if train_size is None: - n_train = n - n_test - else: - if np.asarray(train_size).dtype.kind == 'f': - n_train = floor(train_size * n) - else: - n_train = float(train_size) - - if test_size is None: - n_test = n - n_train - - if n_train + n_test > n: - raise ValueError('The sum of train_size and test_size = %d, ' - 'should be smaller than the number of ' - 'samples %d. Reduce test_size and/or ' - 'train_size.' % (n_train + n_test, n)) - - return int(n_train), int(n_test) - - -class StratifiedShuffleSplit(BaseShuffleSplit): - """Stratified ShuffleSplit cross validation iterator - - Provides train/test indices to split data in train test sets. - - This cross-validation object is a merge of StratifiedKFold and - ShuffleSplit, which returns stratified randomized folds. The folds - are made by preserving the percentage of samples for each class. - - Note: like the ShuffleSplit strategy, stratified random splits - do not guarantee that all folds will be different, although this is - still very likely for sizeable datasets. - - Parameters - ---------- - y : array, [n_samples] - Labels of samples. - - n_iter : int (default 10) - Number of re-shuffling & splitting iterations. - - test_size : float (default 0.1), int, or None - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int or RandomState - Pseudo-random number generator state used for random sampling. - - Examples - -------- - >>> from sklearn.cross_validation import StratifiedShuffleSplit - >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) - >>> y = np.array([0, 0, 1, 1]) - >>> sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0) - >>> len(sss) - 3 - >>> print(sss) # doctest: +ELLIPSIS - StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, ...) - >>> for train_index, test_index in sss: - ... print("TRAIN:", train_index, "TEST:", test_index) - ... X_train, X_test = X[train_index], X[test_index] - ... y_train, y_test = y[train_index], y[test_index] - TRAIN: [1 2] TEST: [3 0] - TRAIN: [0 2] TEST: [1 3] - TRAIN: [0 2] TEST: [3 1] - """ - - def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, - indices=None, random_state=None, n_iterations=None): - - super(StratifiedShuffleSplit, self).__init__( - len(y), n_iter, test_size, train_size, indices, random_state, - n_iterations) - self.y = np.array(y) - self.classes, self.y_indices = np.unique(y, return_inverse=True) - n_cls = self.classes.shape[0] - - if np.min(np.bincount(self.y_indices)) < 2: - raise ValueError("The least populated class in y has only 1" - " member, which is too few. 
The minimum" - " number of labels for any class cannot" - " be less than 2.") - - if self.n_train < n_cls: - raise ValueError('The train_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (self.n_train, n_cls)) - if self.n_test < n_cls: - raise ValueError('The test_size = %d should be greater or ' - 'equal to the number of classes = %d' % - (self.n_test, n_cls)) - - def _iter_indices(self): - rng = check_random_state(self.random_state) - cls_count = np.bincount(self.y_indices) - p_i = cls_count / float(self.n) - n_i = np.round(self.n_train * p_i).astype(int) - t_i = np.minimum(cls_count - n_i, - np.round(self.n_test * p_i).astype(int)) - - for n in range(self.n_iter): - train = [] - test = [] - - for i, cls in enumerate(self.classes): - permutation = rng.permutation(cls_count[i]) - cls_i = np.where((self.y == cls))[0][permutation] - - train.extend(cls_i[:n_i[i]]) - test.extend(cls_i[n_i[i]:n_i[i] + t_i[i]]) - - # Because of rounding issues (as n_train and n_test are not - # dividers of the number of elements per class), we may end - # up here with less samples in train and test than asked for. - if len(train) < self.n_train or len(test) < self.n_test: - # We complete by affecting randomly the missing indexes - missing_idx = np.where(np.bincount(train + test, - minlength=len(self.y)) == 0, - )[0] - missing_idx = rng.permutation(missing_idx) - train.extend(missing_idx[:(self.n_train - len(train))]) - test.extend(missing_idx[-(self.n_test - len(test)):]) - - train = rng.permutation(train) - test = rng.permutation(test) - - yield train, test - - def __repr__(self): - return ('%s(labels=%s, n_iter=%d, test_size=%s, ' - 'random_state=%s)' % ( - self.__class__.__name__, - self.y, - self.n_iter, - str(self.test_size), - self.random_state, - )) - - def __len__(self): - return self.n_iter - - ############################################################################## -def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, - verbose=0, fit_params=None, score_func=None, - pre_dispatch='2*n_jobs'): - """Evaluate a score by cross-validation - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like - The data to fit. Can be, for example a list, or an array at least 2d. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - cv : cross-validation generator, optional, default: None - A cross-validation generator. If None, a 3-fold cross - validation is used or 3-fold stratified cross-validation - when y is supplied and estimator is a classifier. - - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - verbose : integer, optional - The verbosity level. - - fit_params : dict, optional - Parameters to pass to the fit method of the estimator. - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. 
Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - Returns - ------- - scores : array of float, shape=(len(list(cv)),) - Array of scores of the estimator for each run of the cross validation. - """ - X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True, - allow_nans=True, allow_nd=True) - if y is not None: - y = np.asarray(y) - - cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, score_func=score_func, scoring=scoring) - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - parallel = Parallel(n_jobs=n_jobs, verbose=verbose, - pre_dispatch=pre_dispatch) - scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, - train, test, verbose, None, - fit_params) - for train, test in cv) - return np.array(scores)[:, 0] - - -def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, - fit_params, return_train_score=False, - return_parameters=False): - """Fit estimator and compute scores for a given dataset split. - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like, optional, default: None - The target variable to try to predict in the case of - supervised learning. - - scoring : callable - A scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - train : array-like, shape = (n_train_samples,) - Indices of training samples. - - test : array-like, shape = (n_test_samples,) - Indices of test samples. - - verbose : integer - The verbosity level. - - parameters : dict or None - Parameters to be set on the estimator. - - fit_params : dict or None - Parameters that will be passed to ``estimator.fit``. - - return_train_score : boolean, optional, default: False - Compute and return score on training set. - - return_parameters : boolean, optional, default: False - Return parameters that has been used for the estimator. - - Returns - ------- - train_score : float, optional - Score on training set, returned only if `return_train_score` is `True`. - - test_score : float - Score on test set. - - n_test_samples : int - Number of test samples. - - scoring_time : float - Time spent for fitting and scoring in seconds. - - parameters : dict or None, optional - The parameters that have been evaluated. 
- """ - if verbose > 1: - if parameters is None: - msg = "no parameters to be set" - else: - msg = '%s' % (', '.join('%s=%s' % (k, v) - for k, v in parameters.items())) - print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) - - # Adjust lenght of sample weights - n_samples = _num_samples(X) - fit_params = fit_params if fit_params is not None else {} - fit_params = dict([(k, np.asarray(v)[train] - if hasattr(v, '__len__') and len(v) == n_samples else v) - for k, v in fit_params.items()]) - - if parameters is not None: - estimator.set_params(**parameters) - - start_time = time.time() - - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, y_test = _safe_split(estimator, X, y, test, train) - if y_train is None: - estimator.fit(X_train, **fit_params) - else: - estimator.fit(X_train, y_train, **fit_params) - test_score = _score(estimator, X_test, y_test, scorer) - if return_train_score: - train_score = _score(estimator, X_train, y_train, scorer) - - scoring_time = time.time() - start_time - - if verbose > 2: - msg += ", score=%f" % test_score - if verbose > 1: - end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) - print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) - - ret = [train_score] if return_train_score else [] - ret.extend([test_score, _num_samples(X_test), scoring_time]) - if return_parameters: - ret.append(parameters) - return ret - - -def _safe_split(estimator, X, y, indices, train_indices=None): - """Create subset of dataset and properly handle kernels.""" - if hasattr(estimator, 'kernel') and callable(estimator.kernel): - # cannot compute the kernel values with custom function - raise ValueError("Cannot use a custom kernel function. " - "Precompute the kernel matrix instead.") - - if not hasattr(X, "shape"): - if getattr(estimator, "_pairwise", False): - raise ValueError("Precomputed kernels or affinity matrices have " - "to be passed as arrays or sparse matrices.") - X_subset = [X[idx] for idx in indices] - else: - if getattr(estimator, "_pairwise", False): - # X is a precomputed square kernel matrix - if X.shape[0] != X.shape[1]: - raise ValueError("X should be a square kernel matrix") - if train_indices is None: - X_subset = X[np.ix_(indices, indices)] - else: - X_subset = X[np.ix_(indices, train_indices)] - else: - X_subset = X[safe_mask(X, indices)] - - if y is not None: - y_subset = y[safe_mask(y, indices)] - else: - y_subset = None - - return X_subset, y_subset - - -def _score(estimator, X_test, y_test, scorer): - """Compute the score of an estimator on a given test set.""" - if y_test is None: - score = scorer(estimator, X_test) - else: - score = scorer(estimator, X_test, y_test) - if not isinstance(score, numbers.Number): - raise ValueError("scoring must return a number, got %s (%s) instead." 
- % (str(score), type(score))) - return score - - -def _permutation_test_score(estimator, X, y, cv, scorer): - """Auxiliary function for permutation_test_score""" - avg_score = [] - for train, test in cv: - estimator.fit(X[train], y[train]) - avg_score.append(scorer(estimator, X[test], y[test])) - return np.mean(avg_score) - - -def _shuffle(y, labels, random_state): - """Return a shuffled copy of y eventually shuffle among same labels.""" - if labels is None: - ind = random_state.permutation(len(y)) - else: - ind = np.arange(len(labels)) - for label in np.unique(labels): - this_mask = (labels == label) - ind[this_mask] = random_state.permutation(ind[this_mask]) - return y[ind] - - -def check_cv(cv, X=None, y=None, classifier=False): - """Input checker utility for building a CV in a user friendly way. - - Parameters - ---------- - cv : int, a cv generator instance, or None - The input specifying which cv generator to use. It can be an - integer, in which case it is the number of folds in a KFold, - None, in which case 3 fold is used, or another object, that - will then be used as a cv generator. - - X : array-like - The data the cross-val object will be applied on. - - y : array-like - The target variable for a supervised learning problem. - - classifier : boolean optional - Whether the task is a classification task, in which case - stratified KFold will be used. - - Returns - ------- - checked_cv: a cross-validation generator instance. - The return value is guaranteed to be a cv generator instance, whatever - the input type. - """ - return _check_cv(cv, X=X, y=y, classifier=classifier, warn_mask=True) - - -def _check_cv(cv, X=None, y=None, classifier=False, warn_mask=False): - # This exists for internal use while indices is being deprecated. - is_sparse = sp.issparse(X) - needs_indices = is_sparse or not hasattr(X, "shape") - if cv is None: - cv = 3 - if isinstance(cv, numbers.Integral): - if warn_mask and not needs_indices: - warnings.warn('check_cv will return indices instead of boolean ' - 'masks from 0.17', DeprecationWarning) - else: - needs_indices = None - if classifier: - cv = StratifiedKFold(y, cv, indices=needs_indices) - else: - if not is_sparse: - n_samples = len(X) - else: - n_samples = X.shape[0] - cv = KFold(n_samples, cv, indices=needs_indices) - if needs_indices and not getattr(cv, "_indices", True): - raise ValueError("Sparse data and lists require indices-based cross" - " validation generator, got: %r", cv) - return cv - - -def permutation_test_score(estimator, X, y, score_func=None, cv=None, - n_permutations=100, n_jobs=1, labels=None, - random_state=0, verbose=0, scoring=None): - """Evaluate the significance of a cross-validated score with permutations - - Parameters - ---------- - estimator : estimator object implementing 'fit' - The object to use to fit the data. - - X : array-like of shape at least 2D - The data to fit. - - y : array-like - The target variable to try to predict in the case of - supervised learning. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - cv : integer or cross-validation generator, optional - If an integer is passed, it is the number of fold (default 3). - Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects. - - n_permutations : integer, optional - Number of times to permute ``y``. 
- - n_jobs : integer, optional - The number of CPUs to use to do the computation. -1 means - 'all CPUs'. - - labels : array-like of shape [n_samples] (optional) - Labels constrain the permutation among groups of samples with - a same label. - - random_state : RandomState or an int seed (0 by default) - A random number generator instance to define the state of the - random permutations generator. - - verbose : integer, optional - The verbosity level. - - Returns - ------- - score : float - The true score without permuting targets. - - permutation_scores : array, shape = [n_permutations] - The scores obtained for each permutations. - - pvalue : float - The returned value equals p-value if `score_func` returns bigger - numbers for better scores (e.g., accuracy_score). If `score_func` is - rather a loss function (i.e. when lower is better such as with - `mean_squared_error`) then this is actually the complement of the - p-value: 1 - p-value. - - Notes - ----- - This function implements Test 1 in: - - Ojala and Garriga. Permutation Tests for Studying Classifier - Performance. The Journal of Machine Learning Research (2010) - vol. 11 - - """ - X, y = check_arrays(X, y, sparse_format='csr', allow_nans=True) - cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring, score_func=score_func) - random_state = check_random_state(random_state) - - # We clone the estimator to make sure that all the folds are - # independent, and that it is pickle-able. - score = _permutation_test_score(clone(estimator), X, y, cv, scorer) - permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)( - delayed(_permutation_test_score)( - clone(estimator), X, _shuffle(y, labels, random_state), cv, - scorer) - for _ in range(n_permutations)) - permutation_scores = np.array(permutation_scores) - pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1) - return score, permutation_scores, pvalue - - -permutation_test_score.__test__ = False # to avoid a pb with nosetests - - -def train_test_split(*arrays, **options): - """Split arrays or matrices into random train and test subsets - - Quick utility that wraps calls to ``check_arrays`` and - ``next(iter(ShuffleSplit(n_samples)))`` and application to input - data into a single call for splitting (and optionally subsampling) - data in a oneliner. - - Parameters - ---------- - *arrays : sequence of arrays or scipy.sparse matrices with same shape[0] - Python lists or tuples occurring in arrays are converted to 1D numpy - arrays. - - test_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the test split. If - int, represents the absolute number of test samples. If None, - the value is automatically set to the complement of the train size. - If train size is also None, test size is set to 0.25. - - train_size : float, int, or None (default is None) - If float, should be between 0.0 and 1.0 and represent the - proportion of the dataset to include in the train split. If - int, represents the absolute number of train samples. If None, - the value is automatically set to the complement of the test size. - - random_state : int or RandomState - Pseudo-random number generator state used for random sampling. - - dtype : a numpy dtype instance, None by default - Enforce a specific dtype. - - Returns - ------- - splitting : list of arrays, length=2 * len(arrays) - List containing train-test split of input array. 
- - Examples - -------- - >>> import numpy as np - >>> from sklearn.cross_validation import train_test_split - >>> a, b = np.arange(10).reshape((5, 2)), range(5) - >>> a - array([[0, 1], - [2, 3], - [4, 5], - [6, 7], - [8, 9]]) - >>> list(b) - [0, 1, 2, 3, 4] - - >>> a_train, a_test, b_train, b_test = train_test_split( - ... a, b, test_size=0.33, random_state=42) - ... - >>> a_train - array([[4, 5], - [0, 1], - [6, 7]]) - >>> b_train - array([2, 0, 3]) - >>> a_test - array([[2, 3], - [8, 9]]) - >>> b_test - array([1, 4]) - - """ - n_arrays = len(arrays) - if n_arrays == 0: - raise ValueError("At least one array required as input") - - test_size = options.pop('test_size', None) - train_size = options.pop('train_size', None) - random_state = options.pop('random_state', None) - options['sparse_format'] = 'csr' - options['allow_nans'] = True - - if test_size is None and train_size is None: - test_size = 0.25 - - arrays = check_arrays(*arrays, **options) - n_samples = arrays[0].shape[0] - cv = ShuffleSplit(n_samples, test_size=test_size, - train_size=train_size, - random_state=random_state) - - train, test = next(iter(cv)) - return list(chain.from_iterable((a[train], a[test]) for a in arrays)) - - -train_test_split.__test__ = False # to avoid a pb with nosetests diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index be5cdaec5ecdc..b07a7c32350ef 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -12,8 +12,8 @@ from ..base import MetaEstimatorMixin from ..base import clone from ..base import is_classifier -from ..cross_validation import _check_cv as check_cv -from ..cross_validation import _safe_split, _score +from ..model_selection.partition import _check_cv as check_cv +from ..model_selection.validate import _safe_split, _score from .base import SelectorMixin from ..metrics.scorer import check_scoring diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 753803509b230..bc425750d5c53 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -10,741 +10,12 @@ # Olivier Grisel # License: BSD 3 clause -from abc import ABCMeta, abstractmethod -from collections import Mapping, namedtuple, Sized -from functools import partial, reduce -from itertools import product -import operator - -import numpy as np - -from .base import BaseEstimator, is_classifier, clone -from .base import MetaEstimatorMixin -from .cross_validation import _check_cv as check_cv -from .cross_validation import _fit_and_score -from .externals.joblib import Parallel, delayed -from .externals import six -from .utils import check_random_state -from .utils.validation import _num_samples, check_arrays -from .metrics.scorer import check_scoring +#TODO: add deprecation warning to this module +from .model_selection.search import GridSearchCV, RandomizedSearchCV +from .model_selection.utils import ParameterGrid, ParameterSampler, \ + fit_grid_point __all__ = ['GridSearchCV', 'ParameterGrid', 'fit_grid_point', 'ParameterSampler', 'RandomizedSearchCV'] - -class ParameterGrid(object): - """Grid of parameters with a discrete number of values for each. - - Can be used to iterate over parameter value combinations with the - Python built-in function iter. - - Parameters - ---------- - param_grid : dict of string to sequence, or sequence of such - The parameter grid to explore, as a dictionary mapping estimator - parameters to sequences of allowed values. - - An empty dict signifies default parameters. 
- - A sequence of dicts signifies a sequence of grids to search, and is - useful to avoid exploring parameter combinations that make no sense - or have no effect. See the examples below. - - Examples - -------- - >>> from sklearn.grid_search import ParameterGrid - >>> param_grid = {'a': [1, 2], 'b': [True, False]} - >>> list(ParameterGrid(param_grid)) == ( - ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, - ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) - True - - >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] - >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, - ... {'kernel': 'rbf', 'gamma': 1}, - ... {'kernel': 'rbf', 'gamma': 10}] - True - - See also - -------- - :class:`GridSearchCV`: - uses ``ParameterGrid`` to perform a full parallelized parameter search. - """ - - def __init__(self, param_grid): - if isinstance(param_grid, Mapping): - # wrap dictionary in a singleton list to support either dict - # or list of dicts - param_grid = [param_grid] - self.param_grid = param_grid - - def __iter__(self): - """Iterate over the points in the grid. - - Returns - ------- - params : iterator over dict of string to any - Yields dictionaries mapping each estimator parameter to one of its - allowed values. - """ - for p in self.param_grid: - # Always sort the keys of a dictionary, for reproducibility - items = sorted(p.items()) - if not items: - yield {} - else: - keys, values = zip(*items) - for v in product(*values): - params = dict(zip(keys, v)) - yield params - - def __len__(self): - """Number of points on the grid.""" - # Product function that can handle iterables (np.product can't). - product = partial(reduce, operator.mul) - return sum(product(len(v) for v in p.values()) if p else 1 - for p in self.param_grid) - - -class ParameterSampler(object): - """Generator on parameters sampled from given distributions. - - Non-deterministic iterable over random candidate combinations for hyper- - parameter search. - - Note that as of SciPy 0.12, the ``scipy.stats.distributions`` do not accept - a custom RNG instance and always use the singleton RNG from - ``numpy.random``. Hence setting ``random_state`` will not guarantee a - deterministic iteration whenever ``scipy.stats`` distributions are used to - define the parameter search space. - - Parameters - ---------- - param_distributions : dict - Dictionary where the keys are parameters and values - are distributions from which a parameter is to be sampled. - Distributions either have to provide a ``rvs`` function - to sample from them, or can be given as a list of values, - where a uniform distribution is assumed. - - n_iter : integer - Number of parameter settings that are produced. - - random_state : int or RandomState - Pseudo random number generator state used for random uniform sampling - from lists of possible values instead of scipy.stats distributions. - - Returns - ------- - params : dict of string to any - **Yields** dictionaries mapping each estimator parameter to - as sampled value. - - Examples - -------- - >>> from sklearn.grid_search import ParameterSampler - >>> from scipy.stats.distributions import expon - >>> import numpy as np - >>> np.random.seed(0) - >>> param_grid = {'a':[1, 2], 'b': expon()} - >>> param_list = list(ParameterSampler(param_grid, n_iter=4)) - >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items()) - ... for d in param_list] - >>> rounded_list == [{'b': 0.89856, 'a': 1}, - ... {'b': 0.923223, 'a': 1}, - ... {'b': 1.878964, 'a': 2}, - ... 
{'b': 1.038159, 'a': 2}] - True - """ - def __init__(self, param_distributions, n_iter, random_state=None): - self.param_distributions = param_distributions - self.n_iter = n_iter - self.random_state = random_state - - def __iter__(self): - rnd = check_random_state(self.random_state) - # Always sort the keys of a dictionary, for reproducibility - items = sorted(self.param_distributions.items()) - for _ in range(self.n_iter): - params = dict() - for k, v in items: - if hasattr(v, "rvs"): - params[k] = v.rvs() - else: - params[k] = v[rnd.randint(len(v))] - yield params - - def __len__(self): - """Number of points that will be sampled.""" - return self.n_iter - - -def fit_grid_point(X, y, estimator, parameters, train, test, scorer, - verbose, **fit_params): - """Run fit on one set of parameters. - - Parameters - ---------- - X : array-like, sparse matrix or list - Input data. - - y : array-like or None - Targets for input data. - - estimator : estimator object - This estimator will be cloned and then fitted. - - parameters : dict - Parameters to be set on estimator for this grid point. - - train : ndarray, dtype int or bool - Boolean mask or indices for training set. - - test : ndarray, dtype int or bool - Boolean mask or indices for test set. - - scorer : callable or None. - If provided must be a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - verbose : int - Verbosity level. - - **fit_params : kwargs - Additional parameter passed to the fit function of the estimator. - - - Returns - ------- - score : float - Score of this parameter setting on given training / test split. - - parameters : dict - The parameters that have been evaluated. - - n_samples_test : int - Number of test samples in this split. - """ - score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train, - test, verbose, parameters, - fit_params) - return score, parameters, n_samples_test - - -def _check_param_grid(param_grid): - if hasattr(param_grid, 'items'): - param_grid = [param_grid] - - for p in param_grid: - for v in p.values(): - if isinstance(v, np.ndarray) and v.ndim > 1: - raise ValueError("Parameter array should be one-dimensional.") - - check = [isinstance(v, k) for k in (list, tuple, np.ndarray)] - if not True in check: - raise ValueError("Parameter values should be a list.") - - if len(v) == 0: - raise ValueError("Parameter values should be a non-empty " - "list.") - - -class _CVScoreTuple (namedtuple('_CVScoreTuple', - ('parameters', - 'mean_validation_score', - 'cv_validation_scores'))): - # A raw namedtuple is very memory efficient as it packs the attributes - # in a struct to get rid of the __dict__ of attributes in particular it - # does not copy the string for the keys on each instance. - # By deriving a namedtuple class just to introduce the __repr__ method we - # would also reintroduce the __dict__ on the instance. By telling the - # Python interpreter that this subclass uses static __slots__ instead of - # dynamic attributes. Furthermore we don't need any additional slot in the - # subclass so we set __slots__ to the empty tuple. 
- __slots__ = () - - def __repr__(self): - """Simple custom repr to summarize the main info""" - return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format( - self.mean_validation_score, - np.std(self.cv_validation_scores), - self.parameters) - - -class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, - MetaEstimatorMixin)): - """Base class for hyper parameter search with cross-validation.""" - - @abstractmethod - def __init__(self, estimator, scoring=None, loss_func=None, - score_func=None, fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): - - self.scoring = scoring - self.estimator = estimator - self.loss_func = loss_func - self.score_func = score_func - self.n_jobs = n_jobs - self.fit_params = fit_params if fit_params is not None else {} - self.iid = iid - self.refit = refit - self.cv = cv - self.verbose = verbose - self.pre_dispatch = pre_dispatch - - def score(self, X, y=None): - """Returns the score on the given test data and labels, if the search - estimator has been refit. The ``score`` function of the best estimator - is used, or the ``scoring`` parameter where unavailable. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Input data, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - Returns - ------- - score : float - - """ - if hasattr(self.best_estimator_, 'score'): - return self.best_estimator_.score(X, y) - if self.scorer_ is None: - raise ValueError("No score function explicitly defined, " - "and the estimator doesn't provide one %s" - % self.best_estimator_) - return self.scorer_(self.best_estimator_, X, y) - - @property - def predict(self): - return self.best_estimator_.predict - - @property - def predict_proba(self): - return self.best_estimator_.predict_proba - - @property - def decision_function(self): - return self.best_estimator_.decision_function - - @property - def transform(self): - return self.best_estimator_.transform - - def _fit(self, X, y, parameter_iterable): - """Actual fitting, performing the search over parameters.""" - - estimator = self.estimator - cv = self.cv - self.scorer_ = check_scoring(self.estimator, scoring=self.scoring, - loss_func=self.loss_func, - score_func=self.score_func) - - n_samples = _num_samples(X) - X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr', - allow_nans=True) - - if y is not None: - if len(y) != n_samples: - raise ValueError('Target variable (y) has a different number ' - 'of samples (%i) than data (X: %i samples)' - % (len(y), n_samples)) - y = np.asarray(y) - cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) - - if self.verbose > 0: - if isinstance(parameter_iterable, Sized): - n_candidates = len(parameter_iterable) - print("Fitting {0} folds for each of {1} candidates, totalling" - " {2} fits".format(len(cv), n_candidates, - n_candidates * len(cv))) - - base_estimator = clone(self.estimator) - - pre_dispatch = self.pre_dispatch - - out = Parallel( - n_jobs=self.n_jobs, verbose=self.verbose, - pre_dispatch=pre_dispatch - )( - delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, - train, test, self.verbose, parameters, - self.fit_params, return_parameters=True) - for parameters in parameter_iterable - for train, test in cv) - - # Out is a list of triplet: score, estimator, n_test_samples - n_fits = 
len(out) - n_folds = len(cv) - - scores = list() - grid_scores = list() - for grid_start in range(0, n_fits, n_folds): - n_test_samples = 0 - score = 0 - all_scores = [] - for this_score, this_n_test_samples, _, parameters in \ - out[grid_start:grid_start + n_folds]: - all_scores.append(this_score) - if self.iid: - this_score *= this_n_test_samples - n_test_samples += this_n_test_samples - score += this_score - if self.iid: - score /= float(n_test_samples) - else: - score /= float(n_folds) - scores.append((score, parameters)) - # TODO: shall we also store the test_fold_sizes? - grid_scores.append(_CVScoreTuple( - parameters, - score, - np.array(all_scores))) - # Store the computed scores - self.grid_scores_ = grid_scores - - # Find the best parameters by comparing on the mean validation score: - # note that `sorted` is deterministic in the way it breaks ties - best = sorted(grid_scores, key=lambda x: x.mean_validation_score, - reverse=True)[0] - self.best_params_ = best.parameters - self.best_score_ = best.mean_validation_score - - if self.refit: - # fit the best estimator using the entire dataset - # clone first to work around broken estimators - best_estimator = clone(base_estimator).set_params( - **best.parameters) - if y is not None: - best_estimator.fit(X, y, **self.fit_params) - else: - best_estimator.fit(X, **self.fit_params) - self.best_estimator_ = best_estimator - return self - - -class GridSearchCV(BaseSearchCV): - """Exhaustive search over specified parameter values for an estimator. - - Important members are fit, predict. - - GridSearchCV implements a "fit" method and a "predict" method like - any classifier except that the parameters of the classifier - used to predict is optimized by cross-validation. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - A object of that type is instantiated for each grid point. - - param_grid : dict or list of dictionaries - Dictionary with parameters names (string) as keys and lists of - parameter settings to try as values, or a list of such - dictionaries, in which case the grids spanned by each dictionary - in the list are explored. This enables searching over any sequence - of parameter settings. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - fit_params : dict, optional - Parameters to pass to the fit method. - - n_jobs : int, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - iid : boolean, optional - If True, the data is assumed to be identically distributed across - the folds, and the loss minimized is the total loss per sample, - and not the mean loss across the folds. - - cv : integer or cross-validation generator, optional - If an integer is passed, it is the number of folds (default 3). 
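A worked illustration of the aggregation loop in ``_fit`` above, with hypothetical per-fold numbers: when ``iid=True`` each fold score is weighted by its number of test samples, otherwise the plain mean over folds is used.

import numpy as np

fold_scores = np.array([0.80, 0.90, 0.70])  # hypothetical per-fold validation scores
fold_sizes = np.array([40, 40, 20])         # hypothetical numbers of test samples per fold

score_iid = np.sum(fold_scores * fold_sizes) / fold_sizes.sum()  # iid=True  -> 0.82
score_avg = fold_scores.mean()                                   # iid=False -> 0.80
print(score_iid, score_avg)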
- Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects - - refit : boolean - Refit the best estimator with the entire dataset. - If "False", it is impossible to make predictions using - this GridSearchCV instance after fitting. - - verbose : integer - Controls the verbosity: the higher, the more messages. - - Examples - -------- - >>> from sklearn import svm, grid_search, datasets - >>> iris = datasets.load_iris() - >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} - >>> svr = svm.SVC() - >>> clf = grid_search.GridSearchCV(svr, parameters) - >>> clf.fit(iris.data, iris.target) - ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS - GridSearchCV(cv=None, - estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., - degree=..., gamma=..., kernel='rbf', max_iter=-1, - probability=False, random_state=None, shrinking=True, - tol=..., verbose=False), - fit_params={}, iid=..., loss_func=..., n_jobs=1, - param_grid=..., pre_dispatch=..., refit=..., score_func=..., - scoring=..., verbose=...) - - - Attributes - ---------- - `grid_scores_` : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. - Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold - - `best_estimator_` : estimator - Estimator that was chosen by the search, i.e. estimator - which gave highest score (or smallest loss if specified) - on the left out data. - - `best_score_` : float - Score of best_estimator on the left out data. - - `best_params_` : dict - Parameter setting that gave the best results on the hold out data. - - `scorer_` : function - Scorer function used on the held out data to choose the best - parameters for the model. - - Notes - ------ - The parameters selected are those that maximize the score of the left out - data, unless an explicit score is passed in which case it is used instead. - - If `n_jobs` was set to a value higher than one, the data is copied for each - point in the grid (and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - --------- - :class:`ParameterGrid`: - generates all the combinations of a an hyperparameter grid. - - :func:`sklearn.cross_validation.train_test_split`: - utility function to split the data into a development set usable - for fitting a GridSearchCV instance and an evaluation set for - its final evaluation. - - :func:`sklearn.metrics.make_scorer`: - Make a scorer from a performance metric or loss function. - - """ - - def __init__(self, estimator, param_grid, scoring=None, loss_func=None, - score_func=None, fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): - super(GridSearchCV, self).__init__( - estimator, scoring, loss_func, score_func, fit_params, n_jobs, iid, - refit, cv, verbose, pre_dispatch) - self.param_grid = param_grid - _check_param_grid(param_grid) - - def fit(self, X, y=None): - """Run fit with all sets of parameters. 
- - Parameters - ---------- - - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - """ - return self._fit(X, y, ParameterGrid(self.param_grid)) - - -class RandomizedSearchCV(BaseSearchCV): - """Randomized search on hyper parameters. - - RandomizedSearchCV implements a "fit" method and a "predict" method like - any classifier except that the parameters of the classifier - used to predict is optimized by cross-validation. - - In contrast to GridSearchCV, not all parameter values are tried out, but - rather a fixed number of parameter settings is sampled from the specified - distributions. The number of parameter settings that are tried is - given by n_iter. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - A object of that type is instantiated for each parameter setting. - - param_distributions : dict - Dictionary with parameters names (string) as keys and distributions - or lists of parameters to try. Distributions must provide a ``rvs`` - method for sampling (such as those from scipy.stats.distributions). - If a list is given, it is sampled uniformly. - - n_iter : int, default=10 - Number of parameter settings that are sampled. n_iter trades - off runtime vs quality of the solution. - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - fit_params : dict, optional - Parameters to pass to the fit method. - - n_jobs : int, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : int, or string, optional - Controls the number of jobs that get dispatched during parallel - execution. Reducing this number can be useful to avoid an - explosion of memory consumption when more jobs get dispatched - than CPUs can process. This parameter can be: - - - None, in which case all the jobs are immediately - created and spawned. Use this for lightweight and - fast-running jobs, to avoid delays due to on-demand - spawning of the jobs - - - An int, giving the exact number of total jobs that are - spawned - - - A string, giving an expression as a function of n_jobs, - as in '2*n_jobs' - - iid : boolean, optional - If True, the data is assumed to be identically distributed across - the folds, and the loss minimized is the total loss per sample, - and not the mean loss across the folds. - - cv : integer or cross-validation generator, optional - If an integer is passed, it is the number of folds (default 3). - Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects - - refit : boolean - Refit the best estimator with the entire dataset. - If "False", it is impossible to make predictions using - this RandomizedSearchCV instance after fitting. - - verbose : integer - Controls the verbosity: the higher, the more messages. - - - Attributes - ---------- - `grid_scores_` : list of named tuples - Contains scores for all parameter combinations in param_grid. - Each entry corresponds to one parameter setting. 
- Each named tuple has the attributes: - - * ``parameters``, a dict of parameter settings - * ``mean_validation_score``, the mean score over the - cross-validation folds - * ``cv_validation_scores``, the list of scores for each fold - - `best_estimator_` : estimator - Estimator that was chosen by the search, i.e. estimator - which gave highest score (or smallest loss if specified) - on the left out data. - - `best_score_` : float - Score of best_estimator on the left out data. - - `best_params_` : dict - Parameter setting that gave the best results on the hold out data. - - Notes - ----- - The parameters selected are those that maximize the score of the held-out - data, according to the scoring parameter. - - If `n_jobs` was set to a value higher than one, the data is copied for each - parameter setting(and not `n_jobs` times). This is done for efficiency - reasons if individual jobs take very little time, but may raise errors if - the dataset is large and not enough memory is available. A workaround in - this case is to set `pre_dispatch`. Then, the memory is copied only - `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * - n_jobs`. - - See Also - -------- - :class:`GridSearchCV`: - Does exhaustive search over a grid of parameters. - - :class:`ParameterSampler`: - A generator over parameter settins, constructed from - param_distributions. - - """ - - def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, - fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, - verbose=0, pre_dispatch='2*n_jobs', random_state=None): - - self.param_distributions = param_distributions - self.n_iter = n_iter - self.random_state = random_state - super(RandomizedSearchCV, self).__init__( - estimator=estimator, scoring=scoring, fit_params=fit_params, - n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, - pre_dispatch=pre_dispatch) - - def fit(self, X, y=None): - """Run fit on the estimator with randomly drawn parameters. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vector, where n_samples in the number of samples and - n_features is the number of features. - - y : array-like, shape = [n_samples] or [n_samples, n_output], optional - Target relative to X for classification or regression; - None for unsupervised learning. - - """ - sampled_params = ParameterSampler(self.param_distributions, - self.n_iter, - random_state=self.random_state) - return self._fit(X, y, sampled_params) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 9debbe0776287..1ada1d5d3b257 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -4,303 +4,6 @@ # # License: BSD 3 clause -import warnings +from sklearn.model_selection.validate import learning_curve, validation_curve -import numpy as np - -from .base import is_classifier, clone -from .cross_validation import _check_cv -from .externals.joblib import Parallel, delayed -from .cross_validation import _safe_split, _score, _fit_and_score -from .metrics.scorer import check_scoring -from .utils import check_arrays -from .utils.fixes import astype - - -def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), - cv=None, scoring=None, exploit_incremental_learning=False, - n_jobs=1, pre_dispatch="all", verbose=0): - """Learning curve. - - Determines cross-validated training and test scores for different training - set sizes. - - A cross-validation generator splits the whole dataset k times in training - and test data. 
Subsets of the training set with varying sizes will be used - to train the estimator and a score for each training subset size and the - test set will be computed. Afterwards, the scores will be averaged over - all k runs for each training subset size. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. - - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape (n_samples) or (n_samples, n_features), optional - Target relative to X for classification or regression; - None for unsupervised learning. - - train_sizes : array-like, shape (n_ticks,), dtype float or int - Relative or absolute numbers of training examples that will be used to - generate the learning curve. If the dtype is float, it is regarded as a - fraction of the maximum size of the training set (that is determined - by the selected validation method), i.e. it has to be within (0, 1]. - Otherwise it is interpreted as absolute sizes of the training sets. - Note that for classification the number of samples usually have to - be big enough to contain at least one sample from each class. - (default: np.linspace(0.1, 1.0, 5)) - - cv : integer, cross-validation generator, optional - If an integer is passed, it is the number of folds (defaults to 3). - Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - exploit_incremental_learning : boolean, optional, default: False - If the estimator supports incremental learning, this will be - used to speed up fitting for different training set sizes. - - n_jobs : integer, optional - Number of jobs to run in parallel (default 1). - - pre_dispatch : integer or string, optional - Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can - be an expression like '2*n_jobs'. - - verbose : integer, optional - Controls the verbosity: the higher, the more messages. - - Returns - ------- - train_sizes_abs : array, shape = (n_unique_ticks,), dtype int - Numbers of training examples that has been used to generate the - learning curve. Note that the number of ticks might be less - than n_ticks because duplicate entries will be removed. - - train_scores : array, shape (n_ticks, n_cv_folds) - Scores on training sets. - - test_scores : array, shape (n_ticks, n_cv_folds) - Scores on test set. 
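A short usage sketch of ``learning_curve`` as documented above; the dataset and estimator are placeholders, and the import path is this codebase's ``sklearn.learning_curve`` module (re-exported from ``sklearn.model_selection`` by this patch).

import numpy as np
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from sklearn.learning_curve import learning_curve  # also sklearn.model_selection after this patch

digits = load_digits()
X, y = digits.data, digits.target

train_sizes, train_scores, test_scores = learning_curve(
    GaussianNB(), X, y, train_sizes=np.linspace(0.1, 1.0, 5), cv=5)

print(train_sizes)                 # absolute training set sizes actually used
print(train_scores.mean(axis=1))   # mean training score for each size
print(test_scores.mean(axis=1))    # mean cross-validation score for each size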
- - Notes - ----- - See :ref:`examples/plot_learning_curve.py ` - """ - if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): - raise ValueError("An estimator must support the partial_fit interface " - "to exploit incremental learning") - - X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) - # Make a list since we will be iterating multiple times over the folds - cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) - scorer = check_scoring(estimator, scoring=scoring) - - # HACK as long as boolean indices are allowed in cv generators - if cv[0][0].dtype == bool: - new_cv = [] - for i in range(len(cv)): - new_cv.append((np.nonzero(cv[i][0])[0], np.nonzero(cv[i][1])[0])) - cv = new_cv - - n_max_training_samples = len(cv[0][0]) - # Because the lengths of folds can be significantly different, it is - # not guaranteed that we use all of the available training data when we - # use the first 'n_max_training_samples' samples. - train_sizes_abs = _translate_train_sizes(train_sizes, - n_max_training_samples) - n_unique_ticks = train_sizes_abs.shape[0] - if verbose > 0: - print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) - - parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, - verbose=verbose) - if exploit_incremental_learning: - classes = np.unique(y) if is_classifier(estimator) else None - out = parallel(delayed(_incremental_fit_estimator)( - clone(estimator), X, y, classes, train, test, train_sizes_abs, - scorer, verbose) for train, test in cv) - else: - out = parallel(delayed(_fit_and_score)( - clone(estimator), X, y, scorer, train[:n_train_samples], test, - verbose, parameters=None, fit_params=None, return_train_score=True) - for train, test in cv for n_train_samples in train_sizes_abs) - out = np.array(out)[:, :2] - n_cv_folds = out.shape[0] // n_unique_ticks - out = out.reshape(n_cv_folds, n_unique_ticks, 2) - - out = np.asarray(out).transpose((2, 1, 0)) - - return train_sizes_abs, out[0], out[1] - - -def _translate_train_sizes(train_sizes, n_max_training_samples): - """Determine absolute sizes of training subsets and validate 'train_sizes'. - - Examples: - _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] - _translate_train_sizes([5, 10], 10) -> [5, 10] - - Parameters - ---------- - train_sizes : array-like, shape (n_ticks,), dtype float or int - Numbers of training examples that will be used to generate the - learning curve. If the dtype is float, it is regarded as a - fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. - - n_max_training_samples : int - Maximum number of training samples (upper bound of 'train_sizes'). - - Returns - ------- - train_sizes_abs : array, shape (n_unique_ticks,), dtype int - Numbers of training examples that will be used to generate the - learning curve. Note that the number of ticks might be less - than n_ticks because duplicate entries will be removed. - """ - train_sizes_abs = np.asarray(train_sizes) - n_ticks = train_sizes_abs.shape[0] - n_min_required_samples = np.min(train_sizes_abs) - n_max_required_samples = np.max(train_sizes_abs) - if np.issubdtype(train_sizes_abs.dtype, np.float): - if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: - raise ValueError("train_sizes has been interpreted as fractions " - "of the maximum number of training samples and " - "must be within (0, 1], but is within [%f, %f]." 
- % (n_min_required_samples, - n_max_required_samples)) - train_sizes_abs = astype(train_sizes_abs * n_max_training_samples, - dtype=np.int, copy=False) - train_sizes_abs = np.clip(train_sizes_abs, 1, - n_max_training_samples) - else: - if (n_min_required_samples <= 0 or - n_max_required_samples > n_max_training_samples): - raise ValueError("train_sizes has been interpreted as absolute " - "numbers of training samples and must be within " - "(0, %d], but is within [%d, %d]." - % (n_max_training_samples, - n_min_required_samples, - n_max_required_samples)) - - train_sizes_abs = np.unique(train_sizes_abs) - if n_ticks > train_sizes_abs.shape[0]: - warnings.warn("Removed duplicate entries from 'train_sizes'. Number " - "of ticks will be less than than the size of " - "'train_sizes' %d instead of %d)." - % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) - - return train_sizes_abs - - -def _incremental_fit_estimator(estimator, X, y, classes, train, test, - train_sizes, scorer, verbose): - """Train estimator on training subsets incrementally and compute scores.""" - train_scores, test_scores = [], [] - partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) - for n_train_samples, partial_train in partitions: - train_subset = train[:n_train_samples] - X_train, y_train = _safe_split(estimator, X, y, train_subset) - X_partial_train, y_partial_train = _safe_split(estimator, X, y, - partial_train) - X_test, y_test = _safe_split(estimator, X, y, test, train_subset) - if y_partial_train is None: - estimator.partial_fit(X_partial_train, classes=classes) - else: - estimator.partial_fit(X_partial_train, y_partial_train, - classes=classes) - train_scores.append(_score(estimator, X_train, y_train, scorer)) - test_scores.append(_score(estimator, X_test, y_test, scorer)) - return np.array((train_scores, test_scores)).T - - -def validation_curve(estimator, X, y, param_name, param_range, cv=None, - scoring=None, n_jobs=1, pre_dispatch="all", verbose=0): - """Validation curve. - - Determine training and test scores for varying parameter values. - - Compute scores for an estimator with different values of a specified - parameter. This is similar to grid search with one parameter. However, this - will also compute training scores and is merely a utility for plotting the - results. - - Parameters - ---------- - estimator : object type that implements the "fit" and "predict" methods - An object of that type which is cloned for each validation. - - X : array-like, shape (n_samples, n_features) - Training vector, where n_samples is the number of samples and - n_features is the number of features. - - y : array-like, shape (n_samples) or (n_samples, n_features), optional - Target relative to X for classification or regression; - None for unsupervised learning. - - param_name : string - Name of the parameter that will be varied. - - param_range : array-like, shape (n_values,) - The values of the parameter that will be evaluated. - - cv : integer, cross-validation generator, optional - If an integer is passed, it is the number of folds (defaults to 3). - Specific cross-validation objects can be passed, see - sklearn.cross_validation module for the list of possible objects - - scoring : string, callable or None, optional, default: None - A string (see model evaluation documentation) or - a scorer callable object / function with signature - ``scorer(estimator, X, y)``. - - n_jobs : integer, optional - Number of jobs to run in parallel (default 1). 
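A simplified sketch of what ``_translate_train_sizes`` (shown further above) does with the ``train_sizes`` argument; the sample counts are hypothetical, and the private helper itself additionally clips, de-duplicates and warns.

import numpy as np

n_max_training_samples = 80            # e.g. 100 samples under 5-fold CV
fractions = np.linspace(0.1, 1.0, 5)   # floats in (0, 1] are fractions of that maximum

absolute = (fractions * n_max_training_samples).astype(int)
print(absolute)                        # [ 8 26 44 62 80]

print(np.unique([10, 20, 20, 40]))     # integer ticks are kept as-is, duplicates dropped -> [10 20 40]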
- - pre_dispatch : integer or string, optional - Number of predispatched jobs for parallel execution (default is - all). The option can reduce the allocated memory. The string can - be an expression like '2*n_jobs'. - - verbose : integer, optional - Controls the verbosity: the higher, the more messages. - - Returns - ------- - train_scores : array, shape (n_ticks, n_cv_folds) - Scores on training sets. - - test_scores : array, shape (n_ticks, n_cv_folds) - Scores on test set. - - Notes - ----- - See - :ref:`examples/plot_validation_curve.py ` - """ - X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) - cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) - scorer = check_scoring(estimator, scoring=scoring) - - parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, - verbose=verbose) - out = parallel(delayed(_fit_and_score)( - estimator, X, y, scorer, train, test, verbose, - parameters={param_name: v}, fit_params=None, return_train_score=True) - for train, test in cv for v in param_range) - - out = np.asarray(out)[:, :2] - n_params = len(param_range) - n_cv_folds = out.shape[0] // n_params - out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0)) - - return out[0], out[1] +#TODO: issue warning when importing this module diff --git a/sklearn/linear_model/coordinate_descent.py b/sklearn/linear_model/coordinate_descent.py index 7009f4861b375..ad0e08edc5a9a 100644 --- a/sklearn/linear_model/coordinate_descent.py +++ b/sklearn/linear_model/coordinate_descent.py @@ -16,7 +16,7 @@ from ..base import RegressorMixin from .base import center_data, sparse_center_data from ..utils import array2d, atleast2d_or_csc -from ..cross_validation import _check_cv as check_cv +from ..model_selection.partition import _check_cv as check_cv from ..externals.joblib import Parallel, delayed from ..externals import six from ..externals.six.moves import xrange diff --git a/sklearn/linear_model/least_angle.py b/sklearn/linear_model/least_angle.py index 4d52580311db5..8f9909fde5420 100644 --- a/sklearn/linear_model/least_angle.py +++ b/sklearn/linear_model/least_angle.py @@ -22,7 +22,7 @@ from .base import LinearModel from ..base import RegressorMixin from ..utils import array2d, arrayfuncs, as_float_array, check_arrays -from ..cross_validation import _check_cv as check_cv +from ..model_selection.partition import _check_cv as check_cv from ..utils import ConvergenceWarning from ..externals.joblib import Parallel, delayed from ..externals.six.moves import xrange diff --git a/sklearn/linear_model/omp.py b/sklearn/linear_model/omp.py index 8b0cbf53e0905..043dd0a91a165 100644 --- a/sklearn/linear_model/omp.py +++ b/sklearn/linear_model/omp.py @@ -15,7 +15,7 @@ from .base import LinearModel, _pre_fit from ..base import RegressorMixin from ..utils import array2d, as_float_array, check_arrays -from ..cross_validation import _check_cv as check_cv +from ..model_selection.partition import _check_cv as check_cv from ..externals.joblib import Parallel, delayed import scipy diff --git a/sklearn/model_selection/__init__.py b/sklearn/model_selection/__init__.py new file mode 100644 index 0000000000000..c790b1667582f --- /dev/null +++ b/sklearn/model_selection/__init__.py @@ -0,0 +1,28 @@ +from .partition import LeaveOneOut, LeavePOut, KFold, \ + StratifiedKFold, LeaveOneLabelOut, LeavePLabelOut, \ + ShuffleSplit, StratifiedShuffleSplit, train_test_split, check_cv +from .validate import cross_val_score, permutation_test_score, \ + learning_curve, validation_curve +from .search import GridSearchCV, 
RandomizedSearchCV +from .utils import ParameterGrid, ParameterSampler, fit_grid_point + +__all__ = ['KFold', + 'LeaveOneLabelOut', + 'LeaveOneOut', + 'LeavePLabelOut', + 'LeavePOut', + 'ShuffleSplit', + 'StratifiedKFold', + 'StratifiedShuffleSplit', + 'check_cv', + 'cross_val_score', + 'permutation_test_score', + 'train_test_split', + 'learning_curve', + 'validation_curve', + 'GridSearchCV', + 'RandomizedSearchCV', + 'ParameterGrid', + 'ParameterSampler', + 'fit_grid_point', + ] diff --git a/sklearn/model_selection/partition.py b/sklearn/model_selection/partition.py new file mode 100644 index 0000000000000..510cc8ac7a7c5 --- /dev/null +++ b/sklearn/model_selection/partition.py @@ -0,0 +1,1108 @@ +""" +The :mod:`sklearn.model_selection.partition` module includes +""" +#TODO Complete docstring + +# Author: Alexandre Gramfort , +# Gael Varoquaux , +# Olivier Grisel +# License: BSD 3 clause + +from __future__ import print_function +from __future__ import division + +import warnings +from itertools import chain, combinations +from math import ceil, floor, factorial +import numbers +from abc import ABCMeta, abstractmethod + +import numpy as np +import scipy.sparse as sp + +from sklearn.utils import check_arrays, check_random_state +from sklearn.externals.six import with_metaclass +from sklearn.externals.six.moves import zip + + +class _PartitionIterator(with_metaclass(ABCMeta)): + """Base class for CV iterators where train_mask = ~test_mask + + Implementations must define `_iter_test_masks` or `_iter_test_indices`. + + Parameters + ---------- + n : int + Total number of elements in dataset. + """ + + def __init__(self, n=None, indices=None): + if indices is None: + indices = True + else: + warnings.warn("The indices parameter is deprecated and will be " + "removed (assumed True) in 0.17", DeprecationWarning, + stacklevel=1) + if n is not None: + warnings.warn("The n parameter is deprecated and will be " + "removed (use split method instead)", + DeprecationWarning, stacklevel=1) + if abs(n - int(n)) >= np.finfo('f').eps: + raise ValueError("n must be an integer") + n = int(n) + self.n = n + self._indices = indices + + @property + def indices(self): + warnings.warn("The indices attribute is deprecated and will be " + "removed (assumed True) in 0.17", DeprecationWarning, + stacklevel=1) + return self._indices + + def __iter__(self): + #TODO: deprecation warning + if self.n is None: + raise ValueError("Cannot iterate dataless CV iterator") + return self.split(None) + + def split(self, y): + self._pre_split_check(y) + indices = self._indices + if indices: + ind = np.arange(self._sample_size(y)) + for test_index in self._iter_test_masks(y): + train_index = np.logical_not(test_index) + if indices: + train_index = ind[train_index] + test_index = ind[test_index] + yield train_index, test_index + + def _pre_split_check(self, y): + pass + + def _sample_size(self, y): + return self.n if y is None else len(y) #TODO: Check for dict of arrays or dataframe + + # Since subclasses must implement either _iter_test_masks or + # _iter_test_indices, neither can be abstract. + def _iter_test_masks(self, y): + """Generates boolean masks corresponding to test sets. 
+ + By default, delegates to _iter_test_indices() + """ + for test_index in self._iter_test_indices(y): + test_mask = self._empty_mask(y) + test_mask[test_index] = True + yield test_mask + + def _iter_test_indices(self, y): + """Generates integer indices corresponding to test sets.""" + raise NotImplementedError + + def _empty_mask(self, y): + return np.zeros(self._sample_size(y), dtype=np.bool) + + +class LeaveOneOut(_PartitionIterator): + """Leave-One-Out cross validation iterator. + + Provides train/test indices to split data in train test sets. Each + sample is used once as a test set (singleton) while the remaining + samples form the training set. + + Note: ``LeaveOneOut(n)`` is equivalent to ``KFold(n, n_folds=n)`` and + ``LeavePOut(n, p=1)``. + + Due to the high number of test sets (which is the same as the + number of samples) this cross validation method can be very costly. + For large datasets one should favor KFold, StratifiedKFold or + ShuffleSplit. + + Parameters + ---------- + n : int + Total number of elements in dataset. + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4]]) + >>> y = np.array([1, 2]) + >>> loo = model_selection.LeaveOneOut(2) + >>> len(loo) + 2 + >>> print(loo) + sklearn.model_selection.partition.LeaveOneOut(n=2) + >>> for train_index, test_index in loo: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... print(X_train, X_test, y_train, y_test) + TRAIN: [1] TEST: [0] + [[3 4]] [[1 2]] [2] [1] + TRAIN: [0] TEST: [1] + [[1 2]] [[3 4]] [1] [2] + + See also + -------- + LeaveOneLabelOut for splitting the data according to explicit, + domain-specific stratification of the dataset. + """ + + def _iter_test_indices(self, y): + return range(self._sample_size(y)) + + def __repr__(self): + return '%s.%s(n=%i)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.n, + ) + + def __len__(self): + # TODO: remove? + return self.n + + +class LeavePOut(_PartitionIterator): + """Leave-P-Out cross validation iterator + + Provides train/test indices to split data in train test sets. This results + in testing on all distinct samples of size p, while the remaining n - p + samples form the training set in each iteration. + + Note: ``LeavePOut(n, p)`` is NOT equivalent to ``KFold(n, n_folds=n // p)`` + which creates non-overlapping test sets. + + Due to the high number of iterations which grows combinatorically with the + number of samples this cross validation method can be very costly. For + large datasets one should favor KFold, StratifiedKFold or ShuffleSplit. + + Parameters + ---------- + n : int + Total number of elements in dataset. + + p : int + Size of the test sets. + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 3, 4]) + >>> lpo = model_selection.LeavePOut(4, 2) + >>> len(lpo) + 6 + >>> print(lpo) + sklearn.model_selection.partition.LeavePOut(n=4, p=2) + >>> for train_index, test_index in lpo: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... 
y_train, y_test = y[train_index], y[test_index] + TRAIN: [2 3] TEST: [0 1] + TRAIN: [1 3] TEST: [0 2] + TRAIN: [1 2] TEST: [0 3] + TRAIN: [0 3] TEST: [1 2] + TRAIN: [0 2] TEST: [1 3] + TRAIN: [0 1] TEST: [2 3] + """ + + def __init__(self, n=None, p=None, indices=None): + super(LeavePOut, self).__init__(n, indices) + if p is None: + raise ValueError("LeavePOut: must supply p") + self.p = p + + def _iter_test_indices(self, y): + for comb in combinations(range(self._sample_size(y)), self.p): + yield np.array(comb) + + def __repr__(self): + return '%s.%s(n=%i, p=%i)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.n, + self.p, + ) + + def __len__(self): + # TODO: remove? + return int(factorial(self.n) / factorial(self.n - self.p) + / factorial(self.p)) + + +class _BaseKFold(with_metaclass(ABCMeta, _PartitionIterator)): + """Base class to validate KFold approaches""" + + @abstractmethod + def __init__(self, n, n_folds, indices, shuffle, random_state): + super(_BaseKFold, self).__init__(n, indices) + + if abs(n_folds - int(n_folds)) >= np.finfo('f').eps: + raise ValueError("n_folds must be an integer") + self.n_folds = n_folds = int(n_folds) + + if n_folds <= 1: + raise ValueError( + "k-fold cross validation requires at least one" + " train / test split by setting n_folds=2 or more," + " got n_folds={0}.".format(n_folds)) + + if not isinstance(shuffle, bool): + raise TypeError("shuffle must be True or False;" + " got {0}".format(shuffle)) + self.shuffle = shuffle + self.random_state = random_state + + def _pre_split_check(self, y): + n = self._sample_size(y) + if self.n_folds > n: + raise ValueError( + ("Cannot have number of folds n_folds={0} greater" + " than the number of samples: {1}.").format(self.n_folds, n)) + + +class KFold(_BaseKFold): + """K-Folds cross validation iterator. + + Provides train/test indices to split data in train test sets. Split + dataset into k consecutive folds (without shuffling). + + Each fold is then used a validation set once while the k - 1 remaining + fold form the training set. + + Parameters + ---------- + n : int + Total number of elements. + + n_folds : int, default=3 + Number of folds. Must be at least 2. + + shuffle : boolean, optional + Whether to shuffle the data before splitting into batches. + + random_state : None, int or RandomState + Pseudo-random number generator state used for random + sampling. If None, use default numpy RNG for shuffling + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([1, 2, 3, 4]) + >>> kf = model_selection.KFold(4, n_folds=2) + >>> len(kf) + 2 + >>> print(kf) # doctest: +NORMALIZE_WHITESPACE + sklearn.model_selection.partition.KFold(n=4, n_folds=2, shuffle=False, + random_state=None) + >>> for train_index, test_index in kf: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [2 3] TEST: [0 1] + TRAIN: [0 1] TEST: [2 3] + + Notes + ----- + The first n % n_folds folds have size n // n_folds + 1, other folds have + size n // n_folds. + + See also + -------- + StratifiedKFold: take label information into account to avoid building + folds with imbalanced class distributions (for binary or multiclass + classification tasks). 
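The fold sizes described in the Notes above can be reproduced in a few lines; ``n`` and ``n_folds`` below are hypothetical.

import numpy as np

n, n_folds = 10, 3
fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=int)
fold_sizes[:n % n_folds] += 1
print(fold_sizes)                      # [4 3 3]: the first n % n_folds folds get one extra sample

current = 0
for fold_size in fold_sizes:           # consecutive, unshuffled test blocks
    print(np.arange(n)[current:current + fold_size])
    current += fold_size
# [0 1 2 3]  [4 5 6]  [7 8 9]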
+ """ + + def __init__(self, n=None, n_folds=3, indices=None, shuffle=False, + random_state=None): + super(KFold, self).__init__(n, n_folds, indices, shuffle, random_state) + + def _iter_test_indices(self, y): + n = self._sample_size(y) + idxs = np.arange(n) + if self.shuffle: + rng = check_random_state(self.random_state) + rng.shuffle(idxs) + n_folds = self.n_folds + fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int) + fold_sizes[:n % n_folds] += 1 + current = 0 + for fold_size in fold_sizes: + start, stop = current, current + fold_size + yield idxs[start:stop] + current = stop + + def __repr__(self): + return '%s.%s(n=%i, n_folds=%i, shuffle=%s, random_state=%s)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.n, + self.n_folds, + self.shuffle, + self.random_state, + ) + + def __len__(self): + return self.n_folds + + +class StratifiedKFold(_BaseKFold): + """Stratified K-Folds cross validation iterator + + Provides train/test indices to split data in train test sets. + + This cross-validation object is a variation of KFold that + returns stratified folds. The folds are made by preserving + the percentage of samples for each class. + + Parameters + ---------- + y : array-like, [n_samples] + Samples to split in K folds. + + n_folds : int, default=3 + Number of folds. Must be at least 2. + + shuffle : boolean, optional + Whether to shuffle each stratification of the data before splitting + into batches. + + random_state : None, int or RandomState + Pseudo-random number generator state used for random + sampling. If None, use default numpy RNG for shuffling + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> skf = model_selection.StratifiedKFold(y, n_folds=2) + >>> len(skf) + 2 + >>> print(skf) # doctest: +NORMALIZE_WHITESPACE + sklearn.model_selection.partition.StratifiedKFold(labels=[0 0 1 1], n_folds=2, + shuffle=False, random_state=None) + >>> for train_index, test_index in skf: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [1 3] TEST: [0 2] + TRAIN: [0 2] TEST: [1 3] + + Notes + ----- + All the folds have size trunc(n_samples / n_folds), the last one has the + complementary. + + """ + + def __init__(self, y=None, n_folds=3, indices=None, shuffle=False, + random_state=None): + #TODO: deprecation warning if y is not none, should use split + n = len(y) if y is not None else None + super(StratifiedKFold, self).__init__( + n, n_folds, indices, shuffle, random_state) + self.y = np.asarray(y) if y is not None else None + + def _make_test_folds(self, y): + if y is None: + if self.y is None: + raise ValueError("Must supply y in constructor or split") + y = self.y + + n_samples = y.shape[0] + unique_labels, y_inversed = np.unique(y, return_inverse=True) + label_counts = np.bincount(y_inversed) + min_labels = np.min(label_counts) + if self.n_folds > min_labels: + warnings.warn(("The least populated class in y has only %d" + " members, which is too few. The minimum" + " number of labels for any class cannot" + " be less than n_folds=%d." 
+ % (min_labels, self.n_folds)), Warning) + + # don't want to use the same seed in each label's shuffle + if self.shuffle: + rng = check_random_state(self.random_state) + else: + rng = self.random_state + + # pre-assign each sample to a test fold index using individual KFold + # splitting strategies for each label so as to respect the + # balance of labels + per_label_cvs = [ + KFold(max(c, self.n_folds), self.n_folds, shuffle=self.shuffle, + random_state=rng) for c in label_counts] + test_folds = np.zeros(n_samples, dtype=np.int) + for test_fold_idx, per_label_splits in enumerate(zip(*per_label_cvs)): + for label, (_, test_split) in zip(unique_labels, per_label_splits): + label_test_folds = test_folds[y == label] + # the test split can be too big because we used + # KFold(max(c, self.n_folds), self.n_folds) instead of + # KFold(c, self.n_folds) to make it possible to not crash even + # if the data is not 100% stratifiable for all the labels + # (we use a warning instead of raising an exception) + # If this is the case, let's trim it: + test_split = test_split[test_split < len(label_test_folds)] + label_test_folds[test_split] = test_fold_idx + test_folds[y == label] = label_test_folds + + return test_folds + + def _iter_test_masks(self, y): + test_folds = self._make_test_folds(y) + for i in range(self.n_folds): + yield test_folds == i + + def __repr__(self): + return '%s.%s(labels=%s, n_folds=%i, shuffle=%s, random_state=%s)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.y, + self.n_folds, + self.shuffle, + self.random_state, + ) + + def __len__(self): + return self.n_folds + + +class LeaveOneLabelOut(_PartitionIterator): + """Leave-One-Label_Out cross-validation iterator + + Provides train/test indices to split data according to a third-party + provided label. This label information can be used to encode arbitrary + domain specific stratifications of the samples as integers. + + For instance the labels could be the year of collection of the samples + and thus allow for cross-validation against time-based splits. + + Parameters + ---------- + labels : array-like of int with shape (n_samples,) + Arbitrary domain-specific stratification of the data to be used + to draw the splits. + + Examples + -------- + >>> from sklearn import model_selection + >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) + >>> y = np.array([1, 2, 1, 2]) + >>> labels = np.array([1, 1, 2, 2]) + >>> lol = model_selection.LeaveOneLabelOut(labels) + >>> len(lol) + 2 + >>> print(lol) + sklearn.model_selection.partition.LeaveOneLabelOut(labels=[1 1 2 2]) + >>> for train_index, test_index in lol: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + ... 
print(X_train, X_test, y_train, y_test)
+    TRAIN: [2 3] TEST: [0 1]
+    [[5 6]
+     [7 8]] [[1 2]
+     [3 4]] [1 2] [1 2]
+    TRAIN: [0 1] TEST: [2 3]
+    [[1 2]
+     [3 4]] [[5 6]
+     [7 8]] [1 2] [1 2]
+
+    """
+
+    def __init__(self, labels=None, indices=None):
+        n = None if labels is None else len(labels)
+        super(LeaveOneLabelOut, self).__init__(n, indices)
+        self.labels = labels
+
+    def _iter_test_masks(self, y):
+        labels = self.labels if y is None else y
+        # We make a copy of labels to avoid side-effects during iteration
+        labels = np.array(labels, copy=True)
+        unique_labels = np.unique(labels)
+        n_unique_labels = len(unique_labels)
+        for i in unique_labels:
+            yield labels == i
+
+    def __repr__(self):
+        return '%s.%s(labels=%s)' % (
+            self.__class__.__module__,
+            self.__class__.__name__,
+            self.labels,
+        )
+
+    def __len__(self):
+        # TODO: remove?
+        return len(np.unique(self.labels))
+
+
+class LeavePLabelOut(_PartitionIterator):
+    """Leave-P-Label_Out cross-validation iterator
+
+    Provides train/test indices to split data according to a third-party
+    provided label. This label information can be used to encode arbitrary
+    domain specific stratifications of the samples as integers.
+
+    For instance the labels could be the year of collection of the samples
+    and thus allow for cross-validation against time-based splits.
+
+    The difference between LeavePLabelOut and LeaveOneLabelOut is that
+    the former builds the test sets with all the samples assigned to
+    ``p`` different values of the labels while the latter uses samples
+    all assigned the same labels.
+
+    Parameters
+    ----------
+    labels : array-like of int with shape (n_samples,)
+        Arbitrary domain-specific stratification of the data to be used
+        to draw the splits.
+
+    p : int
+        Number of labels to leave out in the test split.
+
+    Examples
+    --------
+    >>> from sklearn import model_selection
+    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
+    >>> y = np.array([1, 2, 1])
+    >>> labels = np.array([1, 2, 3])
+    >>> lpl = model_selection.LeavePLabelOut(labels, p=2)
+    >>> len(lpl)
+    3
+    >>> print(lpl)
+    sklearn.model_selection.partition.LeavePLabelOut(labels=[1 2 3], p=2)
+    >>> for train_index, test_index in lpl:
+    ...    print("TRAIN:", train_index, "TEST:", test_index)
+    ...    X_train, X_test = X[train_index], X[test_index]
+    ...    y_train, y_test = y[train_index], y[test_index]
+    ...    
print(X_train, X_test, y_train, y_test) + TRAIN: [2] TEST: [0 1] + [[5 6]] [[1 2] + [3 4]] [1] [1 2] + TRAIN: [1] TEST: [0 2] + [[3 4]] [[1 2] + [5 6]] [2] [1 1] + TRAIN: [0] TEST: [1 2] + [[1 2]] [[3 4] + [5 6]] [1] [2 1] + """ + + def __init__(self, labels=None, p=None, indices=None): + n = None if labels is None else len(labels) + super(LeavePLabelOut, self).__init__(n, indices) + if p is None: + raise ValueError("LeavePLabelOut: must supply p") + + self.p = p + self.labels = labels + + def _iter_test_masks(self, y): + labels = self.labels if y is None else y + # We make a copy of labels to avoid side-effects during iteration + labels = np.array(labels, copy=True) + unique_labels = np.unique(labels) + n_unique_labels = len(unique_labels) + comb = combinations(range(n_unique_labels), self.p) + for idx in comb: + test_index = self._empty_mask(labels) + idx = np.array(idx) + for l in unique_labels[idx]: + test_index[labels == l] = True + yield test_index + + def __repr__(self): + return '%s.%s(labels=%s, p=%s)' % ( + self.__class__.__module__, + self.__class__.__name__, + self.labels, + self.p, + ) + + def __len__(self): + return int(factorial(self.n_unique_labels) / + factorial(self.n_unique_labels - self.p) / + factorial(self.p)) + + +def train_test_split(*arrays, **options): + """Split arrays or matrices into random train and test subsets + + Quick utility that wraps calls to ``check_arrays`` and + ``next(iter(ShuffleSplit(n_samples)))`` and application to input + data into a single call for splitting (and optionally subsampling) + data in a oneliner. + + Parameters + ---------- + *arrays : sequence of arrays or scipy.sparse matrices with same shape[0] + Python lists or tuples occurring in arrays are converted to 1D numpy + arrays. + + test_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If + int, represents the absolute number of test samples. If None, + the value is automatically set to the complement of the train size. + If train size is also None, test size is set to 0.25. + + train_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int or RandomState + Pseudo-random number generator state used for random sampling. + + dtype : a numpy dtype instance, None by default + Enforce a specific dtype. + + Returns + ------- + splitting : list of arrays, length=2 * len(arrays) + List containing train-test split of input array. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.model_selection.partition import train_test_split + >>> a, b = np.arange(10).reshape((5, 2)), range(5) + >>> a + array([[0, 1], + [2, 3], + [4, 5], + [6, 7], + [8, 9]]) + >>> list(b) + [0, 1, 2, 3, 4] + + >>> a_train, a_test, b_train, b_test = train_test_split( + ... a, b, test_size=0.33, random_state=42) + ... 
+ >>> a_train + array([[4, 5], + [0, 1], + [6, 7]]) + >>> b_train + array([2, 0, 3]) + >>> a_test + array([[2, 3], + [8, 9]]) + >>> b_test + array([1, 4]) + + """ + n_arrays = len(arrays) + if n_arrays == 0: + raise ValueError("At least one array required as input") + + test_size = options.pop('test_size', None) + train_size = options.pop('train_size', None) + random_state = options.pop('random_state', None) + options['sparse_format'] = 'csr' + options['allow_nans'] = True + + if test_size is None and train_size is None: + test_size = 0.25 + + arrays = check_arrays(*arrays, **options) + n_samples = arrays[0].shape[0] + cv = ShuffleSplit(n_samples, test_size=test_size, + train_size=train_size, + random_state=random_state) + + train, test = next(iter(cv)) + return list(chain.from_iterable((a[train], a[test]) for a in arrays)) + + +train_test_split.__test__ = False # to avoid a pb with nosetests + + +class BaseShuffleSplit(with_metaclass(ABCMeta)): + """Base class for ShuffleSplit and StratifiedShuffleSplit""" + + def __init__(self, n, n_iter=10, test_size=0.1, train_size=None, + indices=None, random_state=None, n_iterations=None): + if indices is None: + indices = True + else: + warnings.warn("The indices parameter is deprecated and will be " + "removed (assumed True) in 0.17", DeprecationWarning) + self.n = n + self.n_iter = n_iter + if n_iterations is not None: # pragma: no cover + warnings.warn("n_iterations was renamed to n_iter for consistency " + " and will be removed in 0.16.") + self.n_iter = n_iterations + self.test_size = test_size + self.train_size = train_size + self.random_state = random_state + self._indices = indices + self.n_train, self.n_test = _validate_shuffle_split(n, + test_size, + train_size) + + @property + def indices(self): + warnings.warn("The indices attribute is deprecated and will be " + "removed (assumed True) in 0.17", DeprecationWarning, + stacklevel=1) + return self._indices + + def __iter__(self): + if self._indices: + for train, test in self._iter_indices(): + yield train, test + return + for train, test in self._iter_indices(): + train_m = np.zeros(self.n, dtype=bool) + test_m = np.zeros(self.n, dtype=bool) + train_m[train] = True + test_m[test] = True + yield train_m, test_m + + @abstractmethod + def _iter_indices(self): + """Generate (train, test) indices""" + + +class ShuffleSplit(BaseShuffleSplit): + """Random permutation cross-validation iterator. + + Yields indices to split data into training and test sets. + + Note: contrary to other cross-validation strategies, random splits + do not guarantee that all folds will be different, although this is + still very likely for sizeable datasets. + + Parameters + ---------- + n : int + Total number of elements in the dataset. + + n_iter : int (default 10) + Number of re-shuffling & splitting iterations. + + test_size : float (default 0.1), int, or None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If + int, represents the absolute number of test samples. If None, + the value is automatically set to the complement of the train size. + + train_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int or RandomState + Pseudo-random number generator state used for random sampling. 
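As the implementation above suggests, ``train_test_split`` is essentially one draw from ``ShuffleSplit``. A rough equivalence sketch, using the import paths introduced by this patch and toy arrays:

import numpy as np
from sklearn.model_selection import ShuffleSplit, train_test_split

X = np.arange(10).reshape((5, 2))
y = np.arange(5)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

cv = ShuffleSplit(len(y), test_size=0.4, random_state=0)   # same seed, one iteration
train, test = next(iter(cv))
print(np.array_equal(X_train, X[train]), np.array_equal(X_test, X[test]))  # expected: True True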
+ + Examples + -------- + >>> from sklearn import cross_validation + >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, + ... test_size=.25, random_state=0) + >>> len(rs) + 3 + >>> print(rs) + ... # doctest: +ELLIPSIS + ShuffleSplit(4, n_iter=3, test_size=0.25, ...) + >>> for train_index, test_index in rs: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... + TRAIN: [3 1 0] TEST: [2] + TRAIN: [2 1 3] TEST: [0] + TRAIN: [0 2 1] TEST: [3] + + >>> rs = cross_validation.ShuffleSplit(4, n_iter=3, + ... train_size=0.5, test_size=.25, random_state=0) + >>> for train_index, test_index in rs: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... + TRAIN: [3 1] TEST: [2] + TRAIN: [2 1] TEST: [0] + TRAIN: [0 2] TEST: [3] + + See also + -------- + Bootstrap: cross-validation using re-sampling with replacement. + """ + + def _iter_indices(self): + rng = check_random_state(self.random_state) + for i in range(self.n_iter): + # random partition + permutation = rng.permutation(self.n) + ind_test = permutation[:self.n_test] + ind_train = permutation[self.n_test:self.n_test + self.n_train] + yield ind_train, ind_test + + def __repr__(self): + return ('%s(%d, n_iter=%d, test_size=%s, ' + 'random_state=%s)' % ( + self.__class__.__name__, + self.n, + self.n_iter, + str(self.test_size), + self.random_state, + )) + + def __len__(self): + return self.n_iter + + +def _validate_shuffle_split(n, test_size, train_size): + if test_size is None and train_size is None: + raise ValueError( + 'test_size and train_size can not both be None') + + if test_size is not None: + if np.asarray(test_size).dtype.kind == 'f': + if test_size >= 1.: + raise ValueError( + 'test_size=%f should be smaller ' + 'than 1.0 or be an integer' % test_size) + elif np.asarray(test_size).dtype.kind == 'i': + if test_size >= n: + raise ValueError( + 'test_size=%d should be smaller ' + 'than the number of samples %d' % (test_size, n)) + else: + raise ValueError("Invalid value for test_size: %r" % test_size) + + if train_size is not None: + if np.asarray(train_size).dtype.kind == 'f': + if train_size >= 1.: + raise ValueError("train_size=%f should be smaller " + "than 1.0 or be an integer" % train_size) + elif np.asarray(test_size).dtype.kind == 'f' and \ + train_size + test_size > 1.: + raise ValueError('The sum of test_size and train_size = %f, ' + 'should be smaller than 1.0. Reduce ' + 'test_size and/or train_size.' % + (train_size + test_size)) + elif np.asarray(train_size).dtype.kind == 'i': + if train_size >= n: + raise ValueError("train_size=%d should be smaller " + "than the number of samples %d" % + (train_size, n)) + else: + raise ValueError("Invalid value for train_size: %r" % train_size) + + if np.asarray(test_size).dtype.kind == 'f': + n_test = ceil(test_size * n) + elif np.asarray(test_size).dtype.kind == 'i': + n_test = float(test_size) + + if train_size is None: + n_train = n - n_test + else: + if np.asarray(train_size).dtype.kind == 'f': + n_train = floor(train_size * n) + else: + n_train = float(train_size) + + if test_size is None: + n_test = n - n_train + + if n_train + n_test > n: + raise ValueError('The sum of train_size and test_size = %d, ' + 'should be smaller than the number of ' + 'samples %d. Reduce test_size and/or ' + 'train_size.' % (n_train + n_test, n)) + + return int(n_train), int(n_test) + + +class StratifiedShuffleSplit(BaseShuffleSplit): + """Stratified ShuffleSplit cross validation iterator + + Provides train/test indices to split data in train test sets. 
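A worked example of the size validation above, with hypothetical values: a float ``test_size`` is turned into a count with ``ceil``, and the training size defaults to the complement.

from math import ceil

n, test_size, train_size = 10, 0.25, None
n_test = int(ceil(test_size * n))                  # 3
n_train = n - n_test if train_size is None else train_size
print(n_train, n_test)                             # 7 3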
+ + This cross-validation object is a merge of StratifiedKFold and + ShuffleSplit, which returns stratified randomized folds. The folds + are made by preserving the percentage of samples for each class. + + Note: like the ShuffleSplit strategy, stratified random splits + do not guarantee that all folds will be different, although this is + still very likely for sizeable datasets. + + Parameters + ---------- + y : array, [n_samples] + Labels of samples. + + n_iter : int (default 10) + Number of re-shuffling & splitting iterations. + + test_size : float (default 0.1), int, or None + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the test split. If + int, represents the absolute number of test samples. If None, + the value is automatically set to the complement of the train size. + + train_size : float, int, or None (default is None) + If float, should be between 0.0 and 1.0 and represent the + proportion of the dataset to include in the train split. If + int, represents the absolute number of train samples. If None, + the value is automatically set to the complement of the test size. + + random_state : int or RandomState + Pseudo-random number generator state used for random sampling. + + Examples + -------- + >>> from sklearn.cross_validation import StratifiedShuffleSplit + >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]]) + >>> y = np.array([0, 0, 1, 1]) + >>> sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0) + >>> len(sss) + 3 + >>> print(sss) # doctest: +ELLIPSIS + StratifiedShuffleSplit(labels=[0 0 1 1], n_iter=3, ...) + >>> for train_index, test_index in sss: + ... print("TRAIN:", train_index, "TEST:", test_index) + ... X_train, X_test = X[train_index], X[test_index] + ... y_train, y_test = y[train_index], y[test_index] + TRAIN: [1 2] TEST: [3 0] + TRAIN: [0 2] TEST: [1 3] + TRAIN: [0 2] TEST: [3 1] + """ + + def __init__(self, y, n_iter=10, test_size=0.1, train_size=None, + indices=None, random_state=None, n_iterations=None): + + super(StratifiedShuffleSplit, self).__init__( + len(y), n_iter, test_size, train_size, indices, random_state, + n_iterations) + self.y = np.array(y) + self.classes, self.y_indices = np.unique(y, return_inverse=True) + n_cls = self.classes.shape[0] + + if np.min(np.bincount(self.y_indices)) < 2: + raise ValueError("The least populated class in y has only 1" + " member, which is too few. 
The minimum" + " number of labels for any class cannot" + " be less than 2.") + + if self.n_train < n_cls: + raise ValueError('The train_size = %d should be greater or ' + 'equal to the number of classes = %d' % + (self.n_train, n_cls)) + if self.n_test < n_cls: + raise ValueError('The test_size = %d should be greater or ' + 'equal to the number of classes = %d' % + (self.n_test, n_cls)) + + def _iter_indices(self): + rng = check_random_state(self.random_state) + cls_count = np.bincount(self.y_indices) + p_i = cls_count / float(self.n) + n_i = np.round(self.n_train * p_i).astype(int) + t_i = np.minimum(cls_count - n_i, + np.round(self.n_test * p_i).astype(int)) + + for n in range(self.n_iter): + train = [] + test = [] + + for i, cls in enumerate(self.classes): + permutation = rng.permutation(cls_count[i]) + cls_i = np.where((self.y == cls))[0][permutation] + + train.extend(cls_i[:n_i[i]]) + test.extend(cls_i[n_i[i]:n_i[i] + t_i[i]]) + + # Because of rounding issues (as n_train and n_test are not + # dividers of the number of elements per class), we may end + # up here with less samples in train and test than asked for. + if len(train) < self.n_train or len(test) < self.n_test: + # We complete by affecting randomly the missing indexes + missing_idx = np.where(np.bincount(train + test, + minlength=len(self.y)) == 0, + )[0] + missing_idx = rng.permutation(missing_idx) + train.extend(missing_idx[:(self.n_train - len(train))]) + test.extend(missing_idx[-(self.n_test - len(test)):]) + + train = rng.permutation(train) + test = rng.permutation(test) + + yield train, test + + def __repr__(self): + return ('%s(labels=%s, n_iter=%d, test_size=%s, ' + 'random_state=%s)' % ( + self.__class__.__name__, + self.y, + self.n_iter, + str(self.test_size), + self.random_state, + )) + + def __len__(self): + return self.n_iter + + +def check_cv(cv, X=None, y=None, classifier=False): + """Input checker utility for building a CV in a user friendly way. + + Parameters + ---------- + cv : int, a cv generator instance, or None + The input specifying which cv generator to use. It can be an + integer, in which case it is the number of folds in a KFold, + None, in which case 3 fold is used, or another object, that + will then be used as a cv generator. + + X : array-like + The data the cross-val object will be applied on. + + y : array-like + The target variable for a supervised learning problem. + + classifier : boolean optional + Whether the task is a classification task, in which case + stratified KFold will be used. + + Returns + ------- + checked_cv: a cross-validation generator instance. + The return value is guaranteed to be a cv generator instance, whatever + the input type. + """ + return _check_cv(cv, X=X, y=y, classifier=classifier, warn_mask=True) + + +def _check_cv(cv, X=None, y=None, classifier=False, warn_mask=False): + # This exists for internal use while indices is being deprecated. 
+ is_sparse = sp.issparse(X) + needs_indices = is_sparse or not hasattr(X, "shape") + if cv is None: + cv = 3 + if isinstance(cv, numbers.Integral): + if warn_mask and not needs_indices: + warnings.warn('check_cv will return indices instead of boolean ' + 'masks from 0.17', DeprecationWarning) + else: + needs_indices = None + if classifier: + cv = StratifiedKFold(y, cv, indices=needs_indices) + else: + if not is_sparse: + n_samples = len(X) + else: + n_samples = X.shape[0] + cv = KFold(n_samples, cv, indices=needs_indices) + if needs_indices and not getattr(cv, "_indices", True): + raise ValueError("Sparse data and lists require indices-based cross" + " validation generator, got: %r", cv) + return cv diff --git a/sklearn/model_selection/scoring.py b/sklearn/model_selection/scoring.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/sklearn/model_selection/search.py b/sklearn/model_selection/search.py new file mode 100644 index 0000000000000..878c7157398c0 --- /dev/null +++ b/sklearn/model_selection/search.py @@ -0,0 +1,548 @@ +""" +The :mod:`sklearn.model_selection.search` includes utilities to fine-tune the +parameters of an estimator. +""" + +from __future__ import print_function + +# Author: Alexandre Gramfort , +# Gael Varoquaux +# Andreas Mueller +# Olivier Grisel +# License: BSD 3 clause + +from abc import ABCMeta, abstractmethod +from collections import namedtuple, Sized + +import numpy as np + +from sklearn.base import BaseEstimator, is_classifier, clone +from sklearn.base import MetaEstimatorMixin +from sklearn.externals.joblib import Parallel, delayed +from sklearn.externals import six +from sklearn.utils.validation import _num_samples, check_arrays +from sklearn.metrics.scorer import check_scoring +from .partition import _check_cv as check_cv +from .validate import _fit_and_score +from .utils import ParameterGrid, ParameterSampler + + +__all__ = ['GridSearchCV', 'RandomizedSearchCV'] + + +def _check_param_grid(param_grid): + if hasattr(param_grid, 'items'): + param_grid = [param_grid] + + for p in param_grid: + for v in p.values(): + if isinstance(v, np.ndarray) and v.ndim > 1: + raise ValueError("Parameter array should be one-dimensional.") + + check = [isinstance(v, k) for k in (list, tuple, np.ndarray)] + if not True in check: + raise ValueError("Parameter values should be a list.") + + if len(v) == 0: + raise ValueError("Parameter values should be a non-empty " + "list.") + + +class _CVScoreTuple (namedtuple('_CVScoreTuple', + ('parameters', + 'mean_validation_score', + 'cv_validation_scores'))): + # A raw namedtuple is very memory efficient as it packs the attributes + # in a struct to get rid of the __dict__ of attributes in particular it + # does not copy the string for the keys on each instance. + # By deriving a namedtuple class just to introduce the __repr__ method we + # would also reintroduce the __dict__ on the instance. By telling the + # Python interpreter that this subclass uses static __slots__ instead of + # dynamic attributes. Furthermore we don't need any additional slot in the + # subclass so we set __slots__ to the empty tuple. 
+ __slots__ = () + + def __repr__(self): + """Simple custom repr to summarize the main info""" + return "mean: {0:.5f}, std: {1:.5f}, params: {2}".format( + self.mean_validation_score, + np.std(self.cv_validation_scores), + self.parameters) + + +class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, + MetaEstimatorMixin)): + """Base class for hyper parameter search with cross-validation.""" + + @abstractmethod + def __init__(self, estimator, scoring=None, loss_func=None, + score_func=None, fit_params=None, n_jobs=1, iid=True, + refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): + + self.scoring = scoring + self.estimator = estimator + self.loss_func = loss_func + self.score_func = score_func + self.n_jobs = n_jobs + self.fit_params = fit_params if fit_params is not None else {} + self.iid = iid + self.refit = refit + self.cv = cv + self.verbose = verbose + self.pre_dispatch = pre_dispatch + + def score(self, X, y=None): + """Returns the score on the given test data and labels, if the search + estimator has been refit. The ``score`` function of the best estimator + is used, or the ``scoring`` parameter where unavailable. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Input data, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape = [n_samples] or [n_samples, n_output], optional + Target relative to X for classification or regression; + None for unsupervised learning. + + Returns + ------- + score : float + + """ + if hasattr(self.best_estimator_, 'score'): + return self.best_estimator_.score(X, y) + if self.scorer_ is None: + raise ValueError("No score function explicitly defined, " + "and the estimator doesn't provide one %s" + % self.best_estimator_) + return self.scorer_(self.best_estimator_, X, y) + + @property + def predict(self): + return self.best_estimator_.predict + + @property + def predict_proba(self): + return self.best_estimator_.predict_proba + + @property + def decision_function(self): + return self.best_estimator_.decision_function + + @property + def transform(self): + return self.best_estimator_.transform + + def _fit(self, X, y, parameter_iterable): + """Actual fitting, performing the search over parameters.""" + + estimator = self.estimator + cv = self.cv + self.scorer_ = check_scoring(self.estimator, scoring=self.scoring, + loss_func=self.loss_func, + score_func=self.score_func) + + n_samples = _num_samples(X) + X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr', + allow_nans=True) + + if y is not None: + if len(y) != n_samples: + raise ValueError('Target variable (y) has a different number ' + 'of samples (%i) than data (X: %i samples)' + % (len(y), n_samples)) + y = np.asarray(y) + cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) + + if self.verbose > 0: + if isinstance(parameter_iterable, Sized): + n_candidates = len(parameter_iterable) + print("Fitting {0} folds for each of {1} candidates, totalling" + " {2} fits".format(len(cv), n_candidates, + n_candidates * len(cv))) + + base_estimator = clone(self.estimator) + + pre_dispatch = self.pre_dispatch + + out = Parallel( + n_jobs=self.n_jobs, verbose=self.verbose, + pre_dispatch=pre_dispatch + )( + delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, + train, test, self.verbose, parameters, + self.fit_params, return_parameters=True) + for parameters in parameter_iterable + for train, test in cv) + + # Out is a list of triplet: score, estimator, n_test_samples + n_fits = 
len(out) + n_folds = len(cv) + + scores = list() + grid_scores = list() + for grid_start in range(0, n_fits, n_folds): + n_test_samples = 0 + score = 0 + all_scores = [] + for this_score, this_n_test_samples, _, parameters in \ + out[grid_start:grid_start + n_folds]: + all_scores.append(this_score) + if self.iid: + this_score *= this_n_test_samples + n_test_samples += this_n_test_samples + score += this_score + if self.iid: + score /= float(n_test_samples) + else: + score /= float(n_folds) + scores.append((score, parameters)) + # TODO: shall we also store the test_fold_sizes? + grid_scores.append(_CVScoreTuple( + parameters, + score, + np.array(all_scores))) + # Store the computed scores + self.grid_scores_ = grid_scores + + # Find the best parameters by comparing on the mean validation score: + # note that `sorted` is deterministic in the way it breaks ties + best = sorted(grid_scores, key=lambda x: x.mean_validation_score, + reverse=True)[0] + self.best_params_ = best.parameters + self.best_score_ = best.mean_validation_score + + if self.refit: + # fit the best estimator using the entire dataset + # clone first to work around broken estimators + best_estimator = clone(base_estimator).set_params( + **best.parameters) + if y is not None: + best_estimator.fit(X, y, **self.fit_params) + else: + best_estimator.fit(X, **self.fit_params) + self.best_estimator_ = best_estimator + return self + + +class GridSearchCV(BaseSearchCV): + """Exhaustive search over specified parameter values for an estimator. + + Important members are fit, predict. + + GridSearchCV implements a "fit" method and a "predict" method like + any classifier except that the parameters of the classifier + used to predict is optimized by cross-validation. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + A object of that type is instantiated for each grid point. + + param_grid : dict or list of dictionaries + Dictionary with parameters names (string) as keys and lists of + parameter settings to try as values, or a list of such + dictionaries, in which case the grids spanned by each dictionary + in the list are explored. This enables searching over any sequence + of parameter settings. + + scoring : string, callable or None, optional, default: None + A string (see model evaluation documentation) or + a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + fit_params : dict, optional + Parameters to pass to the fit method. + + n_jobs : int, optional + Number of jobs to run in parallel (default 1). + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + iid : boolean, optional + If True, the data is assumed to be identically distributed across + the folds, and the loss minimized is the total loss per sample, + and not the mean loss across the folds. + + cv : integer or cross-validation generator, optional + If an integer is passed, it is the number of folds (default 3). 
+ Specific cross-validation objects can be passed, see + sklearn.cross_validation module for the list of possible objects + + refit : boolean + Refit the best estimator with the entire dataset. + If "False", it is impossible to make predictions using + this GridSearchCV instance after fitting. + + verbose : integer + Controls the verbosity: the higher, the more messages. + + Examples + -------- + >>> from sklearn import svm, grid_search, datasets + >>> iris = datasets.load_iris() + >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} + >>> svr = svm.SVC() + >>> clf = grid_search.GridSearchCV(svr, parameters) + >>> clf.fit(iris.data, iris.target) + ... # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS + GridSearchCV(cv=None, + estimator=SVC(C=1.0, cache_size=..., class_weight=..., coef0=..., + degree=..., gamma=..., kernel='rbf', max_iter=-1, + probability=False, random_state=None, shrinking=True, + tol=..., verbose=False), + fit_params={}, iid=..., loss_func=..., n_jobs=1, + param_grid=..., pre_dispatch=..., refit=..., score_func=..., + scoring=..., verbose=...) + + + Attributes + ---------- + `grid_scores_` : list of named tuples + Contains scores for all parameter combinations in param_grid. + Each entry corresponds to one parameter setting. + Each named tuple has the attributes: + + * ``parameters``, a dict of parameter settings + * ``mean_validation_score``, the mean score over the + cross-validation folds + * ``cv_validation_scores``, the list of scores for each fold + + `best_estimator_` : estimator + Estimator that was chosen by the search, i.e. estimator + which gave highest score (or smallest loss if specified) + on the left out data. + + `best_score_` : float + Score of best_estimator on the left out data. + + `best_params_` : dict + Parameter setting that gave the best results on the hold out data. + + `scorer_` : function + Scorer function used on the held out data to choose the best + parameters for the model. + + Notes + ------ + The parameters selected are those that maximize the score of the left out + data, unless an explicit score is passed in which case it is used instead. + + If `n_jobs` was set to a value higher than one, the data is copied for each + point in the grid (and not `n_jobs` times). This is done for efficiency + reasons if individual jobs take very little time, but may raise errors if + the dataset is large and not enough memory is available. A workaround in + this case is to set `pre_dispatch`. Then, the memory is copied only + `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 * + n_jobs`. + + See Also + --------- + :class:`ParameterGrid`: + generates all the combinations of a an hyperparameter grid. + + :func:`sklearn.cross_validation.train_test_split`: + utility function to split the data into a development set usable + for fitting a GridSearchCV instance and an evaluation set for + its final evaluation. + + :func:`sklearn.metrics.make_scorer`: + Make a scorer from a performance metric or loss function. + + """ + + def __init__(self, estimator, param_grid, scoring=None, loss_func=None, + score_func=None, fit_params=None, n_jobs=1, iid=True, + refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): + super(GridSearchCV, self).__init__( + estimator, scoring, loss_func, score_func, fit_params, n_jobs, iid, + refit, cv, verbose, pre_dispatch) + self.param_grid = param_grid + _check_param_grid(param_grid) + + def fit(self, X, y=None): + """Run fit with all sets of parameters. 
+
+        Parameters
+        ----------
+
+        X : array-like, shape = [n_samples, n_features]
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+            Target relative to X for classification or regression;
+            None for unsupervised learning.
+
+        """
+        return self._fit(X, y, ParameterGrid(self.param_grid))
+
+
+class RandomizedSearchCV(BaseSearchCV):
+    """Randomized search on hyper parameters.
+
+    RandomizedSearchCV implements a "fit" method and a "predict" method like
+    any classifier except that the parameters of the classifier
+    used to predict are optimized by cross-validation.
+
+    In contrast to GridSearchCV, not all parameter values are tried out, but
+    rather a fixed number of parameter settings is sampled from the specified
+    distributions. The number of parameter settings that are tried is
+    given by n_iter.
+
+    Parameters
+    ----------
+    estimator : object type that implements the "fit" and "predict" methods
+        An object of that type is instantiated for each parameter setting.
+
+    param_distributions : dict
+        Dictionary with parameter names (string) as keys and distributions
+        or lists of parameters to try. Distributions must provide a ``rvs``
+        method for sampling (such as those from scipy.stats.distributions).
+        If a list is given, it is sampled uniformly.
+
+    n_iter : int, default=10
+        Number of parameter settings that are sampled. n_iter trades
+        off runtime vs quality of the solution.
+
+    scoring : string, callable or None, optional, default: None
+        A string (see model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+
+    fit_params : dict, optional
+        Parameters to pass to the fit method.
+
+    n_jobs : int, optional
+        Number of jobs to run in parallel (default 1).
+
+    pre_dispatch : int, or string, optional
+        Controls the number of jobs that get dispatched during parallel
+        execution. Reducing this number can be useful to avoid an
+        explosion of memory consumption when more jobs get dispatched
+        than CPUs can process. This parameter can be:
+
+            - None, in which case all the jobs are immediately
+              created and spawned. Use this for lightweight and
+              fast-running jobs, to avoid delays due to on-demand
+              spawning of the jobs
+
+            - An int, giving the exact number of total jobs that are
+              spawned
+
+            - A string, giving an expression as a function of n_jobs,
+              as in '2*n_jobs'
+
+    iid : boolean, optional
+        If True, the data is assumed to be identically distributed across
+        the folds, and the loss minimized is the total loss per sample,
+        and not the mean loss across the folds.
+
+    cv : integer or cross-validation generator, optional
+        If an integer is passed, it is the number of folds (default 3).
+        Specific cross-validation objects can be passed, see
+        sklearn.cross_validation module for the list of possible objects
+
+    refit : boolean
+        Refit the best estimator with the entire dataset.
+        If "False", it is impossible to make predictions using
+        this RandomizedSearchCV instance after fitting.
+
+    verbose : integer
+        Controls the verbosity: the higher, the more messages.
+
+
+    Attributes
+    ----------
+    `grid_scores_` : list of named tuples
+        Contains scores for all parameter settings that have been evaluated.
+        Each entry corresponds to one parameter setting.
+        Each named tuple has the attributes:
+
+            * ``parameters``, a dict of parameter settings
+            * ``mean_validation_score``, the mean score over the
+              cross-validation folds
+            * ``cv_validation_scores``, the list of scores for each fold
+
+    `best_estimator_` : estimator
+        Estimator that was chosen by the search, i.e. estimator
+        which gave highest score (or smallest loss if specified)
+        on the left out data.
+
+    `best_score_` : float
+        Score of best_estimator on the left out data.
+
+    `best_params_` : dict
+        Parameter setting that gave the best results on the hold out data.
+
+    Notes
+    -----
+    The parameters selected are those that maximize the score of the held-out
+    data, according to the scoring parameter.
+
+    If `n_jobs` was set to a value higher than one, the data is copied for each
+    parameter setting (and not `n_jobs` times). This is done for efficiency
+    reasons if individual jobs take very little time, but may raise errors if
+    the dataset is large and not enough memory is available. A workaround in
+    this case is to set `pre_dispatch`. Then, the memory is copied only
+    `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
+    n_jobs`.
+
+    See Also
+    --------
+    :class:`GridSearchCV`:
+        Does exhaustive search over a grid of parameters.
+
+    :class:`ParameterSampler`:
+        A generator over parameter settings, constructed from
+        param_distributions.
+
+    """
+
+    def __init__(self, estimator, param_distributions, n_iter=10, scoring=None,
+                 fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
+                 verbose=0, pre_dispatch='2*n_jobs', random_state=None):
+
+        self.param_distributions = param_distributions
+        self.n_iter = n_iter
+        self.random_state = random_state
+        super(RandomizedSearchCV, self).__init__(
+            estimator=estimator, scoring=scoring, fit_params=fit_params,
+            n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
+            pre_dispatch=pre_dispatch)
+
+    def fit(self, X, y=None):
+        """Run fit on the estimator with randomly drawn parameters.
+
+        Parameters
+        ----------
+        X : array-like, shape = [n_samples, n_features]
+            Training vector, where n_samples is the number of samples and
+            n_features is the number of features.
+
+        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+            Target relative to X for classification or regression;
+            None for unsupervised learning.
+
+        """
+        sampled_params = ParameterSampler(self.param_distributions,
+                                          self.n_iter,
+                                          random_state=self.random_state)
+        return self._fit(X, y, sampled_params)
diff --git a/sklearn/model_selection/utils.py b/sklearn/model_selection/utils.py
new file mode 100644
index 0000000000000..b29b57329690a
--- /dev/null
+++ b/sklearn/model_selection/utils.py
@@ -0,0 +1,222 @@
+"""
+The :mod:`sklearn.model_selection.utils` module includes
+"""
+#TODO Complete docstring
+from __future__ import print_function
+
+# Author: Alexandre Gramfort ,
+#         Gael Varoquaux
+#         Andreas Mueller
+#         Olivier Grisel
+# License: BSD 3 clause
+
+from collections import Mapping
+from functools import partial, reduce
+from itertools import product
+import operator
+
+import numpy as np
+
+from sklearn.model_selection.validate import _fit_and_score
+from sklearn.utils import check_random_state
+
+__all__ = ['ParameterGrid', 'fit_grid_point', 'ParameterSampler']
+
+
+class ParameterGrid(object):
+    """Grid of parameters with a discrete number of values for each.
+
+    Can be used to iterate over parameter value combinations with the
+    Python built-in function iter.
+ + Parameters + ---------- + param_grid : dict of string to sequence, or sequence of such + The parameter grid to explore, as a dictionary mapping estimator + parameters to sequences of allowed values. + + An empty dict signifies default parameters. + + A sequence of dicts signifies a sequence of grids to search, and is + useful to avoid exploring parameter combinations that make no sense + or have no effect. See the examples below. + + Examples + -------- + >>> from sklearn.grid_search import ParameterGrid + >>> param_grid = {'a': [1, 2], 'b': [True, False]} + >>> list(ParameterGrid(param_grid)) == ( + ... [{'a': 1, 'b': True}, {'a': 1, 'b': False}, + ... {'a': 2, 'b': True}, {'a': 2, 'b': False}]) + True + + >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}] + >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'}, + ... {'kernel': 'rbf', 'gamma': 1}, + ... {'kernel': 'rbf', 'gamma': 10}] + True + + See also + -------- + :class:`GridSearchCV`: + uses ``ParameterGrid`` to perform a full parallelized parameter search. + """ + + def __init__(self, param_grid): + if isinstance(param_grid, Mapping): + # wrap dictionary in a singleton list to support either dict + # or list of dicts + param_grid = [param_grid] + self.param_grid = param_grid + + def __iter__(self): + """Iterate over the points in the grid. + + Returns + ------- + params : iterator over dict of string to any + Yields dictionaries mapping each estimator parameter to one of its + allowed values. + """ + for p in self.param_grid: + # Always sort the keys of a dictionary, for reproducibility + items = sorted(p.items()) + if not items: + yield {} + else: + keys, values = zip(*items) + for v in product(*values): + params = dict(zip(keys, v)) + yield params + + def __len__(self): + """Number of points on the grid.""" + # Product function that can handle iterables (np.product can't). + product = partial(reduce, operator.mul) + return sum(product(len(v) for v in p.values()) if p else 1 + for p in self.param_grid) + + +def fit_grid_point(X, y, estimator, parameters, train, test, scorer, + verbose, **fit_params): + """Run fit on one set of parameters. + + Parameters + ---------- + X : array-like, sparse matrix or list + Input data. + + y : array-like or None + Targets for input data. + + estimator : estimator object + This estimator will be cloned and then fitted. + + parameters : dict + Parameters to be set on estimator for this grid point. + + train : ndarray, dtype int or bool + Boolean mask or indices for training set. + + test : ndarray, dtype int or bool + Boolean mask or indices for test set. + + scorer : callable or None. + If provided must be a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + verbose : int + Verbosity level. + + **fit_params : kwargs + Additional parameter passed to the fit function of the estimator. + + + Returns + ------- + score : float + Score of this parameter setting on given training / test split. + + parameters : dict + The parameters that have been evaluated. + + n_samples_test : int + Number of test samples in this split. + """ + score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train, + test, verbose, parameters, + fit_params) + return score, parameters, n_samples_test + + +class ParameterSampler(object): + """Generator on parameters sampled from given distributions. + + Non-deterministic iterable over random candidate combinations for hyper- + parameter search. 
+ + Note that as of SciPy 0.12, the ``scipy.stats.distributions`` do not accept + a custom RNG instance and always use the singleton RNG from + ``numpy.random``. Hence setting ``random_state`` will not guarantee a + deterministic iteration whenever ``scipy.stats`` distributions are used to + define the parameter search space. + + Parameters + ---------- + param_distributions : dict + Dictionary where the keys are parameters and values + are distributions from which a parameter is to be sampled. + Distributions either have to provide a ``rvs`` function + to sample from them, or can be given as a list of values, + where a uniform distribution is assumed. + + n_iter : integer + Number of parameter settings that are produced. + + random_state : int or RandomState + Pseudo random number generator state used for random uniform sampling + from lists of possible values instead of scipy.stats distributions. + + Returns + ------- + params : dict of string to any + **Yields** dictionaries mapping each estimator parameter to + as sampled value. + + Examples + -------- + >>> from sklearn.grid_search import ParameterSampler + >>> from scipy.stats.distributions import expon + >>> import numpy as np + >>> np.random.seed(0) + >>> param_grid = {'a':[1, 2], 'b': expon()} + >>> param_list = list(ParameterSampler(param_grid, n_iter=4)) + >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items()) + ... for d in param_list] + >>> rounded_list == [{'b': 0.89856, 'a': 1}, + ... {'b': 0.923223, 'a': 1}, + ... {'b': 1.878964, 'a': 2}, + ... {'b': 1.038159, 'a': 2}] + True + """ + def __init__(self, param_distributions, n_iter, random_state=None): + self.param_distributions = param_distributions + self.n_iter = n_iter + self.random_state = random_state + + def __iter__(self): + rnd = check_random_state(self.random_state) + # Always sort the keys of a dictionary, for reproducibility + items = sorted(self.param_distributions.items()) + for _ in range(self.n_iter): + params = dict() + for k, v in items: + if hasattr(v, "rvs"): + params[k] = v.rvs() + else: + params[k] = v[rnd.randint(len(v))] + yield params + + def __len__(self): + """Number of points that will be sampled.""" + return self.n_iter diff --git a/sklearn/model_selection/validate.py b/sklearn/model_selection/validate.py new file mode 100644 index 0000000000000..815064dfb6219 --- /dev/null +++ b/sklearn/model_selection/validate.py @@ -0,0 +1,652 @@ +""" +The :mod:`sklearn.model_selection.validate` module includes +""" +#TODO Complete docstring + +# Author: Alexandre Gramfort , +# Gael Varoquaux , +# Olivier Grisel +# Alexander Fabisch +# License: BSD 3 clause + +from __future__ import print_function +from __future__ import division + +import numbers +import time +import warnings + +import numpy as np + +from sklearn.base import is_classifier, clone +from sklearn.utils import check_arrays, check_random_state, safe_mask +from sklearn.utils.validation import _num_samples +from sklearn.externals.joblib import Parallel, delayed, logger +from sklearn.metrics.scorer import check_scoring +from sklearn.utils import check_arrays +from sklearn.utils.fixes import astype +from .partition import _check_cv + +def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, + verbose=0, fit_params=None, score_func=None, + pre_dispatch='2*n_jobs'): + """Evaluate a score by cross-validation + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : array-like + The data to fit. 
Can be, for example a list, or an array at least 2d. + + y : array-like, optional, default: None + The target variable to try to predict in the case of + supervised learning. + + scoring : string, callable or None, optional, default: None + A string (see model evaluation documentation) or + a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + cv : cross-validation generator, optional, default: None + A cross-validation generator. If None, a 3-fold cross + validation is used or 3-fold stratified cross-validation + when y is supplied and estimator is a classifier. + + n_jobs : integer, optional + The number of CPUs to use to do the computation. -1 means + 'all CPUs'. + + verbose : integer, optional + The verbosity level. + + fit_params : dict, optional + Parameters to pass to the fit method of the estimator. + + pre_dispatch : int, or string, optional + Controls the number of jobs that get dispatched during parallel + execution. Reducing this number can be useful to avoid an + explosion of memory consumption when more jobs get dispatched + than CPUs can process. This parameter can be: + + - None, in which case all the jobs are immediately + created and spawned. Use this for lightweight and + fast-running jobs, to avoid delays due to on-demand + spawning of the jobs + + - An int, giving the exact number of total jobs that are + spawned + + - A string, giving an expression as a function of n_jobs, + as in '2*n_jobs' + + Returns + ------- + scores : array of float, shape=(len(list(cv)),) + Array of scores of the estimator for each run of the cross validation. + """ + X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True, + allow_nans=True, allow_nd=True) + if y is not None: + y = np.asarray(y) + + cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, score_func=score_func, scoring=scoring) + # We clone the estimator to make sure that all the folds are + # independent, and that it is pickle-able. + parallel = Parallel(n_jobs=n_jobs, verbose=verbose, + pre_dispatch=pre_dispatch) + scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, + train, test, verbose, None, + fit_params) + for train, test in cv) + return np.array(scores)[:, 0] + + +def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, + fit_params, return_train_score=False, + return_parameters=False): + """Fit estimator and compute scores for a given dataset split. + + Parameters + ---------- + estimator : estimator object implementing 'fit' + The object to use to fit the data. + + X : array-like of shape at least 2D + The data to fit. + + y : array-like, optional, default: None + The target variable to try to predict in the case of + supervised learning. + + scoring : callable + A scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + train : array-like, shape = (n_train_samples,) + Indices of training samples. + + test : array-like, shape = (n_test_samples,) + Indices of test samples. + + verbose : integer + The verbosity level. + + parameters : dict or None + Parameters to be set on the estimator. + + fit_params : dict or None + Parameters that will be passed to ``estimator.fit``. + + return_train_score : boolean, optional, default: False + Compute and return score on training set. + + return_parameters : boolean, optional, default: False + Return parameters that has been used for the estimator. 
+ + Returns + ------- + train_score : float, optional + Score on training set, returned only if `return_train_score` is `True`. + + test_score : float + Score on test set. + + n_test_samples : int + Number of test samples. + + scoring_time : float + Time spent for fitting and scoring in seconds. + + parameters : dict or None, optional + The parameters that have been evaluated. + """ + if verbose > 1: + if parameters is None: + msg = "no parameters to be set" + else: + msg = '%s' % (', '.join('%s=%s' % (k, v) + for k, v in parameters.items())) + print("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) + + # Adjust lenght of sample weights + n_samples = _num_samples(X) + fit_params = fit_params if fit_params is not None else {} + fit_params = dict([(k, np.asarray(v)[train] + if hasattr(v, '__len__') and len(v) == n_samples else v) + for k, v in fit_params.items()]) + + if parameters is not None: + estimator.set_params(**parameters) + + start_time = time.time() + + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) + if y_train is None: + estimator.fit(X_train, **fit_params) + else: + estimator.fit(X_train, y_train, **fit_params) + test_score = _score(estimator, X_test, y_test, scorer) + if return_train_score: + train_score = _score(estimator, X_train, y_train, scorer) + + scoring_time = time.time() - start_time + + if verbose > 2: + msg += ", score=%f" % test_score + if verbose > 1: + end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time)) + print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) + + ret = [train_score] if return_train_score else [] + ret.extend([test_score, _num_samples(X_test), scoring_time]) + if return_parameters: + ret.append(parameters) + return ret + + +def _safe_split(estimator, X, y, indices, train_indices=None): + """Create subset of dataset and properly handle kernels.""" + if hasattr(estimator, 'kernel') and callable(estimator.kernel): + # cannot compute the kernel values with custom function + raise ValueError("Cannot use a custom kernel function. " + "Precompute the kernel matrix instead.") + + if not hasattr(X, "shape"): + if getattr(estimator, "_pairwise", False): + raise ValueError("Precomputed kernels or affinity matrices have " + "to be passed as arrays or sparse matrices.") + X_subset = [X[idx] for idx in indices] + else: + if getattr(estimator, "_pairwise", False): + # X is a precomputed square kernel matrix + if X.shape[0] != X.shape[1]: + raise ValueError("X should be a square kernel matrix") + if train_indices is None: + X_subset = X[np.ix_(indices, indices)] + else: + X_subset = X[np.ix_(indices, train_indices)] + else: + X_subset = X[safe_mask(X, indices)] + + if y is not None: + y_subset = y[safe_mask(y, indices)] + else: + y_subset = None + + return X_subset, y_subset + + +def _score(estimator, X_test, y_test, scorer): + """Compute the score of an estimator on a given test set.""" + if y_test is None: + score = scorer(estimator, X_test) + else: + score = scorer(estimator, X_test, y_test) + if not isinstance(score, numbers.Number): + raise ValueError("scoring must return a number, got %s (%s) instead." 
+                         % (str(score), type(score)))
+    return score
+
+
+def _permutation_test_score(estimator, X, y, cv, scorer):
+    """Auxiliary function for permutation_test_score"""
+    avg_score = []
+    for train, test in cv:
+        estimator.fit(X[train], y[train])
+        avg_score.append(scorer(estimator, X[test], y[test]))
+    return np.mean(avg_score)
+
+
+def _shuffle(y, labels, random_state):
+    """Return a shuffled copy of y, shuffled within labels if provided."""
+    if labels is None:
+        ind = random_state.permutation(len(y))
+    else:
+        ind = np.arange(len(labels))
+        for label in np.unique(labels):
+            this_mask = (labels == label)
+            ind[this_mask] = random_state.permutation(ind[this_mask])
+    return y[ind]
+
+
+def permutation_test_score(estimator, X, y, score_func=None, cv=None,
+                           n_permutations=100, n_jobs=1, labels=None,
+                           random_state=0, verbose=0, scoring=None):
+    """Evaluate the significance of a cross-validated score with permutations
+
+    Parameters
+    ----------
+    estimator : estimator object implementing 'fit'
+        The object to use to fit the data.
+
+    X : array-like of shape at least 2D
+        The data to fit.
+
+    y : array-like
+        The target variable to try to predict in the case of
+        supervised learning.
+
+    scoring : string, callable or None, optional, default: None
+        A string (see model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+
+    cv : integer or cross-validation generator, optional
+        If an integer is passed, it is the number of folds (default 3).
+        Specific cross-validation objects can be passed, see
+        sklearn.cross_validation module for the list of possible objects.
+
+    n_permutations : integer, optional
+        Number of times to permute ``y``.
+
+    n_jobs : integer, optional
+        The number of CPUs to use to do the computation. -1 means
+        'all CPUs'.
+
+    labels : array-like of shape [n_samples] (optional)
+        Labels constrain the permutation among groups of samples with
+        the same label.
+
+    random_state : RandomState or an int seed (0 by default)
+        A random number generator instance to define the state of the
+        random permutations generator.
+
+    verbose : integer, optional
+        The verbosity level.
+
+    Returns
+    -------
+    score : float
+        The true score without permuting targets.
+
+    permutation_scores : array, shape = [n_permutations]
+        The scores obtained for each permutation.
+
+    pvalue : float
+        The returned value equals p-value if `score_func` returns bigger
+        numbers for better scores (e.g., accuracy_score). If `score_func` is
+        rather a loss function (i.e. when lower is better such as with
+        `mean_squared_error`) then this is actually the complement of the
+        p-value: 1 - p-value.
+
+    Notes
+    -----
+    This function implements Test 1 in:
+
+        Ojala and Garriga. Permutation Tests for Studying Classifier
+        Performance. The Journal of Machine Learning Research (2010)
+        vol. 11
+
+    """
+    X, y = check_arrays(X, y, sparse_format='csr', allow_nans=True)
+    cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
+    scorer = check_scoring(estimator, scoring=scoring, score_func=score_func)
+    random_state = check_random_state(random_state)
+
+    # We clone the estimator to make sure that all the folds are
+    # independent, and that it is pickle-able.
+    score = _permutation_test_score(clone(estimator), X, y, cv, scorer)
+    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
+        delayed(_permutation_test_score)(
+            clone(estimator), X, _shuffle(y, labels, random_state), cv,
+            scorer)
+        for _ in range(n_permutations))
+    permutation_scores = np.array(permutation_scores)
+    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
+    return score, permutation_scores, pvalue
+
+
+permutation_test_score.__test__ = False  # to avoid a problem with nosetests
+
+
+def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5),
+                   cv=None, scoring=None, exploit_incremental_learning=False,
+                   n_jobs=1, pre_dispatch="all", verbose=0):
+    """Learning curve.
+
+    Determines cross-validated training and test scores for different training
+    set sizes.
+
+    A cross-validation generator splits the whole dataset k times into
+    training and test data. Subsets of the training set with varying sizes
+    will be used to train the estimator and a score for each training subset
+    size and the test set will be computed. Afterwards, the scores will be
+    averaged over all k runs for each training subset size.
+
+    Parameters
+    ----------
+    estimator : object type that implements the "fit" and "predict" methods
+        An object of that type which is cloned for each validation.
+
+    X : array-like, shape (n_samples, n_features)
+        Training vector, where n_samples is the number of samples and
+        n_features is the number of features.
+
+    y : array-like, shape (n_samples) or (n_samples, n_features), optional
+        Target relative to X for classification or regression;
+        None for unsupervised learning.
+
+    train_sizes : array-like, shape (n_ticks,), dtype float or int
+        Relative or absolute numbers of training examples that will be used to
+        generate the learning curve. If the dtype is float, it is regarded as a
+        fraction of the maximum size of the training set (that is determined
+        by the selected validation method), i.e. it has to be within (0, 1].
+        Otherwise it is interpreted as absolute sizes of the training sets.
+        Note that for classification the number of samples usually has to
+        be big enough to contain at least one sample from each class.
+        (default: np.linspace(0.1, 1.0, 5))
+
+    cv : integer, cross-validation generator, optional
+        If an integer is passed, it is the number of folds (defaults to 3).
+        Specific cross-validation objects can be passed, see
+        sklearn.cross_validation module for the list of possible objects
+
+    scoring : string, callable or None, optional, default: None
+        A string (see model evaluation documentation) or
+        a scorer callable object / function with signature
+        ``scorer(estimator, X, y)``.
+
+    exploit_incremental_learning : boolean, optional, default: False
+        If the estimator supports incremental learning, this will be
+        used to speed up fitting for different training set sizes.
+
+    n_jobs : integer, optional
+        Number of jobs to run in parallel (default 1).
+
+    pre_dispatch : integer or string, optional
+        Number of predispatched jobs for parallel execution (default is
+        all). The option can reduce the allocated memory. The string can
+        be an expression like '2*n_jobs'.
+
+    verbose : integer, optional
+        Controls the verbosity: the higher, the more messages.
+
+    Returns
+    -------
+    train_sizes_abs : array, shape = (n_unique_ticks,), dtype int
+        Numbers of training examples that have been used to generate the
+        learning curve. Note that the number of ticks might be less
+        than n_ticks because duplicate entries will be removed.
+ + train_scores : array, shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array, shape (n_ticks, n_cv_folds) + Scores on test set. + + Notes + ----- + See :ref:`examples/plot_learning_curve.py ` + """ + if exploit_incremental_learning and not hasattr(estimator, "partial_fit"): + raise ValueError("An estimator must support the partial_fit interface " + "to exploit incremental learning") + + X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) + # Make a list since we will be iterating multiple times over the folds + cv = list(_check_cv(cv, X, y, classifier=is_classifier(estimator))) + scorer = check_scoring(estimator, scoring=scoring) + + # HACK as long as boolean indices are allowed in cv generators + if cv[0][0].dtype == bool: + new_cv = [] + for i in range(len(cv)): + new_cv.append((np.nonzero(cv[i][0])[0], np.nonzero(cv[i][1])[0])) + cv = new_cv + + n_max_training_samples = len(cv[0][0]) + # Because the lengths of folds can be significantly different, it is + # not guaranteed that we use all of the available training data when we + # use the first 'n_max_training_samples' samples. + train_sizes_abs = _translate_train_sizes(train_sizes, + n_max_training_samples) + n_unique_ticks = train_sizes_abs.shape[0] + if verbose > 0: + print("[learning_curve] Training set sizes: " + str(train_sizes_abs)) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, + verbose=verbose) + if exploit_incremental_learning: + classes = np.unique(y) if is_classifier(estimator) else None + out = parallel(delayed(_incremental_fit_estimator)( + clone(estimator), X, y, classes, train, test, train_sizes_abs, + scorer, verbose) for train, test in cv) + else: + out = parallel(delayed(_fit_and_score)( + clone(estimator), X, y, scorer, train[:n_train_samples], test, + verbose, parameters=None, fit_params=None, return_train_score=True) + for train, test in cv for n_train_samples in train_sizes_abs) + out = np.array(out)[:, :2] + n_cv_folds = out.shape[0] // n_unique_ticks + out = out.reshape(n_cv_folds, n_unique_ticks, 2) + + out = np.asarray(out).transpose((2, 1, 0)) + + return train_sizes_abs, out[0], out[1] + + +def _translate_train_sizes(train_sizes, n_max_training_samples): + """Determine absolute sizes of training subsets and validate 'train_sizes'. + + Examples: + _translate_train_sizes([0.5, 1.0], 10) -> [5, 10] + _translate_train_sizes([5, 10], 10) -> [5, 10] + + Parameters + ---------- + train_sizes : array-like, shape (n_ticks,), dtype float or int + Numbers of training examples that will be used to generate the + learning curve. If the dtype is float, it is regarded as a + fraction of 'n_max_training_samples', i.e. it has to be within (0, 1]. + + n_max_training_samples : int + Maximum number of training samples (upper bound of 'train_sizes'). + + Returns + ------- + train_sizes_abs : array, shape (n_unique_ticks,), dtype int + Numbers of training examples that will be used to generate the + learning curve. Note that the number of ticks might be less + than n_ticks because duplicate entries will be removed. 
+ """ + train_sizes_abs = np.asarray(train_sizes) + n_ticks = train_sizes_abs.shape[0] + n_min_required_samples = np.min(train_sizes_abs) + n_max_required_samples = np.max(train_sizes_abs) + if np.issubdtype(train_sizes_abs.dtype, np.float): + if n_min_required_samples <= 0.0 or n_max_required_samples > 1.0: + raise ValueError("train_sizes has been interpreted as fractions " + "of the maximum number of training samples and " + "must be within (0, 1], but is within [%f, %f]." + % (n_min_required_samples, + n_max_required_samples)) + train_sizes_abs = astype(train_sizes_abs * n_max_training_samples, + dtype=np.int, copy=False) + train_sizes_abs = np.clip(train_sizes_abs, 1, + n_max_training_samples) + else: + if (n_min_required_samples <= 0 or + n_max_required_samples > n_max_training_samples): + raise ValueError("train_sizes has been interpreted as absolute " + "numbers of training samples and must be within " + "(0, %d], but is within [%d, %d]." + % (n_max_training_samples, + n_min_required_samples, + n_max_required_samples)) + + train_sizes_abs = np.unique(train_sizes_abs) + if n_ticks > train_sizes_abs.shape[0]: + warnings.warn("Removed duplicate entries from 'train_sizes'. Number " + "of ticks will be less than than the size of " + "'train_sizes' %d instead of %d)." + % (train_sizes_abs.shape[0], n_ticks), RuntimeWarning) + + return train_sizes_abs + + +def _incremental_fit_estimator(estimator, X, y, classes, train, test, + train_sizes, scorer, verbose): + """Train estimator on training subsets incrementally and compute scores.""" + train_scores, test_scores = [], [] + partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) + for n_train_samples, partial_train in partitions: + train_subset = train[:n_train_samples] + X_train, y_train = _safe_split(estimator, X, y, train_subset) + X_partial_train, y_partial_train = _safe_split(estimator, X, y, + partial_train) + X_test, y_test = _safe_split(estimator, X, y, test, train_subset) + if y_partial_train is None: + estimator.partial_fit(X_partial_train, classes=classes) + else: + estimator.partial_fit(X_partial_train, y_partial_train, + classes=classes) + train_scores.append(_score(estimator, X_train, y_train, scorer)) + test_scores.append(_score(estimator, X_test, y_test, scorer)) + return np.array((train_scores, test_scores)).T + + +def validation_curve(estimator, X, y, param_name, param_range, cv=None, + scoring=None, n_jobs=1, pre_dispatch="all", verbose=0): + """Validation curve. + + Determine training and test scores for varying parameter values. + + Compute scores for an estimator with different values of a specified + parameter. This is similar to grid search with one parameter. However, this + will also compute training scores and is merely a utility for plotting the + results. + + Parameters + ---------- + estimator : object type that implements the "fit" and "predict" methods + An object of that type which is cloned for each validation. + + X : array-like, shape (n_samples, n_features) + Training vector, where n_samples is the number of samples and + n_features is the number of features. + + y : array-like, shape (n_samples) or (n_samples, n_features), optional + Target relative to X for classification or regression; + None for unsupervised learning. + + param_name : string + Name of the parameter that will be varied. + + param_range : array-like, shape (n_values,) + The values of the parameter that will be evaluated. 
+ + cv : integer, cross-validation generator, optional + If an integer is passed, it is the number of folds (defaults to 3). + Specific cross-validation objects can be passed, see + sklearn.cross_validation module for the list of possible objects + + scoring : string, callable or None, optional, default: None + A string (see model evaluation documentation) or + a scorer callable object / function with signature + ``scorer(estimator, X, y)``. + + n_jobs : integer, optional + Number of jobs to run in parallel (default 1). + + pre_dispatch : integer or string, optional + Number of predispatched jobs for parallel execution (default is + all). The option can reduce the allocated memory. The string can + be an expression like '2*n_jobs'. + + verbose : integer, optional + Controls the verbosity: the higher, the more messages. + + Returns + ------- + train_scores : array, shape (n_ticks, n_cv_folds) + Scores on training sets. + + test_scores : array, shape (n_ticks, n_cv_folds) + Scores on test set. + + Notes + ----- + See + :ref:`examples/plot_validation_curve.py ` + """ + X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True) + cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) + scorer = check_scoring(estimator, scoring=scoring) + + parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, + verbose=verbose) + out = parallel(delayed(_fit_and_score)( + estimator, X, y, scorer, train, test, verbose, + parameters={param_name: v}, fit_params=None, return_train_score=True) + for train, test in cv for v in param_range) + + out = np.asarray(out)[:, :2] + n_params = len(param_range) + n_cv_folds = out.shape[0] // n_params + out = out.reshape(n_cv_folds, n_params, 2).transpose((2, 1, 0)) + + return out[0], out[1] diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index 8a27d9c5a7872..73bdae2641442 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -19,6 +19,7 @@ from sklearn.utils.testing import ignore_warnings from sklearn import cross_validation as cval +from sklearn.model_selection.validate import _safe_split from sklearn.base import BaseEstimator from sklearn.datasets import make_regression from sklearn.datasets import load_digits @@ -885,12 +886,12 @@ def test_safe_split_with_precomputed_kernel(): cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0) tr, te = list(cv)[0] - X_tr, y_tr = cval._safe_split(clf, X, y, tr) - K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr) + X_tr, y_tr = _safe_split(clf, X, y, tr) + K_tr, y_tr2 = _safe_split(clfp, K, y, tr) assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T)) - X_te, y_te = cval._safe_split(clf, X, y, te, tr) - K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr) + X_te, y_te = _safe_split(clf, X, y, te, tr) + K_te, y_te2 = _safe_split(clfp, K, y, te, tr) assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
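
To exercise the refactored layout end to end, the following is a minimal smoke-test sketch, not part of the patch itself. It assumes the sklearn.model_selection.search, sklearn.model_selection.utils and sklearn.model_selection.validate modules are importable exactly as laid out in the diff above; the iris dataset and SVC estimator are purely illustrative choices.

# Minimal, hypothetical smoke test for the new model_selection modules.
# Import paths are taken from the patch above; data/estimator are illustrative.
from scipy.stats import expon

from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection.search import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection.utils import ParameterGrid
from sklearn.model_selection.validate import cross_val_score

iris = load_iris()
X, y = iris.data, iris.target

# ParameterGrid expands a dict of lists into every parameter combination.
param_grid = {'kernel': ['linear', 'rbf'], 'C': [1, 10]}
print(len(list(ParameterGrid(param_grid))))  # 4 candidate settings

# Exhaustive search: every candidate is scored with the default 3-fold CV.
grid_search = GridSearchCV(SVC(), param_grid).fit(X, y)
print(grid_search.best_params_, grid_search.best_score_)

# Randomized search: a fixed number of settings sampled from a distribution.
rand_search = RandomizedSearchCV(SVC(), {'C': expon(scale=10)}, n_iter=5,
                                 random_state=0).fit(X, y)
print(rand_search.best_params_)

# cross_val_score from the new validate module: one score per CV fold.
print(cross_val_score(SVC(), X, y, cv=5))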