From e313f81c93ac1d6d913ea7cad0827d9f08eea9c3 Mon Sep 17 00:00:00 2001 From: Noel Dawe Date: Sun, 6 Apr 2014 15:08:37 -0700 Subject: [PATCH 1/5] grid_search: add sample_weight support --- sklearn/cross_validation.py | 31 ++++++++++++---- sklearn/grid_search.py | 49 ++++++++++++++++++-------- sklearn/tests/test_cross_validation.py | 8 ++--- sklearn/tests/test_grid_search.py | 27 ++++++++++++-- 4 files changed, 87 insertions(+), 28 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index ebcf4f934f043..bb0d7cc85c226 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1150,7 +1150,8 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, return np.array(scores)[:, 0] -def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, +def _fit_and_score(estimator, X, y, sample_weight, + scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False): """Fit estimator and compute scores for a given dataset split. @@ -1163,10 +1164,13 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, X : array-like of shape at least 2D The data to fit. - y : array-like, optional, default: None + y : array-like or None The target variable to try to predict in the case of supervised learning. + sample_weight : array-like or None + Sample weights. + scoring : callable A scorer callable object / function with signature ``scorer(estimator, X, y)``. @@ -1231,13 +1235,26 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, X_train, y_train = _safe_split(estimator, X, y, train) X_test, y_test = _safe_split(estimator, X, y, test, train) + + test_score_params = dict() + train_score_params = dict() + if sample_weight is not None: + # move to _safe_split? + sample_weight_train = sample_weight[safe_mask(sample_weight, train)] + sample_weight_test = sample_weight[safe_mask(sample_weight, test)] + fit_params['sample_weight'] = sample_weight_train + test_score_params['sample_weight'] = sample_weight_test + train_score_params['sample_weight'] = sample_weight_train + if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) - test_score = _score(estimator, X_test, y_test, scorer) + test_score = _score(estimator, X_test, y_test, scorer, + **test_score_params) if return_train_score: - train_score = _score(estimator, X_train, y_train, scorer) + train_score = _score(estimator, X_train, y_train, scorer, + **train_score_params) scoring_time = time.time() - start_time @@ -1286,12 +1303,12 @@ def _safe_split(estimator, X, y, indices, train_indices=None): return X_subset, y_subset -def _score(estimator, X_test, y_test, scorer): +def _score(estimator, X_test, y_test, scorer, **params): """Compute the score of an estimator on a given test set.""" if y_test is None: - score = scorer(estimator, X_test) + score = scorer(estimator, X_test, **params) else: - score = scorer(estimator, X_test, y_test) + score = scorer(estimator, X_test, y_test, **params) if not isinstance(score, numbers.Number): raise ValueError("scoring must return a number, got %s (%s) instead." 
% (str(score), type(score))) diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 280dbb32b1e54..65c32fef6109b 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -8,6 +8,7 @@ # Gael Varoquaux # Andreas Mueller # Olivier Grisel +# Noel Dawe # License: BSD 3 clause from abc import ABCMeta, abstractmethod @@ -226,7 +227,8 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer, n_samples_test : int Number of test samples in this split. """ - score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train, + score, n_samples_test, _ = _fit_and_score(estimator, X, y, None, + scorer, train, test, verbose, parameters, fit_params) return score, parameters, n_samples_test @@ -291,7 +293,7 @@ def __init__(self, estimator, scoring=None, self.verbose = verbose self.pre_dispatch = pre_dispatch - def score(self, X, y=None): + def score(self, X, y=None, sample_weight=None): """Returns the score on the given test data and labels, if the search estimator has been refit. The ``score`` function of the best estimator is used, or the ``scoring`` parameter where unavailable. @@ -306,18 +308,24 @@ def score(self, X, y=None): Target relative to X for classification or regression; None for unsupervised learning. + sample_weight : array-like, shape = [n_samples], optional + Sample weights. + Returns ------- score : float """ + kwargs = {} + if sample_weight is not None: + kwargs['sample_weight'] = sample_weight if hasattr(self.best_estimator_, 'score'): - return self.best_estimator_.score(X, y) + return self.best_estimator_.score(X, y, **kwargs) if self.scorer_ is None: raise ValueError("No score function explicitly defined, " "and the estimator doesn't provide one %s" % self.best_estimator_) - return self.scorer_(self.best_estimator_, X, y) + return self.scorer_(self.best_estimator_, X, y, **kwargs) @property def predict(self): @@ -335,7 +343,7 @@ def decision_function(self): def transform(self): return self.best_estimator_.transform - def _fit(self, X, y, parameter_iterable): + def _fit(self, X, y, sample_weight, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator @@ -343,13 +351,14 @@ def _fit(self, X, y, parameter_iterable): self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) - X, y = indexable(X, y) + X, y, sample_weight = indexable(X, y, sample_weight) if y is not None: if len(y) != n_samples: raise ValueError('Target variable (y) has a different number ' 'of samples (%i) than data (X: %i samples)' % (len(y), n_samples)) + cv = check_cv(cv, X, y, classifier=is_classifier(estimator)) if self.verbose > 0: @@ -367,9 +376,10 @@ def _fit(self, X, y, parameter_iterable): n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( - delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, - train, test, self.verbose, parameters, - self.fit_params, return_parameters=True) + delayed(_fit_and_score)(clone(base_estimator), X, y, sample_weight, + self.scorer_, train, test, + self.verbose, parameters, self.fit_params, + return_parameters=True) for parameters in parameter_iterable for train, test in cv) @@ -411,14 +421,18 @@ def _fit(self, X, y, parameter_iterable): self.best_score_ = best.mean_validation_score if self.refit: + fit_params = self.fit_params + if sample_weight is not None: + fit_params = fit_params.copy() + fit_params['sample_weight'] = sample_weight # fit the best estimator using the entire dataset # clone first to work around broken 
estimators best_estimator = clone(base_estimator).set_params( **best.parameters) if y is not None: - best_estimator.fit(X, y, **self.fit_params) + best_estimator.fit(X, y, **fit_params) else: - best_estimator.fit(X, **self.fit_params) + best_estimator.fit(X, **fit_params) self.best_estimator_ = best_estimator return self @@ -573,7 +587,7 @@ def __init__(self, estimator, param_grid, scoring=None, self.param_grid = param_grid _check_param_grid(param_grid) - def fit(self, X, y=None): + def fit(self, X, y=None, sample_weight=None): """Run fit with all sets of parameters. Parameters @@ -587,8 +601,10 @@ def fit(self, X, y=None): Target relative to X for classification or regression; None for unsupervised learning. + sample_weight : array-like, shape = [n_samples], optional + Sample weights. """ - return self._fit(X, y, ParameterGrid(self.param_grid)) + return self._fit(X, y, sample_weight, ParameterGrid(self.param_grid)) class RandomizedSearchCV(BaseSearchCV): @@ -724,7 +740,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch) - def fit(self, X, y=None): + def fit(self, X, y=None, sample_weight=None): """Run fit on the estimator with randomly drawn parameters. Parameters @@ -737,8 +753,11 @@ def fit(self, X, y=None): Target relative to X for classification or regression; None for unsupervised learning. + sample_weight : array-like, shape = [n_samples], optional + Sample weights. + """ sampled_params = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) - return self._fit(X, y, sampled_params) + return self._fit(X, y, sample_weight, sampled_params) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index e3af30a1b2bae..a495871290ce4 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -914,12 +914,12 @@ def test_safe_split_with_precomputed_kernel(): cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0) tr, te = list(cv)[0] - X_tr, y_tr = cval._safe_split(clf, X, y, tr) - K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr) + X_tr, y_tr, _ = cval._safe_split(clf, X, y, None, tr) + K_tr, y_tr2, _ = cval._safe_split(clfp, K, y, None, tr) assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T)) - X_te, y_te = cval._safe_split(clf, X, y, te, tr) - K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr) + X_te, y_te, _ = cval._safe_split(clf, X, y, None, te, tr) + K_te, y_te2, _ = cval._safe_split(clfp, K, y, None, te, tr) assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T)) diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index 02183d18cd2fc..b1573abbb4fb7 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -51,8 +51,13 @@ class MockClassifier(object): def __init__(self, foo_param=0): self.foo_param = foo_param - def fit(self, X, Y): + def fit(self, X, Y, sample_weight=None): assert_true(len(X) == len(Y)) + if sample_weight is not None: + assert_true(len(sample_weight) == len(X), + 'MockClassifier sample_weight.shape[0]' + ' is {0}, should be {1}'.format(len(sample_weight), + len(X))) return self def predict(self, T): @@ -62,7 +67,12 @@ def predict(self, T): decision_function = predict transform = predict - def score(self, X=None, Y=None): + def score(self, X=None, Y=None, sample_weight=None): + if X is not None and sample_weight is not None: + assert_true(len(sample_weight) == len(X), + 
'MockClassifier sample_weight.shape[0]' + ' is {0}, should be {1}'.format(len(sample_weight), + len(X))) if self.foo_param > 1: score = 1. else: @@ -85,6 +95,7 @@ def score(self): X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]]) y = np.array([1, 1, 2, 2]) +sample_weight = np.array([1, 2, 3, 4]) def test_parameter_grid(): @@ -638,3 +649,15 @@ def test_grid_search_allows_nans(): ('classifier', MockClassifier()), ]) GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y) + + +def test_grid_search_with_sample_weights(): + """Test grid searching with sample weights""" + est_parameters = {"foo_param": [1, 2, 3]} + cv = KFold(y.shape[0], n_folds=2, random_state=0) + for search_cls in (GridSearchCV, RandomizedSearchCV): + grid_search = search_cls(MockClassifier(), est_parameters, cv=cv) + grid_search.fit(X, y, sample_weight=sample_weight) + # check that sample_weight can be a list + grid_search = GridSearchCV(MockClassifier(), est_parameters, cv=cv) + grid_search.fit(X, y, sample_weight=sample_weight.tolist()) From 5816618981e334fd58fc49b44e1f5e054f137f59 Mon Sep 17 00:00:00 2001 From: Noel Dawe Date: Mon, 21 Apr 2014 18:53:08 -0700 Subject: [PATCH 2/5] cross_validation: add sample_weight support --- sklearn/cross_validation.py | 34 +++++++++++++++++--------- sklearn/tests/test_cross_validation.py | 16 ++++++++---- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index bb0d7cc85c226..64ed338ffdc2b 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1077,7 +1077,8 @@ def __len__(self): def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, - verbose=0, fit_params=None, pre_dispatch='2*n_jobs'): + verbose=0, fit_params=None, pre_dispatch='2*n_jobs', + sample_weight=None): """Evaluate a score by cross-validation Parameters @@ -1092,6 +1093,9 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, The target variable to try to predict in the case of supervised learning. + sample_weight : array-like, optional, default: None + Sample weights. + scoring : string, callable or None, optional, default: None A string (see model evaluation documentation) or a scorer callable object / function with signature @@ -1135,7 +1139,7 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, scores : array of float, shape=(len(list(cv)),) Array of scores of the estimator for each run of the cross validation. """ - X, y = indexable(X, y) + X, y, sample_weight = indexable(X, y, sample_weight) cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) scorer = check_scoring(estimator, scoring=scoring) @@ -1143,7 +1147,8 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, # independent, and that it is pickle-able. 
parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) - scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer, + scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, + sample_weight, scorer, train, test, verbose, None, fit_params) for train, test in cv) @@ -1233,15 +1238,15 @@ def _fit_and_score(estimator, X, y, sample_weight, start_time = time.time() - X_train, y_train = _safe_split(estimator, X, y, train) - X_test, y_test = _safe_split(estimator, X, y, test, train) + X_train, y_train, sample_weight_train = _safe_split( + estimator, X, y, sample_weight, train) + X_test, y_test, sample_weight_test = _safe_split( + estimator, X, y, sample_weight, test, train) - test_score_params = dict() - train_score_params = dict() + test_score_params = {} + train_score_params = {} if sample_weight is not None: - # move to _safe_split? - sample_weight_train = sample_weight[safe_mask(sample_weight, train)] - sample_weight_test = sample_weight[safe_mask(sample_weight, test)] + fit_params = fit_params.copy() fit_params['sample_weight'] = sample_weight_train test_score_params['sample_weight'] = sample_weight_test train_score_params['sample_weight'] = sample_weight_train @@ -1271,7 +1276,7 @@ def _fit_and_score(estimator, X, y, sample_weight, return ret -def _safe_split(estimator, X, y, indices, train_indices=None): +def _safe_split(estimator, X, y, sample_weight, indices, train_indices=None): """Create subset of dataset and properly handle kernels.""" if hasattr(estimator, 'kernel') and callable(estimator.kernel): # cannot compute the kernel values with custom function @@ -1300,7 +1305,12 @@ def _safe_split(estimator, X, y, indices, train_indices=None): else: y_subset = None - return X_subset, y_subset + if sample_weight is not None: + sample_weight_subset = np.asarray(sample_weight)[indices] + else: + sample_weight_subset = None + + return X_subset, y_subset, sample_weight_subset def _score(estimator, X_test, y_test, scorer, **params): diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index a495871290ce4..bdb791ad080f7 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -54,9 +54,9 @@ def fit(self, X, Y=None, sample_weight=None, class_prior=None): if X.ndim >= 3 and not self.allow_nd: raise ValueError('X cannot be d') if sample_weight is not None: - assert_true(sample_weight.shape[0] == X.shape[0], + assert_true(len(sample_weight) == X.shape[0], 'MockClassifier extra fit_param sample_weight.shape[0]' - ' is {0}, should be {1}'.format(sample_weight.shape[0], + ' is {0}, should be {1}'.format(len(sample_weight), X.shape[0])) if class_prior is not None: assert_true(class_prior.shape[0] == len(np.unique(y)), @@ -70,13 +70,15 @@ def predict(self, T): T = T.reshape(len(T), -1) return T.shape[0] - def score(self, X=None, Y=None): + def score(self, X=None, Y=None, sample_weight=None): return 1. 
/ (1 + np.abs(self.a)) X = np.ones((10, 2)) X_sparse = coo_matrix(X) y = np.arange(10) // 2 +rng = np.random.RandomState(0) +int_weights = rng.randint(10, size=y.shape) ############################################################################## # Tests @@ -466,8 +468,8 @@ def test_cross_val_score(): for a in range(-10, 10): clf.a = a # Smoke test - scores = cval.cross_val_score(clf, X, y) - assert_array_equal(scores, clf.score(X, y)) + scores = cval.cross_val_score(clf, X, y, sample_weight=int_weights) + assert_array_equal(scores, clf.score(X, y, sample_weight=int_weights)) # test with multioutput y scores = cval.cross_val_score(clf, X_sparse, X) @@ -480,6 +482,10 @@ def test_cross_val_score(): scores = cval.cross_val_score(clf, X_sparse, X) assert_array_equal(scores, clf.score(X_sparse, X)) + # test with sample_weight as list + scores = cval.cross_val_score( + clf, X, y, sample_weight=int_weights.tolist()) + # test with X and y as list list_check = lambda x: isinstance(x, list) clf = CheckingClassifier(check_X=list_check) From 1e5c53687bfa9e0cc5b6ea7ba1b94b6ad60b0dbf Mon Sep 17 00:00:00 2001 From: Noel Dawe Date: Mon, 21 Apr 2014 18:31:40 -0700 Subject: [PATCH 3/5] rfe: sample_weight support --- sklearn/feature_selection/rfe.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index 86c56e1f3264a..c3fd283b1ca34 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -306,7 +306,7 @@ def __init__(self, estimator, step=1, cv=None, scoring=None, self.estimator_params = estimator_params self.verbose = verbose - def fit(self, X, y): + def fit(self, X, y, sample_weight=None): """Fit the RFE model and automatically tune the number of selected features. @@ -319,6 +319,9 @@ def fit(self, X, y): y : array-like, shape = [n_samples] Target values (integers for classification, real numbers for regression). + + sample_weight : array-like, shape = [n_samples], optional (default=None) + Sample weights. 
""" X, y = check_X_y(X, y, "csr") # Initialization @@ -332,17 +335,26 @@ def fit(self, X, y): # Cross-validation for n, (train, test) in enumerate(cv): - X_train, y_train = _safe_split(self.estimator, X, y, train) - X_test, y_test = _safe_split(self.estimator, X, y, test, train) + X_train, y_train, sample_weight_train = _safe_split( + self.estimator, X, y, sample_weight, train) + X_test, y_test, sample_weight_test = _safe_split( + self.estimator, X, y, sample_weight, test, train) + + fit_params = dict() + score_params = dict() + if sample_weight is not None: + fit_params['sample_weight'] = sample_weight_train + score_params['sample_weight'] = sample_weight_test # Compute a full ranking of the features - ranking_ = rfe.fit(X_train, y_train).ranking_ + ranking_ = rfe.fit(X_train, y_train, **fit_params).ranking_ # Score each subset of features for k in range(0, max(ranking_)): mask = np.where(ranking_ <= k + 1)[0] estimator = clone(self.estimator) - estimator.fit(X_train[:, mask], y_train) - score = _score(estimator, X_test[:, mask], y_test, scorer) + estimator.fit(X_train[:, mask], y_train, **fit_params) + score = _score( + estimator, X_test[:, mask], y_test, scorer, **score_params) if self.verbose > 0: print("Finished fold with %d / %d feature ranks, score=%f" @@ -358,7 +370,10 @@ def fit(self, X, y): n_features_to_select=k+1, step=self.step, estimator_params=self.estimator_params) - rfe.fit(X, y) + if sample_weight is not None: + rfe.fit(X, y, sample_weight=sample_weight) + else: + rfe.fit(X, y) # Set final attributes self.support_ = rfe.support_ From 7612a1fe5b19a70d116959c79692b5b37366ac98 Mon Sep 17 00:00:00 2001 From: Noel Dawe Date: Mon, 21 Apr 2014 18:32:01 -0700 Subject: [PATCH 4/5] learning_curve: sample_weight support --- sklearn/learning_curve.py | 55 ++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 55c4cf6547d86..835a4ec9ac398 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -17,7 +17,8 @@ from .utils.fixes import astype -def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), +def learning_curve(estimator, X, y, sample_weight=None, + train_sizes=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=1, pre_dispatch="all", verbose=0): """Learning curve. @@ -44,6 +45,9 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), Target relative to X for classification or regression; None for unsupervised learning. + sample_weight : array-like, shape (n_samples), optional + Sample weights. + train_sizes : array-like, shape (n_ticks,), dtype float or int Relative or absolute numbers of training examples that will be used to generate the learning curve. 
If the dtype is float, it is regarded as a @@ -128,11 +132,13 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), if exploit_incremental_learning: classes = np.unique(y) if is_classifier(estimator) else None out = parallel(delayed(_incremental_fit_estimator)( - clone(estimator), X, y, classes, train, test, train_sizes_abs, + clone(estimator), X, y, sample_weight, + classes, train, test, train_sizes_abs, scorer, verbose) for train, test in cv) else: out = parallel(delayed(_fit_and_score)( - clone(estimator), X, y, scorer, train[:n_train_samples], test, + clone(estimator), X, y, sample_weight, + scorer, train[:n_train_samples], test, verbose, parameters=None, fit_params=None, return_train_score=True) for train, test in cv for n_train_samples in train_sizes_abs) out = np.array(out)[:, :2] @@ -203,29 +209,45 @@ def _translate_train_sizes(train_sizes, n_max_training_samples): return train_sizes_abs -def _incremental_fit_estimator(estimator, X, y, classes, train, test, +def _incremental_fit_estimator(estimator, X, y, sample_weight, + classes, train, test, train_sizes, scorer, verbose): """Train estimator on training subsets incrementally and compute scores.""" train_scores, test_scores = [], [] partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) for n_train_samples, partial_train in partitions: train_subset = train[:n_train_samples] - X_train, y_train = _safe_split(estimator, X, y, train_subset) - X_partial_train, y_partial_train = _safe_split(estimator, X, y, - partial_train) - X_test, y_test = _safe_split(estimator, X, y, test, train_subset) + X_train, y_train, sample_weight_train = _safe_split( + estimator, X, y, sample_weight, train_subset) + X_partial_train, y_partial_train, sample_weight_partial_train = \ + _safe_split(estimator, X, y, sample_weight, partial_train) + X_test, y_test, sample_weight_test = _safe_split( + estimator, X, y, sample_weight, test, train_subset) + + fit_params = dict() + train_score_params = dict() + test_score_params = dict() + if sample_weight is not None: + fit_params['sample_weight'] = sample_weight_partial_train + train_score_params['sample_weight'] = sample_weight_train + test_score_params['sample_weight'] = sample_weight_test + if y_partial_train is None: - estimator.partial_fit(X_partial_train, classes=classes) + estimator.partial_fit(X_partial_train, + classes=classes, **fit_params) else: estimator.partial_fit(X_partial_train, y_partial_train, - classes=classes) - train_scores.append(_score(estimator, X_train, y_train, scorer)) - test_scores.append(_score(estimator, X_test, y_test, scorer)) + classes=classes, **fit_params) + train_scores.append(_score( + estimator, X_train, y_train, scorer, **train_score_params)) + test_scores.append(_score( + estimator, X_test, y_test, scorer, **test_score_params)) return np.array((train_scores, test_scores)).T -def validation_curve(estimator, X, y, param_name, param_range, cv=None, - scoring=None, n_jobs=1, pre_dispatch="all", verbose=0): +def validation_curve(estimator, X, y, param_name, param_range, + sample_weight=None, cv=None, scoring=None, + n_jobs=1, pre_dispatch="all", verbose=0): """Validation curve. Determine training and test scores for varying parameter values. @@ -254,6 +276,9 @@ def validation_curve(estimator, X, y, param_name, param_range, cv=None, param_range : array-like, shape (n_values,) The values of the parameter that will be evaluated. + sample_weight : array-like, shape (n_samples,), optional + Sample weights. 
+ cv : integer, cross-validation generator, optional If an integer is passed, it is the number of folds (defaults to 3). Specific cross-validation objects can be passed, see @@ -296,7 +321,7 @@ def validation_curve(estimator, X, y, param_name, param_range, cv=None, parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) out = parallel(delayed(_fit_and_score)( - estimator, X, y, scorer, train, test, verbose, + estimator, X, y, sample_weight, scorer, train, test, verbose, parameters={param_name: v}, fit_params=None, return_train_score=True) for train, test in cv for v in param_range) From f1f6a3cb9beab85e1f108a463f8a9fea6af205f6 Mon Sep 17 00:00:00 2001 From: Vlad Niculae Date: Fri, 1 Aug 2014 16:11:30 +0200 Subject: [PATCH 5/5] Refactor sample_weights as generic scorer_params --- sklearn/cross_validation.py | 67 ++++++++++++-------------- sklearn/feature_selection/rfe.py | 13 ++--- sklearn/grid_search.py | 52 ++++++++------------ sklearn/learning_curve.py | 48 ++++++++++++------ sklearn/tests/test_cross_validation.py | 15 +++--- sklearn/tests/test_grid_search.py | 12 +++-- sklearn/tests/test_learning_curve.py | 4 +- 7 files changed, 110 insertions(+), 101 deletions(-) diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py index 64ed338ffdc2b..22e370de35577 100644 --- a/sklearn/cross_validation.py +++ b/sklearn/cross_validation.py @@ -1078,7 +1078,7 @@ def __len__(self): def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', - sample_weight=None): + scorer_params=None): """Evaluate a score by cross-validation Parameters @@ -1093,9 +1093,6 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, The target variable to try to predict in the case of supervised learning. - sample_weight : array-like, optional, default: None - Sample weights. - scoring : string, callable or None, optional, default: None A string (see model evaluation documentation) or a scorer callable object / function with signature @@ -1134,12 +1131,16 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, - A string, giving an expression as a function of n_jobs, as in '2*n_jobs' + scorer_params : dict, optional + Parameters to pass to the scorer. Can be used for sample weights + and sample groups. + Returns ------- scores : array of float, shape=(len(list(cv)),) Array of scores of the estimator for each run of the cross validation. """ - X, y, sample_weight = indexable(X, y, sample_weight) + X, y = indexable(X, y) cv = _check_cv(cv, X, y, classifier=is_classifier(estimator)) scorer = check_scoring(estimator, scoring=scoring) @@ -1148,16 +1149,14 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1, parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch) scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, - sample_weight, scorer, - train, test, verbose, None, - fit_params) + scorer, train, test, verbose, + None, fit_params, scorer_params) for train, test in cv) return np.array(scores)[:, 0] -def _fit_and_score(estimator, X, y, sample_weight, - scorer, train, test, verbose, parameters, - fit_params, return_train_score=False, +def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, + fit_params, scorer_params, return_train_score=False, return_parameters=False): """Fit estimator and compute scores for a given dataset split. 
@@ -1173,9 +1172,6 @@ def _fit_and_score(estimator, X, y, sample_weight, The target variable to try to predict in the case of supervised learning. - sample_weight : array-like or None - Sample weights. - scoring : callable A scorer callable object / function with signature ``scorer(estimator, X, y)``. @@ -1195,6 +1191,9 @@ def _fit_and_score(estimator, X, y, sample_weight, fit_params : dict or None Parameters that will be passed to ``estimator.fit``. + scorer_params : dict or None + Parameters that will be passed to the scorer. + return_train_score : boolean, optional, default: False Compute and return score on training set. @@ -1233,33 +1232,36 @@ def _fit_and_score(estimator, X, y, sample_weight, if hasattr(v, '__len__') and len(v) == n_samples else v) for k, v in fit_params.items()]) + # Same, but take both slices + scorer_params = scorer_params if scorer_params is not None else {} + train_scorer_params = dict([(k, np.asarray(v)[train] + if hasattr(v, '__len__') + and len(v) == n_samples + else v) + for k, v in scorer_params.items()]) + test_scorer_params = dict([(k, np.asarray(v)[test] + if hasattr(v, '__len__') + and len(v) == n_samples + else v) + for k, v in scorer_params.items()]) + if parameters is not None: estimator.set_params(**parameters) start_time = time.time() - X_train, y_train, sample_weight_train = _safe_split( - estimator, X, y, sample_weight, train) - X_test, y_test, sample_weight_test = _safe_split( - estimator, X, y, sample_weight, test, train) - - test_score_params = {} - train_score_params = {} - if sample_weight is not None: - fit_params = fit_params.copy() - fit_params['sample_weight'] = sample_weight_train - test_score_params['sample_weight'] = sample_weight_test - train_score_params['sample_weight'] = sample_weight_train + X_train, y_train = _safe_split(estimator, X, y, train) + X_test, y_test = _safe_split(estimator, X, y, test, train) if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) test_score = _score(estimator, X_test, y_test, scorer, - **test_score_params) + **test_scorer_params) if return_train_score: train_score = _score(estimator, X_train, y_train, scorer, - **train_score_params) + **train_scorer_params) scoring_time = time.time() - start_time @@ -1276,7 +1278,7 @@ def _fit_and_score(estimator, X, y, sample_weight, return ret -def _safe_split(estimator, X, y, sample_weight, indices, train_indices=None): +def _safe_split(estimator, X, y, indices, train_indices=None): """Create subset of dataset and properly handle kernels.""" if hasattr(estimator, 'kernel') and callable(estimator.kernel): # cannot compute the kernel values with custom function @@ -1305,12 +1307,7 @@ def _safe_split(estimator, X, y, sample_weight, indices, train_indices=None): else: y_subset = None - if sample_weight is not None: - sample_weight_subset = np.asarray(sample_weight)[indices] - else: - sample_weight_subset = None - - return X_subset, y_subset, sample_weight_subset + return X_subset, y_subset def _score(estimator, X_test, y_test, scorer, **params): diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index c3fd283b1ca34..abe3caf370a26 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -335,16 +335,17 @@ def fit(self, X, y, sample_weight=None): # Cross-validation for n, (train, test) in enumerate(cv): - X_train, y_train, sample_weight_train = _safe_split( - self.estimator, X, y, sample_weight, train) - X_test, y_test, sample_weight_test = _safe_split( - 
self.estimator, X, y, sample_weight, test, train) + X_train, y_train = _safe_split( + self.estimator, X, y, train) + X_test, y_test = _safe_split( + self.estimator, X, y, test, train) fit_params = dict() score_params = dict() if sample_weight is not None: - fit_params['sample_weight'] = sample_weight_train - score_params['sample_weight'] = sample_weight_test + sample_weight = np.asarray(sample_weight) + fit_params['sample_weight'] = sample_weight[train] + score_params['sample_weight'] = sample_weight[test] # Compute a full ranking of the features ranking_ = rfe.fit(X_train, y_train, **fit_params).ranking_ diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py index 65c32fef6109b..4d7cb28816762 100644 --- a/sklearn/grid_search.py +++ b/sklearn/grid_search.py @@ -281,7 +281,8 @@ class BaseSearchCV(six.with_metaclass(ABCMeta, BaseEstimator, @abstractmethod def __init__(self, estimator, scoring=None, fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): + refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', + scorer_params=None): self.scoring = scoring self.estimator = estimator @@ -292,8 +293,9 @@ def __init__(self, estimator, scoring=None, self.cv = cv self.verbose = verbose self.pre_dispatch = pre_dispatch + self.scorer_params = scorer_params - def score(self, X, y=None, sample_weight=None): + def score(self, X, y=None, **scorer_params): """Returns the score on the given test data and labels, if the search estimator has been refit. The ``score`` function of the best estimator is used, or the ``scoring`` parameter where unavailable. @@ -308,24 +310,18 @@ def score(self, X, y=None, sample_weight=None): Target relative to X for classification or regression; None for unsupervised learning. - sample_weight : array-like, shape = [n_samples], optional - Sample weights. 
- Returns ------- score : float """ - kwargs = {} - if sample_weight is not None: - kwargs['sample_weight'] = sample_weight if hasattr(self.best_estimator_, 'score'): - return self.best_estimator_.score(X, y, **kwargs) + return self.best_estimator_.score(X, y, **scorer_params) if self.scorer_ is None: raise ValueError("No score function explicitly defined, " "and the estimator doesn't provide one %s" % self.best_estimator_) - return self.scorer_(self.best_estimator_, X, y, **kwargs) + return self.scorer_(self.best_estimator_, X, y, **scorer_params) @property def predict(self): @@ -343,7 +339,7 @@ def decision_function(self): def transform(self): return self.best_estimator_.transform - def _fit(self, X, y, sample_weight, parameter_iterable): + def _fit(self, X, y, parameter_iterable): """Actual fitting, performing the search over parameters.""" estimator = self.estimator @@ -351,7 +347,7 @@ def _fit(self, X, y, sample_weight, parameter_iterable): self.scorer_ = check_scoring(self.estimator, scoring=self.scoring) n_samples = _num_samples(X) - X, y, sample_weight = indexable(X, y, sample_weight) + X, y = indexable(X, y) if y is not None: if len(y) != n_samples: @@ -376,10 +372,10 @@ def _fit(self, X, y, sample_weight, parameter_iterable): n_jobs=self.n_jobs, verbose=self.verbose, pre_dispatch=pre_dispatch )( - delayed(_fit_and_score)(clone(base_estimator), X, y, sample_weight, + delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_, train, test, self.verbose, parameters, self.fit_params, - return_parameters=True) + self.scorer_params, return_parameters=True) for parameters in parameter_iterable for train, test in cv) @@ -422,9 +418,6 @@ def _fit(self, X, y, sample_weight, parameter_iterable): if self.refit: fit_params = self.fit_params - if sample_weight is not None: - fit_params = fit_params.copy() - fit_params['sample_weight'] = sample_weight # fit the best estimator using the entire dataset # clone first to work around broken estimators best_estimator = clone(base_estimator).set_params( @@ -580,14 +573,15 @@ class GridSearchCV(BaseSearchCV): def __init__(self, estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, - refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs'): + refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', + scorer_params=None): super(GridSearchCV, self).__init__( estimator, scoring, fit_params, n_jobs, iid, - refit, cv, verbose, pre_dispatch) + refit, cv, verbose, pre_dispatch, scorer_params) self.param_grid = param_grid _check_param_grid(param_grid) - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None): """Run fit with all sets of parameters. Parameters @@ -600,11 +594,8 @@ def fit(self, X, y=None, sample_weight=None): y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. - - sample_weight : array-like, shape = [n_samples], optional - Sample weights. 
""" - return self._fit(X, y, sample_weight, ParameterGrid(self.param_grid)) + return self._fit(X, y, ParameterGrid(self.param_grid)) class RandomizedSearchCV(BaseSearchCV): @@ -730,7 +721,8 @@ class RandomizedSearchCV(BaseSearchCV): def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, - verbose=0, pre_dispatch='2*n_jobs', random_state=None): + verbose=0, pre_dispatch='2*n_jobs', random_state=None, + scorer_params=None): self.param_distributions = param_distributions self.n_iter = n_iter @@ -738,9 +730,9 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, super(RandomizedSearchCV, self).__init__( estimator=estimator, scoring=scoring, fit_params=fit_params, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, - pre_dispatch=pre_dispatch) + pre_dispatch=pre_dispatch, scorer_params=scorer_params) - def fit(self, X, y=None, sample_weight=None): + def fit(self, X, y=None): """Run fit on the estimator with randomly drawn parameters. Parameters @@ -752,12 +744,8 @@ def fit(self, X, y=None, sample_weight=None): y : array-like, shape = [n_samples] or [n_samples, n_output], optional Target relative to X for classification or regression; None for unsupervised learning. - - sample_weight : array-like, shape = [n_samples], optional - Sample weights. - """ sampled_params = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) - return self._fit(X, y, sample_weight, sampled_params) + return self._fit(X, y, sampled_params) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 835a4ec9ac398..3b9bff5613561 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -136,10 +136,15 @@ def learning_curve(estimator, X, y, sample_weight=None, classes, train, test, train_sizes_abs, scorer, verbose) for train, test in cv) else: + if sample_weight is not None: + params = dict(sample_weight=sample_weight) + else: + params = None out = parallel(delayed(_fit_and_score)( - clone(estimator), X, y, sample_weight, + clone(estimator), X, y, scorer, train[:n_train_samples], test, - verbose, parameters=None, fit_params=None, return_train_score=True) + verbose, parameters=None, fit_params=params, scorer_params=params, + return_train_score=True) for train, test in cv for n_train_samples in train_sizes_abs) out = np.array(out)[:, :2] n_cv_folds = out.shape[0] // n_unique_ticks @@ -217,20 +222,26 @@ def _incremental_fit_estimator(estimator, X, y, sample_weight, partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) for n_train_samples, partial_train in partitions: train_subset = train[:n_train_samples] - X_train, y_train, sample_weight_train = _safe_split( - estimator, X, y, sample_weight, train_subset) - X_partial_train, y_partial_train, sample_weight_partial_train = \ - _safe_split(estimator, X, y, sample_weight, partial_train) - X_test, y_test, sample_weight_test = _safe_split( - estimator, X, y, sample_weight, test, train_subset) + X_train, y_train = _safe_split( + estimator, X, y, train_subset) + X_partial_train, y_partial_train = \ + _safe_split(estimator, X, y, partial_train) + X_test, y_test = _safe_split( + estimator, X, y, test, train_subset) + + # TODO: replace sample_weight with fit_params and scorer_params fit_params = dict() - train_score_params = dict() - test_score_params = dict() + train_scorer_params = dict() + test_scorer_params = dict() if sample_weight is not None: + sample_weight = np.asarray(sample_weight) + 
sample_weight_train = sample_weight[train_subset] + sample_weight_partial_train = sample_weight[partial_train] + sample_weight_test = sample_weight[test] fit_params['sample_weight'] = sample_weight_partial_train - train_score_params['sample_weight'] = sample_weight_train - test_score_params['sample_weight'] = sample_weight_test + train_scorer_params['sample_weight'] = sample_weight_train + test_scorer_params['sample_weight'] = sample_weight_test if y_partial_train is None: estimator.partial_fit(X_partial_train, @@ -239,9 +250,9 @@ def _incremental_fit_estimator(estimator, X, y, sample_weight, estimator.partial_fit(X_partial_train, y_partial_train, classes=classes, **fit_params) train_scores.append(_score( - estimator, X_train, y_train, scorer, **train_score_params)) + estimator, X_train, y_train, scorer, **train_scorer_params)) test_scores.append(_score( - estimator, X_test, y_test, scorer, **test_score_params)) + estimator, X_test, y_test, scorer, **test_scorer_params)) return np.array((train_scores, test_scores)).T @@ -320,9 +331,14 @@ def validation_curve(estimator, X, y, param_name, param_range, parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) + if sample_weight is not None: + params = dict(sample_weight=sample_weight) + else: + params = None out = parallel(delayed(_fit_and_score)( - estimator, X, y, sample_weight, scorer, train, test, verbose, - parameters={param_name: v}, fit_params=None, return_train_score=True) + estimator, X, y, scorer, train, test, verbose, + parameters={param_name: v}, fit_params=params, scorer_params=params, + return_train_score=True) for train, test in cv for v in param_range) out = np.asarray(out)[:, :2] diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index bdb791ad080f7..f0ccca66da02c 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -468,7 +468,9 @@ def test_cross_val_score(): for a in range(-10, 10): clf.a = a # Smoke test - scores = cval.cross_val_score(clf, X, y, sample_weight=int_weights) + params = dict(sample_weight=int_weights) + scores = cval.cross_val_score(clf, X, y, + fit_params=params, scorer_params=params) assert_array_equal(scores, clf.score(X, y, sample_weight=int_weights)) # test with multioutput y @@ -483,8 +485,9 @@ def test_cross_val_score(): assert_array_equal(scores, clf.score(X_sparse, X)) # test with sample_weight as list + params = dict(sample_weight=int_weights.tolist()) scores = cval.cross_val_score( - clf, X, y, sample_weight=int_weights.tolist()) + clf, X, y, fit_params=params, scorer_params=params) # test with X and y as list list_check = lambda x: isinstance(x, list) @@ -920,12 +923,12 @@ def test_safe_split_with_precomputed_kernel(): cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0) tr, te = list(cv)[0] - X_tr, y_tr, _ = cval._safe_split(clf, X, y, None, tr) - K_tr, y_tr2, _ = cval._safe_split(clfp, K, y, None, tr) + X_tr, y_tr = cval._safe_split(clf, X, y, tr) + K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr) assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T)) - X_te, y_te, _ = cval._safe_split(clf, X, y, None, te, tr) - K_te, y_te2, _ = cval._safe_split(clfp, K, y, None, te, tr) + X_te, y_te = cval._safe_split(clf, X, y, te, tr) + K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr) assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T)) diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index b1573abbb4fb7..81245a6ccf22f 100644 --- 
a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -656,8 +656,12 @@ def test_grid_search_with_sample_weights(): est_parameters = {"foo_param": [1, 2, 3]} cv = KFold(y.shape[0], n_folds=2, random_state=0) for search_cls in (GridSearchCV, RandomizedSearchCV): - grid_search = search_cls(MockClassifier(), est_parameters, cv=cv) - grid_search.fit(X, y, sample_weight=sample_weight) + params = dict(sample_weight=sample_weight) + grid_search = search_cls(MockClassifier(), est_parameters, cv=cv, + fit_params=params, scorer_params=params) + grid_search.fit(X, y) # check that sample_weight can be a list - grid_search = GridSearchCV(MockClassifier(), est_parameters, cv=cv) - grid_search.fit(X, y, sample_weight=sample_weight.tolist()) + params = dict(sample_weight=sample_weight.tolist()) + grid_search = GridSearchCV(MockClassifier(), est_parameters, cv=cv, + fit_params=params, scorer_params=params) + grid_search.fit(X, y) diff --git a/sklearn/tests/test_learning_curve.py b/sklearn/tests/test_learning_curve.py index 62a05dd19799e..c039567669274 100644 --- a/sklearn/tests/test_learning_curve.py +++ b/sklearn/tests/test_learning_curve.py @@ -25,7 +25,7 @@ def __init__(self, n_max_train_sizes): self.train_sizes = 0 self.X_subset = None - def fit(self, X_subset, y_subset=None): + def fit(self, X_subset, y_subset=None, **params): self.X_subset = X_subset self.train_sizes = X_subset.shape[0] return self @@ -65,7 +65,7 @@ def __init__(self, param=0.5): self.X_subset = None self.param = param - def fit(self, X_subset, y_subset): + def fit(self, X_subset, y_subset, **params): self.X_subset = X_subset self.train_sizes = X_subset.shape[0] return self
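
Usage sketch for the final API. This is a hypothetical example, not part of the patch series: it assumes a checkout with all five patches applied, and an estimator whose ``fit`` and ``score`` both accept ``sample_weight`` (here ``sklearn.svm.SVC`` with the default passthrough scorer; the data and weights are made up). The names ``fit_params`` and ``scorer_params`` come from PATCH 5/5.

    import numpy as np
    from sklearn.svm import SVC
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import GridSearchCV

    rng = np.random.RandomState(0)
    X = rng.randn(100, 5)
    y = (X[:, 0] > 0).astype(int)
    weights = rng.rand(100)

    # _fit_and_score slices any value in these dicts whose length equals
    # n_samples, so on each fold the train slice reaches fit() and the
    # matching train/test slices reach the scorer.
    params = dict(sample_weight=weights)
    scores = cross_val_score(SVC(kernel='linear'), X, y,
                             fit_params=params, scorer_params=params)

    # The search estimators take both dicts at construction time, so
    # fit() keeps its plain (X, y) signature after PATCH 5/5.
    search = GridSearchCV(SVC(kernel='linear'), {'C': [0.1, 1.0, 10.0]},
                          fit_params=params, scorer_params=params)
    search.fit(X, y)

Note the design choice visible in PATCH 5/5: with ``refit=True`` the unsliced ``fit_params`` are passed straight to ``best_estimator.fit(X, y, **fit_params)``, so the full-length weight vector is used when retraining on the whole dataset.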