From edbe7b7c0328111623189a7747c447a7ab221b56 Mon Sep 17 00:00:00 2001
From: Noel Dawe
Date: Sat, 24 Aug 2013 01:54:33 -0700
Subject: [PATCH 1/5] scorer: add sample_weight support

---
 sklearn/metrics/scorer.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/sklearn/metrics/scorer.py b/sklearn/metrics/scorer.py
index 1d26671b67851..6e641f7efbd7d 100644
--- a/sklearn/metrics/scorer.py
+++ b/sklearn/metrics/scorer.py
@@ -39,7 +39,7 @@ def __init__(self, score_func, sign, kwargs):
         self._sign = sign
 
     @abstractmethod
-    def __call__(self, estimator, X, y):
+    def __call__(self, estimator, X, y, sample_weight=None):
         pass
 
     def __repr__(self):
@@ -56,7 +56,7 @@ def _factory_args(self):
 
 
 class _PredictScorer(_BaseScorer):
-    def __call__(self, estimator, X, y_true):
+    def __call__(self, estimator, X, y_true, sample_weight=None):
         """Evaluate predicted target values for X relative to y_true.
 
         Parameters
@@ -71,17 +71,24 @@ def __call__(self, estimator, X, y_true):
         y_true : array-like
             Gold standard target values for X.
 
+        sample_weight : array-like, optional (default=None)
+            Sample weights.
+
         Returns
         -------
         score : float
             Score function applied to prediction of estimator on X.
         """
         y_pred = estimator.predict(X)
+        if sample_weight is not None:
+            return self._sign * self._score_func(y_true, y_pred,
+                                                 sample_weight=sample_weight,
+                                                 **self._kwargs)
         return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
 
 
 class _ProbaScorer(_BaseScorer):
-    def __call__(self, clf, X, y):
+    def __call__(self, clf, X, y, sample_weight=None):
         """Evaluate predicted probabilities for X relative to y_true.
 
         Parameters
@@ -97,12 +104,19 @@ def __call__(self, clf, X, y):
             Gold standard target values for X. These must be class labels,
             not probabilities.
 
+        sample_weight : array-like, optional (default=None)
+            Sample weights.
+
         Returns
         -------
         score : float
             Score function applied to prediction of estimator on X.
         """
         y_pred = clf.predict_proba(X)
+        if sample_weight is not None:
+            return self._sign * self._score_func(y, y_pred,
+                                                 sample_weight=sample_weight,
+                                                 **self._kwargs)
         return self._sign * self._score_func(y, y_pred, **self._kwargs)
 
     def _factory_args(self):
@@ -110,7 +124,7 @@ def _factory_args(self):
 
 
 class _ThresholdScorer(_BaseScorer):
-    def __call__(self, clf, X, y):
+    def __call__(self, clf, X, y, sample_weight=None):
         """Evaluate decision function output for X relative to y_true.
 
         Parameters
@@ -128,6 +142,9 @@ def __call__(self, clf, X, y):
             Gold standard target values for X. These must be class labels,
             not decision function values.
 
+        sample_weight : array-like, optional (default=None)
+            Sample weights.
+
         Returns
         -------
         score : float
@@ -152,6 +169,10 @@ def __call__(self, clf, X, y):
         elif isinstance(y_pred, list):
             y_pred = np.vstack([p[:, -1] for p in y_pred]).T
 
+        if sample_weight is not None:
+            return self._sign * self._score_func(y, y_pred,
+                                                 sample_weight=sample_weight,
+                                                 **self._kwargs)
         return self._sign * self._score_func(y, y_pred, **self._kwargs)
 
     def _factory_args(self):
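
The scorer change above can be exercised directly. A minimal sketch, not part of the patch: the toy data and variable names are illustrative, and it assumes the wrapped metric (here accuracy_score) already accepts sample_weight:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics.scorer import get_scorer

    X = np.array([[0., 0.], [1., 1.], [0., 1.], [1., 0.],
                  [2., 2.], [-1., -1.]])
    y = np.array([0, 1, 0, 1, 1, 0])
    w = np.array([1., 1., 2., 2., 0.5, 0.5])  # illustrative weights

    clf = LogisticRegression().fit(X, y)
    scorer = get_scorer("accuracy")
    # Unweighted call: identical behaviour to before this patch.
    print(scorer(clf, X, y))
    # Weighted call: sample_weight is forwarded to accuracy_score.
    print(scorer(clf, X, y, sample_weight=w))
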
From 3da7fb708e67dd27d7ef26b40d29447b7dc565d7 Mon Sep 17 00:00:00 2001
From: Noel Dawe
Date: Sun, 6 Apr 2014 15:08:37 -0700
Subject: [PATCH 2/5] grid_search: add sample_weight support

---
 sklearn/cross_validation.py            | 31 ++++++++++----
 sklearn/grid_search.py                 | 56 ++++++++++++++++++--------
 sklearn/tests/test_cross_validation.py |  8 ++--
 sklearn/tests/test_grid_search.py      | 27 ++++++++++++-
 4 files changed, 93 insertions(+), 29 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index 92fc1c400d063..fe03fe5e4c75b 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -1146,7 +1146,8 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
     return np.array(scores)[:, 0]
 
 
-def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
+def _fit_and_score(estimator, X, y, sample_weight,
+                   scorer, train, test, verbose, parameters,
                    fit_params, return_train_score=False,
                    return_parameters=False):
     """Fit estimator and compute scores for a given dataset split.
@@ -1159,10 +1160,13 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
     X : array-like of shape at least 2D
         The data to fit.
 
-    y : array-like, optional, default: None
+    y : array-like or None
         The target variable to try to predict in the case of
         supervised learning.
 
+    sample_weight : array-like or None
+        Sample weights.
+
     scoring : callable
         A scorer callable object / function with signature
         ``scorer(estimator, X, y)``.
@@ -1227,13 +1231,26 @@ def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
 
     X_train, y_train = _safe_split(estimator, X, y, train)
     X_test, y_test = _safe_split(estimator, X, y, test, train)
+
+    test_score_params = dict()
+    train_score_params = dict()
+    if sample_weight is not None:
+        # move to _safe_split?
+        sample_weight_train = sample_weight[safe_mask(sample_weight, train)]
+        sample_weight_test = sample_weight[safe_mask(sample_weight, test)]
+        fit_params['sample_weight'] = sample_weight_train
+        test_score_params['sample_weight'] = sample_weight_test
+        train_score_params['sample_weight'] = sample_weight_train
+
     if y_train is None:
         estimator.fit(X_train, **fit_params)
     else:
         estimator.fit(X_train, y_train, **fit_params)
-    test_score = _score(estimator, X_test, y_test, scorer)
+    test_score = _score(estimator, X_test, y_test, scorer,
+                        **test_score_params)
     if return_train_score:
-        train_score = _score(estimator, X_train, y_train, scorer)
+        train_score = _score(estimator, X_train, y_train, scorer,
+                             **train_score_params)
 
     scoring_time = time.time() - start_time
@@ -1282,12 +1299,12 @@ def _safe_split(estimator, X, y, indices, train_indices=None):
     return X_subset, y_subset
 
 
-def _score(estimator, X_test, y_test, scorer):
+def _score(estimator, X_test, y_test, scorer, **params):
     """Compute the score of an estimator on a given test set."""
     if y_test is None:
-        score = scorer(estimator, X_test)
+        score = scorer(estimator, X_test, **params)
     else:
-        score = scorer(estimator, X_test, y_test)
+        score = scorer(estimator, X_test, y_test, **params)
     if not isinstance(score, numbers.Number):
         raise ValueError("scoring must return a number, got %s (%s) instead."
                          % (str(score), type(score)))
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
index e70209fc04ed6..aec1e063fd31a 100644
--- a/sklearn/grid_search.py
+++ b/sklearn/grid_search.py
@@ -8,6 +8,7 @@
 #         Gael Varoquaux
 #         Andreas Mueller
 #         Olivier Grisel
+#         Noel Dawe
 # License: BSD 3 clause
 
 from abc import ABCMeta, abstractmethod
@@ -228,7 +229,8 @@ def fit_grid_point(X, y, estimator, parameters, train, test, scorer,
     n_samples_test : int
         Number of test samples in this split.
     """
-    score, n_samples_test, _ = _fit_and_score(estimator, X, y, scorer, train,
+    score, n_samples_test, _ = _fit_and_score(estimator, X, y, None,
+                                              scorer, train,
                                               test, verbose, parameters,
                                               fit_params)
     return score, parameters, n_samples_test
@@ -295,7 +297,7 @@ def __init__(self, estimator, scoring=None, loss_func=None,
         self.verbose = verbose
         self.pre_dispatch = pre_dispatch
 
-    def score(self, X, y=None):
+    def score(self, X, y=None, sample_weight=None):
         """Returns the score on the given test data and labels, if the search
         estimator has been refit. The ``score`` function of the best estimator
         is used, or the ``scoring`` parameter where unavailable.
@@ -310,18 +312,24 @@ def score(self, X, y=None):
             Target relative to X for classification or
             regression; None for unsupervised learning.
 
+        sample_weight : array-like, shape = [n_samples], optional
+            Sample weights.
+
         Returns
         -------
         score : float
 
         """
+        kwargs = {}
+        if sample_weight is not None:
+            kwargs['sample_weight'] = sample_weight
         if hasattr(self.best_estimator_, 'score'):
-            return self.best_estimator_.score(X, y)
+            return self.best_estimator_.score(X, y, **kwargs)
         if self.scorer_ is None:
             raise ValueError("No score function explicitly defined, "
                              "and the estimator doesn't provide one %s"
                              % self.best_estimator_)
-        return self.scorer_(self.best_estimator_, X, y)
+        return self.scorer_(self.best_estimator_, X, y, **kwargs)
 
     @property
     def predict(self):
@@ -339,7 +347,7 @@ def decision_function(self):
     def transform(self):
         return self.best_estimator_.transform
 
-    def _fit(self, X, y, parameter_iterable):
+    def _fit(self, X, y, sample_weight, parameter_iterable):
         """Actual fitting, performing the search over parameters."""
 
         estimator = self.estimator
@@ -349,8 +357,10 @@ def _fit(self, X, y, parameter_iterable):
                                       score_func=self.score_func)
 
         n_samples = _num_samples(X)
-        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr',
-                            allow_nans=True)
+        X, y, sample_weight = check_arrays(X, y, sample_weight,
+                                           allow_lists=True,
+                                           sparse_format='csr',
+                                           allow_nans=True)
 
         if y is not None:
             if len(y) != n_samples:
@@ -358,6 +368,10 @@ def _fit(self, X, y, parameter_iterable):
                                  'of samples (%i) than data (X: %i samples)'
                                  % (len(y), n_samples))
             y = np.asarray(y)
+
+        if sample_weight is not None:
+            sample_weight = np.asarray(sample_weight)
+
         cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
 
         if self.verbose > 0:
@@ -375,9 +389,10 @@ def _fit(self, X, y, parameter_iterable):
             n_jobs=self.n_jobs, verbose=self.verbose,
             pre_dispatch=pre_dispatch
         )(
-            delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
-                                    train, test, self.verbose, parameters,
-                                    self.fit_params, return_parameters=True)
+            delayed(_fit_and_score)(clone(base_estimator), X, y, sample_weight,
+                                    self.scorer_, train, test,
+                                    self.verbose, parameters, self.fit_params,
+                                    return_parameters=True)
                 for parameters in parameter_iterable
                 for train, test in cv)
@@ -419,14 +434,18 @@ def _fit(self, X, y, parameter_iterable):
         self.best_score_ = best.mean_validation_score
 
         if self.refit:
+            fit_params = self.fit_params
+            if sample_weight is not None:
+                fit_params = fit_params.copy()
+                fit_params['sample_weight'] = sample_weight
             # fit the best estimator using the entire dataset
             # clone first to work around broken estimators
             best_estimator = clone(base_estimator).set_params(
                 **best.parameters)
             if y is not None:
-                best_estimator.fit(X, y, **self.fit_params)
+                best_estimator.fit(X, y, **fit_params)
             else:
-                best_estimator.fit(X, **self.fit_params)
+                best_estimator.fit(X, **fit_params)
             self.best_estimator_ = best_estimator
         return self
@@ -581,7 +600,7 @@ def __init__(self, estimator, param_grid, scoring=None, loss_func=None,
         self.param_grid = param_grid
         _check_param_grid(param_grid)
 
-    def fit(self, X, y=None):
+    def fit(self, X, y=None, sample_weight=None):
        """Run fit with all sets of parameters.
 
         Parameters
@@ -595,8 +614,10 @@ def fit(self, X, y=None):
             Target relative to X for classification or
             regression; None for unsupervised learning.
 
+        sample_weight : array-like, shape = [n_samples], optional
+            Sample weights.
""" - return self._fit(X, y, ParameterGrid(self.param_grid)) + return self._fit(X, y, sample_weight, ParameterGrid(self.param_grid)) class RandomizedSearchCV(BaseSearchCV): @@ -732,7 +753,7 @@ def __init__(self, estimator, param_distributions, n_iter=10, scoring=None, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose, pre_dispatch=pre_dispatch) - def fit(self, X, y=None): + def fit(self, X, y=None, sample_weight=None): """Run fit on the estimator with randomly drawn parameters. Parameters @@ -745,8 +766,11 @@ def fit(self, X, y=None): Target relative to X for classification or regression; None for unsupervised learning. + sample_weight : array-like, shape = [n_samples], optional + Sample weights. + """ sampled_params = ParameterSampler(self.param_distributions, self.n_iter, random_state=self.random_state) - return self._fit(X, y, sampled_params) + return self._fit(X, y, sample_weight, sampled_params) diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py index f0088739476d9..e8d732e8023b7 100644 --- a/sklearn/tests/test_cross_validation.py +++ b/sklearn/tests/test_cross_validation.py @@ -866,12 +866,12 @@ def test_safe_split_with_precomputed_kernel(): cv = cval.ShuffleSplit(X.shape[0], test_size=0.25, random_state=0) tr, te = list(cv)[0] - X_tr, y_tr = cval._safe_split(clf, X, y, tr) - K_tr, y_tr2 = cval._safe_split(clfp, K, y, tr) + X_tr, y_tr, _ = cval._safe_split(clf, X, y, None, tr) + K_tr, y_tr2, _ = cval._safe_split(clfp, K, y, None, tr) assert_array_almost_equal(K_tr, np.dot(X_tr, X_tr.T)) - X_te, y_te = cval._safe_split(clf, X, y, te, tr) - K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr) + X_te, y_te, _ = cval._safe_split(clf, X, y, None, te, tr) + K_te, y_te2, _ = cval._safe_split(clfp, K, y, None, te, tr) assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T)) diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py index 61f2389779e7f..8f96d5a9dce38 100644 --- a/sklearn/tests/test_grid_search.py +++ b/sklearn/tests/test_grid_search.py @@ -50,8 +50,13 @@ class MockClassifier(object): def __init__(self, foo_param=0): self.foo_param = foo_param - def fit(self, X, Y): + def fit(self, X, Y, sample_weight=None): assert_true(len(X) == len(Y)) + if sample_weight is not None: + assert_true(len(sample_weight) == len(X), + 'MockClassifier sample_weight.shape[0]' + ' is {0}, should be {1}'.format(len(sample_weight), + len(X))) return self def predict(self, T): @@ -61,7 +66,12 @@ def predict(self, T): decision_function = predict transform = predict - def score(self, X=None, Y=None): + def score(self, X=None, Y=None, sample_weight=None): + if X is not None and sample_weight is not None: + assert_true(len(sample_weight) == len(X), + 'MockClassifier sample_weight.shape[0]' + ' is {0}, should be {1}'.format(len(sample_weight), + len(X))) if self.foo_param > 1: score = 1. 
         else:
@@ -115,6 +125,7 @@ def score(self):
 
 X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
 y = np.array([1, 1, 2, 2])
+sample_weight = np.array([1, 2, 3, 4])
 
 
 def test_parameter_grid():
@@ -668,3 +679,15 @@ def test_grid_search_allows_nans():
         ('classifier', MockClassifier()),
     ])
     gs = GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
+
+
+def test_grid_search_with_sample_weights():
+    """Test grid searching with sample weights"""
+    est_parameters = {"foo_param": [1, 2, 3]}
+    cv = KFold(y.shape[0], n_folds=2, random_state=0)
+    for search_cls in (GridSearchCV, RandomizedSearchCV):
+        grid_search = search_cls(MockClassifier(), est_parameters, cv=cv)
+        grid_search.fit(X, y, sample_weight=sample_weight)
+    # check that sample_weight can be a list
+    grid_search = GridSearchCV(MockClassifier(), est_parameters, cv=cv)
+    grid_search.fit(X, y, sample_weight=sample_weight.tolist())
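
With the search entry points above, weights ride along with the data. A usage sketch under the same assumptions (toy data and names are illustrative; scoring="accuracy" routes the test-fold weights through the weighted scorer from the first patch):

    import numpy as np
    from sklearn.grid_search import GridSearchCV
    from sklearn.svm import SVC

    rng = np.random.RandomState(0)
    X = rng.randn(40, 3)
    y = rng.randint(2, size=40)
    w = rng.rand(40) + 0.5

    # Per fold, the training slice of w goes to SVC.fit() and the test
    # slice goes to the scorer; on refit the full w is used.
    search = GridSearchCV(SVC(), {"C": [0.1, 1.0, 10.0]},
                          scoring="accuracy", cv=3)
    search.fit(X, y, sample_weight=w)
    print(search.best_params_, search.best_score_)
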
From 10df65ef4e115981cd3ea0d2473b0c3e54bb373e Mon Sep 17 00:00:00 2001
From: Noel Dawe
Date: Mon, 21 Apr 2014 18:53:08 -0700
Subject: [PATCH 3/5] cross_validation: add sample_weight support

---
 sklearn/cross_validation.py            | 41 +++++++++++++++++---------
 sklearn/tests/test_cross_validation.py | 16 ++++++----
 2 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
index fe03fe5e4c75b..cf7189cbbfb2c 100644
--- a/sklearn/cross_validation.py
+++ b/sklearn/cross_validation.py
@@ -1068,8 +1068,8 @@ def __len__(self):
 
 
 ##############################################################################
-
-def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
+def cross_val_score(estimator, X, y=None, sample_weight=None,
+                    scoring=None, cv=None, n_jobs=1,
                     verbose=0, fit_params=None, score_func=None,
                     pre_dispatch='2*n_jobs'):
     """Evaluate a score by cross-validation
@@ -1086,6 +1086,9 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
         The target variable to try to predict in the case of
         supervised learning.
 
+    sample_weight : array-like, optional, default: None
+        Sample weights.
+
     scoring : string, callable or None, optional, default: None
         A string (see model evaluation documentation) or
         a scorer callable object / function with signature
@@ -1128,10 +1131,14 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
     scores : array of float, shape=(len(list(cv)),)
         Array of scores of the estimator for each run of the cross validation.
     """
-    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True,
-                        allow_nans=True)
+    X, y, sample_weight = check_arrays(X, y, sample_weight,
+                                       sparse_format='csr',
+                                       allow_lists=True,
+                                       allow_nans=True)
     if y is not None:
         y = np.asarray(y)
+    if sample_weight is not None:
+        sample_weight = np.asarray(sample_weight)
     cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
     scorer = check_scoring(estimator, score_func=score_func, scoring=scoring)
     # We clone the estimator to make sure that all the folds are
     # independent, and that it is pickle-able.
     parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                         pre_dispatch=pre_dispatch)
-    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y, scorer,
+    scores = parallel(delayed(_fit_and_score)(clone(estimator), X, y,
+                                              sample_weight, scorer,
                                               train, test, verbose, None,
                                               fit_params)
                       for train, test in cv)
@@ -1229,15 +1237,15 @@ def _fit_and_score(estimator, X, y, sample_weight,
 
     start_time = time.time()
 
-    X_train, y_train = _safe_split(estimator, X, y, train)
-    X_test, y_test = _safe_split(estimator, X, y, test, train)
+    X_train, y_train, sample_weight_train = _safe_split(
+        estimator, X, y, sample_weight, train)
+    X_test, y_test, sample_weight_test = _safe_split(
+        estimator, X, y, sample_weight, test, train)
 
-    test_score_params = dict()
-    train_score_params = dict()
+    test_score_params = {}
+    train_score_params = {}
     if sample_weight is not None:
-        # move to _safe_split?
-        sample_weight_train = sample_weight[safe_mask(sample_weight, train)]
-        sample_weight_test = sample_weight[safe_mask(sample_weight, test)]
+        fit_params = fit_params.copy()
         fit_params['sample_weight'] = sample_weight_train
         test_score_params['sample_weight'] = sample_weight_test
         train_score_params['sample_weight'] = sample_weight_train
@@ -1267,7 +1275,7 @@ def _fit_and_score(estimator, X, y, sample_weight,
     return ret
 
 
-def _safe_split(estimator, X, y, indices, train_indices=None):
+def _safe_split(estimator, X, y, sample_weight, indices, train_indices=None):
     """Create subset of dataset and properly handle kernels."""
     if hasattr(estimator, 'kernel') and callable(estimator.kernel):
         # cannot compute the kernel values with custom function
@@ -1296,7 +1304,12 @@ def _safe_split(estimator, X, y, indices, train_indices=None):
     else:
         y_subset = None
 
-    return X_subset, y_subset
+    if sample_weight is not None:
+        sample_weight_subset = np.asarray(sample_weight)[indices]
+    else:
+        sample_weight_subset = None
+
+    return X_subset, y_subset, sample_weight_subset
 
 
 def _score(estimator, X_test, y_test, scorer, **params):
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
index e8d732e8023b7..33b24c9a2d96e 100644
--- a/sklearn/tests/test_cross_validation.py
+++ b/sklearn/tests/test_cross_validation.py
@@ -71,9 +71,9 @@ def __init__(self, a=0):
 
     def fit(self, X, Y=None, sample_weight=None, class_prior=None):
         if sample_weight is not None:
-            assert_true(sample_weight.shape[0] == X.shape[0],
+            assert_true(len(sample_weight) == X.shape[0],
                         'MockClassifier extra fit_param sample_weight.shape[0]'
-                        ' is {0}, should be {1}'.format(sample_weight.shape[0],
+                        ' is {0}, should be {1}'.format(len(sample_weight),
                                                         X.shape[0]))
         if class_prior is not None:
             assert_true(class_prior.shape[0] == len(np.unique(y)),
@@ -85,13 +85,15 @@ def fit(self, X, Y=None, sample_weight=None, class_prior=None):
     def predict(self, T):
         return T.shape[0]
 
-    def score(self, X=None, Y=None):
+    def score(self, X=None, Y=None, sample_weight=None):
         return 1. / (1 + np.abs(self.a))
 
 
 X = np.ones((10, 2))
 X_sparse = coo_matrix(X)
 y = np.arange(10) // 2
+rng = np.random.RandomState(0)
+int_weights = rng.randint(10, size=y.shape)
 
 ##############################################################################
 # Tests
@@ -488,8 +490,8 @@ def test_cross_val_score():
     for a in range(-10, 10):
         clf.a = a
         # Smoke test
-        scores = cval.cross_val_score(clf, X, y)
-        assert_array_equal(scores, clf.score(X, y))
+        scores = cval.cross_val_score(clf, X, y, sample_weight=int_weights)
+        assert_array_equal(scores, clf.score(X, y, sample_weight=int_weights))
 
         # test with multioutput y
         scores = cval.cross_val_score(clf, X_sparse, X)
@@ -502,6 +504,10 @@ def test_cross_val_score():
         scores = cval.cross_val_score(clf, X_sparse, X)
         assert_array_equal(scores, clf.score(X_sparse, X))
 
+        # test with sample_weight as list
+        scores = cval.cross_val_score(
+            clf, X, y, sample_weight=int_weights.tolist())
+
         # test with X and y as list
         clf = MockListClassifier()
         scores = cval.cross_val_score(clf, X.tolist(), y.tolist())
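
cross_val_score now accepts the same keyword. A sketch with the same toy setup (illustrative only; _safe_split returns (X, y, sample_weight) triples after this patch, so each fold carries its own slice of the weights):

    import numpy as np
    from sklearn import cross_validation as cval
    from sklearn.svm import SVC

    rng = np.random.RandomState(0)
    X = rng.randn(40, 3)
    y = rng.randint(2, size=40)
    w = rng.rand(40) + 0.5

    # Train-fold weights go to SVC.fit(); test-fold weights go to the
    # "accuracy" scorer patched earlier in this series.
    scores = cval.cross_val_score(SVC(), X, y, sample_weight=w,
                                  scoring="accuracy", cv=4)
    print(scores.mean())
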
""" X, y = check_arrays(X, y, sparse_format="csr") # Initialization @@ -332,17 +335,26 @@ def fit(self, X, y): # Cross-validation for n, (train, test) in enumerate(cv): - X_train, y_train = _safe_split(self.estimator, X, y, train) - X_test, y_test = _safe_split(self.estimator, X, y, test, train) + X_train, y_train, sample_weight_train = _safe_split( + self.estimator, X, y, sample_weight, train) + X_test, y_test, sample_weight_test = _safe_split( + self.estimator, X, y, sample_weight, test, train) + + fit_params = dict() + score_params = dict() + if sample_weight is not None: + fit_params['sample_weight'] = sample_weight_train + score_params['sample_weight'] = sample_weight_test # Compute a full ranking of the features - ranking_ = rfe.fit(X_train, y_train).ranking_ + ranking_ = rfe.fit(X_train, y_train, **fit_params).ranking_ # Score each subset of features for k in range(0, max(ranking_)): mask = np.where(ranking_ <= k + 1)[0] estimator = clone(self.estimator) - estimator.fit(X_train[:, mask], y_train) - score = _score(estimator, X_test[:, mask], y_test, scorer) + estimator.fit(X_train[:, mask], y_train, **fit_params) + score = _score( + estimator, X_test[:, mask], y_test, scorer, **score_params) if self.verbose > 0: print("Finished fold with %d / %d feature ranks, score=%f" @@ -358,7 +370,10 @@ def fit(self, X, y): n_features_to_select=k+1, step=self.step, estimator_params=self.estimator_params) - rfe.fit(X, y) + if sample_weight is not None: + rfe.fit(X, y, sample_weight=sample_weight) + else: + rfe.fit(X, y) # Set final attributes self.support_ = rfe.support_ From bf4cd4cc15a7598590bd27ea5efd1ae76336ea63 Mon Sep 17 00:00:00 2001 From: Noel Dawe Date: Mon, 21 Apr 2014 18:32:01 -0700 Subject: [PATCH 5/5] learning_curve: sample_weight support --- sklearn/learning_curve.py | 55 ++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py index 51d1565bc811c..0ad568d42d29b 100644 --- a/sklearn/learning_curve.py +++ b/sklearn/learning_curve.py @@ -14,7 +14,8 @@ from .metrics.scorer import check_scoring -def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), +def learning_curve(estimator, X, y, sample_weight=None, + train_sizes=np.linspace(0.1, 1.0, 10), cv=None, scoring=None, exploit_incremental_learning=False, n_jobs=1, pre_dispatch="all", verbose=0): """Learning curve. @@ -41,6 +42,9 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5), Target relative to X for classification or regression; None for unsupervised learning. + sample_weight : array-like, shape (n_samples), optional + Sample weights. + train_sizes : array-like, shape (n_ticks,), dtype float or int Relative or absolute numbers of training examples that will be used to generate the learning curve. 
From bf4cd4cc15a7598590bd27ea5efd1ae76336ea63 Mon Sep 17 00:00:00 2001
From: Noel Dawe
Date: Mon, 21 Apr 2014 18:32:01 -0700
Subject: [PATCH 5/5] learning_curve: sample_weight support

---
 sklearn/learning_curve.py | 55 ++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py
index 51d1565bc811c..0ad568d42d29b 100644
--- a/sklearn/learning_curve.py
+++ b/sklearn/learning_curve.py
@@ -14,7 +14,8 @@
 from .metrics.scorer import check_scoring
 
 
-def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5),
+def learning_curve(estimator, X, y, sample_weight=None,
+                   train_sizes=np.linspace(0.1, 1.0, 5),
                    cv=None, scoring=None, exploit_incremental_learning=False,
                    n_jobs=1, pre_dispatch="all", verbose=0):
     """Learning curve.
@@ -41,6 +42,9 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5),
         Target relative to X for classification or regression;
         None for unsupervised learning.
 
+    sample_weight : array-like, shape (n_samples,), optional
+        Sample weights.
+
     train_sizes : array-like, shape (n_ticks,), dtype float or int
         Relative or absolute numbers of training examples that will be used to
         generate the learning curve. If the dtype is float, it is regarded as a
@@ -124,11 +128,13 @@ def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5),
     if exploit_incremental_learning:
         classes = np.unique(y) if is_classifier(estimator) else None
         out = parallel(delayed(_incremental_fit_estimator)(
-            clone(estimator), X, y, classes, train, test, train_sizes_abs,
+            clone(estimator), X, y, sample_weight,
+            classes, train, test, train_sizes_abs,
             scorer, verbose) for train, test in cv)
     else:
         out = parallel(delayed(_fit_and_score)(
-            clone(estimator), X, y, scorer, train[:n_train_samples], test,
+            clone(estimator), X, y, sample_weight,
+            scorer, train[:n_train_samples], test,
             verbose, parameters=None, fit_params=None, return_train_score=True)
             for train, test in cv for n_train_samples in train_sizes_abs)
     out = np.array(out)[:, :2]
@@ -199,29 +205,45 @@ def _translate_train_sizes(train_sizes, n_max_training_samples):
     return train_sizes_abs
 
 
-def _incremental_fit_estimator(estimator, X, y, classes, train, test,
+def _incremental_fit_estimator(estimator, X, y, sample_weight,
+                               classes, train, test,
                                train_sizes, scorer, verbose):
     """Train estimator on training subsets incrementally and compute scores."""
     train_scores, test_scores = [], []
     partitions = zip(train_sizes, np.split(train, train_sizes)[:-1])
     for n_train_samples, partial_train in partitions:
         train_subset = train[:n_train_samples]
-        X_train, y_train = _safe_split(estimator, X, y, train_subset)
-        X_partial_train, y_partial_train = _safe_split(estimator, X, y,
-                                                       partial_train)
-        X_test, y_test = _safe_split(estimator, X, y, test, train_subset)
+        X_train, y_train, sample_weight_train = _safe_split(
+            estimator, X, y, sample_weight, train_subset)
+        X_partial_train, y_partial_train, sample_weight_partial_train = \
+            _safe_split(estimator, X, y, sample_weight, partial_train)
+        X_test, y_test, sample_weight_test = _safe_split(
+            estimator, X, y, sample_weight, test, train_subset)
+
+        fit_params = dict()
+        train_score_params = dict()
+        test_score_params = dict()
+        if sample_weight is not None:
+            fit_params['sample_weight'] = sample_weight_partial_train
+            train_score_params['sample_weight'] = sample_weight_train
+            test_score_params['sample_weight'] = sample_weight_test
+
         if y_partial_train is None:
-            estimator.partial_fit(X_partial_train, classes=classes)
+            estimator.partial_fit(X_partial_train,
+                                  classes=classes, **fit_params)
         else:
             estimator.partial_fit(X_partial_train, y_partial_train,
-                                  classes=classes)
-        train_scores.append(_score(estimator, X_train, y_train, scorer))
-        test_scores.append(_score(estimator, X_test, y_test, scorer))
+                                  classes=classes, **fit_params)
+        train_scores.append(_score(
+            estimator, X_train, y_train, scorer, **train_score_params))
+        test_scores.append(_score(
+            estimator, X_test, y_test, scorer, **test_score_params))
 
     return np.array((train_scores, test_scores)).T
 
 
-def validation_curve(estimator, X, y, param_name, param_range, cv=None,
-                     scoring=None, n_jobs=1, pre_dispatch="all", verbose=0):
+def validation_curve(estimator, X, y, param_name, param_range,
+                     sample_weight=None, cv=None, scoring=None,
+                     n_jobs=1, pre_dispatch="all", verbose=0):
     """Validation curve.
 
     Determine training and test scores for varying parameter values.
@@ -250,6 +272,9 @@ def validation_curve(estimator, X, y, param_name, param_range, cv=None,
     param_range : array-like, shape (n_values,)
         The values of the parameter that will be evaluated.
 
+    sample_weight : array-like, shape (n_samples,), optional
+        Sample weights.
+ cv : integer, cross-validation generator, optional If an integer is passed, it is the number of folds (defaults to 3). Specific cross-validation objects can be passed, see @@ -291,7 +316,7 @@ def validation_curve(estimator, X, y, param_name, param_range, cv=None, parallel = Parallel(n_jobs=n_jobs, pre_dispatch=pre_dispatch, verbose=verbose) out = parallel(delayed(_fit_and_score)( - estimator, X, y, scorer, train, test, verbose, + estimator, X, y, sample_weight, scorer, train, test, verbose, parameters={param_name: v}, fit_params=None, return_train_score=True) for train, test in cv for v in param_range)
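
End to end, the two curve helpers accept the keyword as well. A closing sketch under the same assumptions (toy data; a balanced, alternating y keeps every training subset two-class so SVC can fit the smallest train sizes):

    import numpy as np
    from sklearn.learning_curve import learning_curve, validation_curve
    from sklearn.svm import SVC

    rng = np.random.RandomState(0)
    X = rng.randn(60, 3)
    y = np.tile([0, 1], 30)  # balanced, alternating classes
    w = rng.rand(60) + 0.5

    # Weighted learning curve: w is split per fold and per train size.
    sizes, train_scores, test_scores = learning_curve(
        SVC(), X, y, sample_weight=w, scoring="accuracy", cv=3)

    # Weighted validation curve over C.
    train_scores, test_scores = validation_curve(
        SVC(), X, y, "C", [0.1, 1.0, 10.0],
        sample_weight=w, scoring="accuracy", cv=3)
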