diff --git a/doc/modules/model_evaluation.rst b/doc/modules/model_evaluation.rst index 0a404d2d07946..7d65806acb807 100644 --- a/doc/modules/model_evaluation.rst +++ b/doc/modules/model_evaluation.rst @@ -173,7 +173,7 @@ Here is an example of building custom scorers, and of using the >>> # and predictions defined below. >>> loss = make_scorer(my_custom_loss_func, greater_is_better=False) >>> score = make_scorer(my_custom_loss_func, greater_is_better=True) - >>> ground_truth = [[1, 1]] + >>> ground_truth = [[1], [1]] >>> predictions = [0, 1] >>> from sklearn.dummy import DummyClassifier >>> clf = DummyClassifier(strategy='most_frequent', random_state=0) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index dfc2be923eff2..bb52411e2fba4 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -213,7 +213,8 @@ Bug fixes - Fixed a bug where :class:`sklearn.linear_model.LassoLars` does not give the same result as the LassoLars implementation available - in R (lars library). :issue:`7849` by :user:`Jair Montoya Martinez ` + in R (lars library). :issue:`7849` by :user:`Jair Montoya Martinez `. + - Some ``fetch_`` functions in `sklearn.datasets` were ignoring the ``download_if_missing`` keyword. This was fixed in :issue:`7944` by :user:`Ralf Gommers `. @@ -223,9 +224,9 @@ Bug fixes where a float being compared to ``0.0`` using ``==`` caused a divide by zero error. This was fixed in :issue:`7970` by :user:`He Chen `. - - Fix a bug regarding fitting :class:`sklearn.cluster.KMeans` with a - sparse array X and initial centroids, where X's means were unnecessarily - being subtracted from the centroids. :issue:`7872` by `Josh Karnofsky `_. + - Fix a bug regarding fitting :class:`sklearn.cluster.KMeans` with a sparse + array X and initial centroids, where X's means were unnecessarily being + subtracted from the centroids. :issue:`7872` by :user:`Josh Karnofsky `. - Fix estimators to accept a ``sample_weight`` parameter of type ``pandas.Series`` in their ``fit`` function. :issue:`7825` by @@ -249,6 +250,20 @@ Bug fixes ``min_impurity_split`` parameter. :issue:`8006` by :user:`Sebastian Pölsterl `. + - Fixes to the input validation in + :class:`sklearn.covariance.EllipticEnvelope`. + :issue:`8086` by `Andreas Müller`_. + + - Fix output shape and bugs with n_jobs > 1 in + :class:`sklearn.decomposition.SparseCoder` transform and :func:`sklearn.decomposition.sparse_encode` + for one-dimensional data and one component. + This also impacts the output shape of :class:`sklearn.decomposition.DictionaryLearning`. + :issue:`8086` by `Andreas Müller`_. + + - Several fixes to input validation in + :class:`multiclass.OutputCodeClassifier`. + :issue:`8086` by `Andreas Müller`_. + - Fix a bug where :class:`sklearn.ensemble.gradient_boosting.QuantileLossFunction` computed negative errors for negative values of ``ytrue - ypred`` leading to @@ -336,6 +351,20 @@ API changes summary :func:`sklearn.model_selection.cross_val_predict`. :issue:`2879` by :user:`Stephen Hoover `. + + - Gradient boosting base models are no longer estimators. By `Andreas Müller`_. + + - :class:`feature_selection.SelectFromModel` now validates the ``threshold`` + parameter and sets the ``threshold_`` attribute during the call to + ``fit``, and no longer during the call to ``transform``, by `Andreas + Müller`_. + + - :class:`feature_selection.SelectFromModel` now has a ``partial_fit`` + method only if the underlying estimator does. By `Andreas Müller`_. 
+ + - :class:`multiclass.OneVsRestClassifier` now has a ``partial_fit`` method + only if the underlying estimator does. By `Andreas Müller`_. + - Estimators with both methods ``decision_function`` and ``predict_proba`` are now required to have a monotonic relation between them. The method ``check_decision_proba_consistency`` has been added in diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py index 31307e55801a5..38fcff94d7505 100644 --- a/sklearn/cluster/tests/test_k_means.py +++ b/sklearn/cluster/tests/test_k_means.py @@ -404,7 +404,7 @@ def test_minibatch_sensible_reassign_partial_fit(): def test_minibatch_reassign(): # Give a perfect initialization, but a large reassignment_ratio, # as a result all the centers should be reassigned and the model - # should not longer be good + # should no longer be good for this_X in (X, X_csr): mb_k_means = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, random_state=42) diff --git a/sklearn/covariance/outlier_detection.py b/sklearn/covariance/outlier_detection.py index 1cafe885fdd47..9fe219ba5d0b6 100644 --- a/sklearn/covariance/outlier_detection.py +++ b/sklearn/covariance/outlier_detection.py @@ -15,8 +15,8 @@ import numpy as np import scipy as sp from . import MinCovDet -from ..base import ClassifierMixin -from ..utils.validation import check_is_fitted +from ..utils.validation import check_is_fitted, check_array +from ..metrics import accuracy_score class OutlierDetectionMixin(object): @@ -63,11 +63,11 @@ def decision_function(self, X, raw_values=False): """ check_is_fitted(self, 'threshold_') + X = check_array(X) mahal_dist = self.mahalanobis(X) if raw_values: decision = mahal_dist else: - check_is_fitted(self, 'threshold_') transformed_mahal_dist = mahal_dist ** 0.33 decision = self.threshold_ ** 0.33 - transformed_mahal_dist @@ -91,6 +91,7 @@ def predict(self, X): """ check_is_fitted(self, 'threshold_') + X = check_array(X) is_inlier = -np.ones(X.shape[0], dtype=int) if self.contamination is not None: values = self.decision_function(X, raw_values=True) @@ -100,8 +101,34 @@ def predict(self, X): return is_inlier + def score(self, X, y, sample_weight=None): + """Returns the mean accuracy on the given test data and labels. -class EllipticEnvelope(ClassifierMixin, OutlierDetectionMixin, MinCovDet): + In multi-label classification, this is the subset accuracy + which is a harsh metric since you require for each sample that + each label set be correctly predicted. + + Parameters + ---------- + X : array-like, shape = (n_samples, n_features) + Test samples. + + y : array-like, shape = (n_samples,) or (n_samples, n_outputs) + True labels for X. + + sample_weight : array-like, shape = (n_samples,), optional + Sample weights. + + Returns + ------- + score : float + Mean accuracy of self.predict(X) wrt. y. + + """ + return accuracy_score(y, self.predict(X), sample_weight=sample_weight) + + +class EllipticEnvelope(OutlierDetectionMixin, MinCovDet): """An object for detecting outliers in a Gaussian distributed dataset. Read more in the :ref:`User Guide `. 
diff --git a/sklearn/decomposition/dict_learning.py b/sklearn/decomposition/dict_learning.py index 154987a6279c4..b9bb0fcea864c 100644 --- a/sklearn/decomposition/dict_learning.py +++ b/sklearn/decomposition/dict_learning.py @@ -94,6 +94,11 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', if X.ndim == 1: X = X[:, np.newaxis] n_samples, n_features = X.shape + n_components = dictionary.shape[0] + if dictionary.shape[1] != X.shape[1]: + raise ValueError("Dictionary and X have different numbers of features:" + "dictionary.shape: {} X.shape{}".format( + dictionary.shape, X.shape)) if cov is None and algorithm != 'lasso_cd': # overwriting cov is safe copy_cov = False @@ -157,6 +162,8 @@ def _sparse_encode(X, dictionary, gram, cov=None, algorithm='lasso_lars', raise ValueError('Sparse coding method must be "lasso_lars" ' '"lasso_cd", "lasso", "threshold" or "omp", got %s.' % algorithm) + if new_code.ndim != 2: + return new_code.reshape(n_samples, n_components) return new_code @@ -281,10 +288,6 @@ def sparse_encode(X, dictionary, gram=None, cov=None, algorithm='lasso_lars', max_iter=max_iter, check_input=False, verbose=verbose) - # This ensure that dimensionality of code is always 2, - # consistant with the case n_jobs > 1 - if code.ndim == 1: - code = code[np.newaxis, :] return code # Enter parallel code block @@ -731,8 +734,8 @@ def dict_learning_online(X, n_components=2, alpha=1, n_iter=100, sys.stdout.flush() elif verbose: if verbose > 10 or ii % ceil(100. / verbose) == 0: - print ("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)" - % (ii, dt, dt / 60)) + print("Iteration % 3i (elapsed time: % 3is, % 4.1fmn)" + % (ii, dt, dt / 60)) this_code = sparse_encode(this_X, dictionary.T, algorithm=method, alpha=alpha, n_jobs=n_jobs).T @@ -820,7 +823,6 @@ def transform(self, X, y=None): """ check_is_fitted(self, 'components_') - # XXX : kwargs is not documented X = check_array(X) n_samples, n_features = X.shape @@ -906,6 +908,7 @@ class SparseCoder(BaseEstimator, SparseCodingMixin): MiniBatchSparsePCA sparse_encode """ + _required_parameters = ["dictionary"] def __init__(self, dictionary, transform_algorithm='omp', transform_n_nonzero_coefs=None, transform_alpha=None, diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py index b7ed5c4703492..5bf9836aa6a9e 100644 --- a/sklearn/decomposition/tests/test_dict_learning.py +++ b/sklearn/decomposition/tests/test_dict_learning.py @@ -1,4 +1,5 @@ import numpy as np +import itertools from sklearn.exceptions import ConvergenceWarning @@ -25,10 +26,27 @@ X = rng_global.randn(n_samples, n_features) +def test_sparse_encode_shapes_omp(): + rng = np.random.RandomState(0) + algorithms = ['omp', 'lasso_lars', 'lasso_cd', 'lars', 'threshold'] + for n_components, n_samples in itertools.product([1, 5], [1, 9]): + X_ = rng.randn(n_samples, n_features) + dictionary = rng.randn(n_components, n_features) + for algorithm, n_jobs in itertools.product(algorithms, [1, 3]): + code = sparse_encode(X_, dictionary, algorithm=algorithm, + n_jobs=n_jobs) + assert_equal(code.shape, (n_samples, n_components)) + + def test_dict_learning_shapes(): n_components = 5 dico = DictionaryLearning(n_components, random_state=0).fit(X) - assert_true(dico.components_.shape == (n_components, n_features)) + assert_equal(dico.components_.shape, (n_components, n_features)) + + n_components = 1 + dico = DictionaryLearning(n_components, random_state=0).fit(X) + assert_equal(dico.components_.shape, (n_components, 
n_features)) + assert_equal(dico.transform(X).shape, (X.shape[0], n_components)) def test_dict_learning_overcomplete(): diff --git a/sklearn/decomposition/truncated_svd.py b/sklearn/decomposition/truncated_svd.py index c1fc1f3b18435..87b8b45e1543a 100644 --- a/sklearn/decomposition/truncated_svd.py +++ b/sklearn/decomposition/truncated_svd.py @@ -11,7 +11,7 @@ from scipy.sparse.linalg import svds from ..base import BaseEstimator, TransformerMixin -from ..utils import check_array, as_float_array, check_random_state +from ..utils import check_array, check_random_state from ..utils.extmath import randomized_svd, safe_sparse_dot, svd_flip from ..utils.sparsefuncs import mean_variance_axis @@ -153,13 +153,9 @@ def fit_transform(self, X, y=None): X_new : array, shape (n_samples, n_components) Reduced version of X. This will always be a dense array. """ - X = as_float_array(X, copy=False) + X = check_array(X, accept_sparse=['csr', 'csc']) random_state = check_random_state(self.random_state) - # If sparse and not csr or csc, convert to csr - if sp.issparse(X) and X.getformat() not in ["csr", "csc"]: - X = X.tocsr() - if self.algorithm == "arpack": U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol) # svds doesn't abide by scipy.linalg.svd/randomized_svd diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 90a43791c81b6..9d139454d6e2c 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -120,6 +120,8 @@ def fit(self, X, y, sample_weight=None): self.sparse_output_ = sp.issparse(y) + check_consistent_length(X, y) + if not self.sparse_output_: y = np.atleast_1d(y) @@ -184,7 +186,7 @@ def predict(self, X): classes_ = self.classes_ class_prior_ = self.class_prior_ constant = self.constant - if self.n_outputs_ == 1: + if self.n_outputs_ == 1 and not self.output_2d_: # Get same type even for self.n_outputs_ == 1 n_classes_ = [n_classes_] classes_ = [classes_] @@ -193,7 +195,7 @@ def predict(self, X): # Compute probability only once if self.strategy == "stratified": proba = self.predict_proba(X) - if self.n_outputs_ == 1: + if self.n_outputs_ == 1 and not self.output_2d_: proba = [proba] if self.sparse_output_: @@ -399,6 +401,7 @@ def fit(self, X, y, sample_weight=None): % self.strategy) y = check_array(y, ensure_2d=False) + if len(y) == 0: raise ValueError("y must not be empty.") diff --git a/sklearn/ensemble/base.py b/sklearn/ensemble/base.py index 5e9d6e2e1fc3c..2477cc1c21c7d 100644 --- a/sklearn/ensemble/base.py +++ b/sklearn/ensemble/base.py @@ -12,6 +12,8 @@ from ..base import BaseEstimator from ..base import MetaEstimatorMixin from ..utils import _get_n_jobs, check_random_state +from ..externals import six +from abc import ABCMeta, abstractmethod MAX_RAND_SEED = np.iinfo(np.int32).max @@ -55,7 +57,8 @@ def _set_random_states(estimator, random_state=None): estimator.set_params(**to_set) -class BaseEnsemble(BaseEstimator, MetaEstimatorMixin): +class BaseEnsemble(six.with_metaclass(ABCMeta, BaseEstimator, + MetaEstimatorMixin)): """Base class for all ensemble classes. Warning: This class should not be used directly. Use derived classes @@ -82,6 +85,7 @@ class BaseEnsemble(BaseEstimator, MetaEstimatorMixin): The collection of fitted base estimators. 
""" + @abstractmethod def __init__(self, base_estimator, n_estimators=10, estimator_params=tuple()): # Set parameters diff --git a/sklearn/ensemble/gradient_boosting.py b/sklearn/ensemble/gradient_boosting.py index 9ef973b8bbc56..8c16ccf78ffa6 100644 --- a/sklearn/ensemble/gradient_boosting.py +++ b/sklearn/ensemble/gradient_boosting.py @@ -64,7 +64,7 @@ from ..exceptions import NotFittedError -class QuantileEstimator(BaseEstimator): +class QuantileEstimator(object): """An estimator predicting the alpha-quantile of the training targets.""" def __init__(self, alpha=0.9): if not 0 < alpha < 1.0: @@ -86,7 +86,7 @@ def predict(self, X): return y -class MeanEstimator(BaseEstimator): +class MeanEstimator(object): """An estimator predicting the mean of the training targets.""" def fit(self, X, y, sample_weight=None): if sample_weight is None: @@ -102,7 +102,7 @@ def predict(self, X): return y -class LogOddsEstimator(BaseEstimator): +class LogOddsEstimator(object): """An estimator predicting the log odds ratio.""" scale = 1.0 @@ -132,7 +132,7 @@ class ScaledLogOddsEstimator(LogOddsEstimator): scale = 0.5 -class PriorProbabilityEstimator(BaseEstimator): +class PriorProbabilityEstimator(object): """An estimator predicting the probability of each class in the training data. """ @@ -150,7 +150,7 @@ def predict(self, X): return y -class ZeroEstimator(BaseEstimator): +class ZeroEstimator(object): """An estimator that simply predicts zero. """ def fit(self, X, y, sample_weight=None): diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py index 88382f7d13c0b..341486abd3b1c 100644 --- a/sklearn/feature_extraction/tests/test_text.py +++ b/sklearn/feature_extraction/tests/test_text.py @@ -420,7 +420,7 @@ def test_vectorizer(): # test tf alone t2 = TfidfTransformer(norm='l1', use_idf=False) tf = t2.fit(counts_train).transform(counts_train).toarray() - assert_equal(t2.idf_, None) + assert_false(hasattr(t2, "idf_")) # test idf transform with unlearned idf vector t3 = TfidfTransformer(use_idf=True) diff --git a/sklearn/feature_extraction/text.py b/sklearn/feature_extraction/text.py index 2484be7166cfa..539e88973bcc0 100644 --- a/sklearn/feature_extraction/text.py +++ b/sklearn/feature_extraction/text.py @@ -29,7 +29,6 @@ from ..preprocessing import normalize from .hashing import FeatureHasher from .stop_words import ENGLISH_STOP_WORDS -from ..utils import deprecated from ..utils.fixes import frombuffer_empty, bincount from ..utils.validation import check_is_fitted @@ -1036,7 +1035,7 @@ def fit(self, X, y=None): # log+1 instead of log makes sure terms with zero idf don't get # suppressed entirely. 
idf = np.log(float(n_samples) / df) + 1.0 - self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, + self._idf_diag = sp.spdiags(idf, diags=0, m=n_features, n=n_features, format='csr') return self @@ -1088,10 +1087,9 @@ def transform(self, X, copy=True): @property def idf_(self): - if hasattr(self, "_idf_diag"): - return np.ravel(self._idf_diag.sum(axis=0)) - else: - return None + # if _idf_diag is not set, this will raise an attribute error, + # which means hasattr(self, "idf_") is False + return np.ravel(self._idf_diag.sum(axis=0)) class TfidfVectorizer(CountVectorizer): diff --git a/sklearn/feature_selection/from_model.py b/sklearn/feature_selection/from_model.py index 7fe4456ccd390..dada33e9a75cc 100644 --- a/sklearn/feature_selection/from_model.py +++ b/sklearn/feature_selection/from_model.py @@ -4,11 +4,12 @@ import numpy as np from .base import SelectorMixin -from ..base import BaseEstimator, clone +from ..base import BaseEstimator, clone, MetaEstimatorMixin from ..externals import six from ..exceptions import NotFittedError from ..utils.fixes import norm +from ..utils.metaestimators import if_delegate_has_method def _get_feature_importances(estimator, norm_order=1): @@ -76,7 +77,7 @@ def _calculate_threshold(estimator, importances, threshold): return threshold -class SelectFromModel(BaseEstimator, SelectorMixin): +class SelectFromModel(BaseEstimator, SelectorMixin, MetaEstimatorMixin): """Meta-transformer for selecting features based on importance weights. .. versionadded:: 0.17 @@ -121,7 +122,6 @@ class SelectFromModel(BaseEstimator, SelectorMixin): threshold_ : float The threshold value used for feature selection. """ - def __init__(self, estimator, threshold=None, prefit=False, norm_order=1): self.estimator = estimator self.threshold = threshold @@ -136,12 +136,11 @@ def _get_support_mask(self): estimator = self.estimator_ else: raise ValueError( - 'Either fit the model before transform or set "prefit=True"' - ' while passing the fitted estimator to the constructor.') + 'Either fit SelectFromModel before transform or set "prefit=' + 'True" and pass a fitted estimator to the constructor.') scores = _get_feature_importances(estimator, self.norm_order) - self.threshold_ = _calculate_threshold(estimator, scores, - self.threshold) - return scores >= self.threshold_ + threshold = _calculate_threshold(estimator, scores, self.threshold) + return scores >= threshold def fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer. @@ -169,6 +168,12 @@ def fit(self, X, y=None, **fit_params): self.estimator_.fit(X, y, **fit_params) return self + @property + def threshold_(self): + scores = _get_feature_importances(self.estimator_, self.norm_order) + return _calculate_threshold(self.estimator, scores, self.threshold) + + @if_delegate_has_method('estimator') def partial_fit(self, X, y=None, **fit_params): """Fit the SelectFromModel meta-transformer only once. diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py index d92e341676371..31ff0057d8d8e 100644 --- a/sklearn/feature_selection/rfe.py +++ b/sklearn/feature_selection/rfe.py @@ -30,6 +30,7 @@ def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer): X_train, y_train, lambda estimator, features: _score(estimator, X_test[:, features], y_test, scorer)).scores_ + class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin): """Feature ranking with recursive feature elimination. @@ -293,8 +294,8 @@ class RFECV(RFE, MetaEstimatorMixin): - An iterable yielding train/test splits. 
For integer/None inputs, if ``y`` is binary or multiclass, - :class:`sklearn.model_selection.StratifiedKFold` is used. If the - estimator is a classifier or if ``y`` is neither binary nor multiclass, + :class:`sklearn.model_selection.StratifiedKFold` is used. If the + estimator is a classifier or if ``y`` is neither binary nor multiclass, :class:`sklearn.model_selection.KFold` is used. Refer :ref:`User Guide ` for the various diff --git a/sklearn/feature_selection/tests/test_from_model.py b/sklearn/feature_selection/tests/test_from_model.py index 6efb6f405bb1c..6ef0d824b587c 100644 --- a/sklearn/feature_selection/tests/test_from_model.py +++ b/sklearn/feature_selection/tests/test_from_model.py @@ -1,6 +1,7 @@ import numpy as np from sklearn.utils.testing import assert_true +from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_less from sklearn.utils.testing import assert_greater @@ -120,6 +121,10 @@ def test_partial_fit(): transformer.fit(np.vstack((data, data)), np.concatenate((y, y))) assert_array_equal(X_transform, transformer.transform(data)) + # check that if est doesn't have partial_fit, neither does SelectFromModel + transformer = SelectFromModel(estimator=RandomForestClassifier()) + assert_false(hasattr(transformer, "partial_fit")) + def test_calling_fit_reinitializes(): est = LinearSVC(random_state=0) @@ -171,10 +176,10 @@ def test_threshold_string(): def test_threshold_without_refitting(): """Test that the threshold can be set without refitting the model.""" clf = SGDClassifier(alpha=0.1, n_iter=10, shuffle=True, random_state=0) - model = SelectFromModel(clf, threshold=0.1) + model = SelectFromModel(clf, threshold="0.1 * mean") model.fit(data, y) X_transform = model.transform(data) # Set a higher threshold to filter out more features. - model.threshold = 1.0 + model.threshold = "1.0 * mean" assert_greater(X_transform.shape[1], model.transform(data).shape[1]) diff --git a/sklearn/multiclass.py b/sklearn/multiclass.py index 8f9788e6a425c..712e8573fa469 100644 --- a/sklearn/multiclass.py +++ b/sklearn/multiclass.py @@ -46,7 +46,7 @@ from .utils import check_random_state from .utils.validation import _num_samples from .utils.validation import check_is_fitted -from .utils.validation import check_X_y +from .utils.validation import check_X_y, check_array from .utils.multiclass import (_check_partial_fit_first_call, check_classification_targets, _ovr_decision_function) @@ -176,7 +176,6 @@ class OneVsRestClassifier(BaseEstimator, ClassifierMixin, MetaEstimatorMixin): multilabel_ : boolean Whether a OneVsRestClassifier is a multilabel classifier. 
""" - def __init__(self, estimator, n_jobs=1): self.estimator = estimator self.n_jobs = n_jobs @@ -217,6 +216,7 @@ def fit(self, X, y): return self + @if_delegate_has_method('estimator') def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators @@ -488,8 +488,12 @@ def fit(self, X, y): self """ X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + check_classification_targets(y) self.classes_ = np.unique(y) + if len(self.classes_) == 1: + raise ValueError("OneVsOneClassifier can not be fit when only one" + " class is present.") n_classes = self.classes_.shape[0] estimators_indices = list(zip(*(Parallel(n_jobs=self.n_jobs)( delayed(_fit_ovo_binary) @@ -498,13 +502,14 @@ def fit(self, X, y): self.estimators_ = estimators_indices[0] try: - self.pairwise_indices_ = estimators_indices[1] \ - if self._pairwise else None + self.pairwise_indices_ = ( + estimators_indices[1] if self._pairwise else None) except AttributeError: self.pairwise_indices_ = None return self + @if_delegate_has_method(delegate='estimator') def partial_fit(self, X, y, classes=None): """Partially fit underlying estimators @@ -544,8 +549,8 @@ def partial_fit(self, X, y, classes=None): n_jobs=self.n_jobs)( delayed(_partial_fit_ovo_binary)( estimator, X, y, self.classes_[i], self.classes_[j]) - for estimator, (i, j) in izip( - self.estimators_, (combinations))) + for estimator, (i, j) in izip(self.estimators_, + (combinations))) self.pairwise_indices_ = None @@ -703,12 +708,14 @@ def fit(self, X, y): ------- self """ + X, y = check_X_y(X, y) if self.code_size <= 0: raise ValueError("code_size should be greater than 0, got {1}" "".format(self.code_size)) _check_estimator(self.estimator) random_state = check_random_state(self.random_state) + check_classification_targets(y) self.classes_ = np.unique(y) n_classes = self.classes_.shape[0] @@ -749,6 +756,7 @@ def predict(self, X): Predicted multi-class targets. 
""" check_is_fitted(self, 'estimators_') + X = check_array(X) Y = np.array([_predict_binary(e, X) for e in self.estimators_]).T pred = euclidean_distances(Y, self.code_book_).argmin(axis=1) return self.classes_[pred] diff --git a/sklearn/multioutput.py b/sklearn/multioutput.py index 826ece6d50d98..f608936e952ab 100644 --- a/sklearn/multioutput.py +++ b/sklearn/multioutput.py @@ -16,13 +16,14 @@ import numpy as np -from abc import ABCMeta -from .base import BaseEstimator, clone +from abc import ABCMeta, abstractmethod +from .base import BaseEstimator, clone, MetaEstimatorMixin from .base import RegressorMixin, ClassifierMixin from .utils import check_array, check_X_y from .utils.fixes import parallel_helper from .utils.validation import check_is_fitted, has_fit_parameter from .utils.metaestimators import if_delegate_has_method +from .utils.multiclass import check_classification_targets from .externals.joblib import Parallel, delayed from .externals import six @@ -57,8 +58,9 @@ def _partial_fit_estimator(estimator, X, y, classes=None, sample_weight=None, return estimator -class MultiOutputEstimator(six.with_metaclass(ABCMeta, BaseEstimator)): - +class MultiOutputEstimator(six.with_metaclass(ABCMeta, BaseEstimator, + MetaEstimatorMixin)): + @abstractmethod def __init__(self, estimator, n_jobs=1): self.estimator = estimator self.n_jobs = n_jobs @@ -149,6 +151,9 @@ def fit(self, X, y, sample_weight=None): multi_output=True, accept_sparse=True) + if isinstance(self, ClassifierMixin): + check_classification_targets(y) + if y.ndim == 1: raise ValueError("y must have at least two dimensions for " "multi-output regression but has only one.") diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py index 1320e2996b6e2..c351d1169f4b2 100644 --- a/sklearn/naive_bayes.py +++ b/sklearn/naive_bayes.py @@ -484,13 +484,13 @@ def partial_fit(self, X, y, classes=None, sample_weight=None): y : array-like, shape = [n_samples] Target values. - classes : array-like, shape = [n_classes], optional (default=None) + classes : array-like, shape = [n_classes] (default=None) List of all the classes that can possibly appear in the y vector. Must be provided at the first call to partial_fit, can be omitted in subsequent calls. - sample_weight : array-like, shape = [n_samples], optional (default=None) + sample_weight : array-like, shape = [n_samples] (default=None) Weights applied to individual samples (1. for unweighted). Returns @@ -555,7 +555,7 @@ def fit(self, X, y, sample_weight=None): y : array-like, shape = [n_samples] Target values. - sample_weight : array-like, shape = [n_samples], optional (default=None) + sample_weight : array-like, shape = [n_samples], (default=None) Weights applied to individual samples (1. for unweighted). 
Returns diff --git a/sklearn/neighbors/approximate.py b/sklearn/neighbors/approximate.py index 7f7321abdfb1c..c19cb408d643d 100644 --- a/sklearn/neighbors/approximate.py +++ b/sklearn/neighbors/approximate.py @@ -93,7 +93,7 @@ class GaussianRandomProjectionHash(ProjectionToHashMixin, GaussianRandomProjection): """Use GaussianRandomProjection to produce a cosine LSH fingerprint""" def __init__(self, - n_components=8, + n_components=32, random_state=None): super(GaussianRandomProjectionHash, self).__init__( n_components=n_components, diff --git a/sklearn/tests/test_multiclass.py b/sklearn/tests/test_multiclass.py index b62e78e87c223..20ec4b132fc7f 100644 --- a/sklearn/tests/test_multiclass.py +++ b/sklearn/tests/test_multiclass.py @@ -13,7 +13,8 @@ from sklearn.multiclass import OneVsRestClassifier from sklearn.multiclass import OneVsOneClassifier from sklearn.multiclass import OutputCodeClassifier -from sklearn.utils.multiclass import check_classification_targets, type_of_target +from sklearn.utils.multiclass import (check_classification_targets, + type_of_target) from sklearn.utils import shuffle from sklearn.metrics import precision_score @@ -104,6 +105,10 @@ def test_ovr_partial_fit(): pred1 = ovr1.fit(X, y).predict(X) assert_equal(np.mean(pred == y), np.mean(pred1 == y)) + # test partial_fit only exists if estimator has it: + ovr = OneVsRestClassifier(SVC()) + assert_false(hasattr(ovr, "partial_fit")) + def test_ovr_partial_fit_exceptions(): ovr = OneVsRestClassifier(MultinomialNB()) @@ -428,7 +433,8 @@ def test_ovr_pipeline(): def test_ovr_coef_(): - for base_classifier in [SVC(kernel='linear', random_state=0), LinearSVC(random_state=0)]: + for base_classifier in [SVC(kernel='linear', random_state=0), + LinearSVC(random_state=0)]: # SVC has sparse coef with sparse input data ovr = OneVsRestClassifier(base_classifier) @@ -439,7 +445,8 @@ def test_ovr_coef_(): assert_equal(shape[0], n_classes) assert_equal(shape[1], iris.data.shape[1]) # don't densify sparse coefficients - assert_equal(sp.issparse(ovr.estimators_[0].coef_), sp.issparse(ovr.coef_)) + assert_equal(sp.issparse(ovr.estimators_[0].coef_), + sp.issparse(ovr.coef_)) def test_ovr_coef_exceptions(): @@ -508,6 +515,10 @@ def test_ovo_partial_fit_predict(): assert_equal(len(ovo1.estimators_), len(np.unique(iris.target))) assert_greater(np.mean(iris.target == pred1), 0.65) + # test partial_fit only exists if estimator has it: + ovr = OneVsOneClassifier(SVC()) + assert_false(hasattr(ovr, "partial_fit")) + def test_ovo_decision_function(): n_samples = iris.data.shape[0] @@ -606,6 +617,24 @@ def test_ovo_string_y(): assert_array_equal(y, ovo.predict(X)) +def test_ovo_one_class(): + # Test error for OvO with one class + X = np.eye(4) + y = np.array(['a'] * 4) + + ovo = OneVsOneClassifier(LinearSVC()) + assert_raise_message(ValueError, "when only one class", ovo.fit, X, y) + + +def test_ovo_float_y(): + # Test that the OvO errors on float targets + X = iris.data + y = iris.data[:, 0] + + ovo = OneVsOneClassifier(LinearSVC()) + assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y) + + def test_ecoc_exceptions(): ecoc = OutputCodeClassifier(LinearSVC(random_state=0)) assert_raises(ValueError, ecoc.predict, []) @@ -634,6 +663,15 @@ def test_ecoc_gridsearch(): assert_true(best_C in Cs) +def test_ecoc_float_y(): + # Test that the OCC errors on float targets + X = iris.data + y = iris.data[:, 0] + + ovo = OutputCodeClassifier(LinearSVC()) + assert_raise_message(ValueError, "Unknown label type", ovo.fit, X, y) + + def 
test_pairwise_indices(): clf_precomputed = svm.SVC(kernel='precomputed') X, y = iris.data, iris.target diff --git a/sklearn/tests/test_multioutput.py b/sklearn/tests/test_multioutput.py index 163363155ca3d..a4217bea63a7c 100644 --- a/sklearn/tests/test_multioutput.py +++ b/sklearn/tests/test_multioutput.py @@ -6,6 +6,7 @@ from sklearn.utils.testing import assert_raises from sklearn.utils.testing import assert_false from sklearn.utils.testing import assert_raises_regex +from sklearn.utils.testing import assert_raise_message from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_not_equal @@ -336,3 +337,5 @@ def test_multi_output_exceptions(): y_new = np.column_stack((y1, y2)) moc.fit(X, y) assert_raises(ValueError, moc.score, X, y_new) + # ValueError when y is continuous + assert_raise_message(ValueError, "Unknown label type", moc.fit, X, X[:, 1]) diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py index 2a2cfe1c30fbf..2d3c80510db0d 100644 --- a/sklearn/utils/multiclass.py +++ b/sklearn/utils/multiclass.py @@ -23,6 +23,7 @@ from ..utils.fixes import bincount from ..utils.fixes import array_equal + def _unique_multiclass(y): if hasattr(y, '__array__'): return np.unique(np.asarray(y)) @@ -155,6 +156,7 @@ def is_multilabel(y): return len(labels) < 3 and (y.dtype.kind in 'biu' or # bool, int, uint _is_integral_float(labels)) + def check_classification_targets(y): """Ensure that target y is of a non-regression type. @@ -168,11 +170,10 @@ def check_classification_targets(y): """ y_type = type_of_target(y) if y_type not in ['binary', 'multiclass', 'multiclass-multioutput', - 'multilabel-indicator', 'multilabel-sequences']: + 'multilabel-indicator', 'multilabel-sequences']: raise ValueError("Unknown label type: %r" % y_type) - def type_of_target(y): """Determine the type of data indicated by target `y`