diff --git a/sklearn/linear_model/tests/test_coordinate_descent.py b/sklearn/linear_model/tests/test_coordinate_descent.py index a3b35f40a88d7..fb65d800e78ba 100644 --- a/sklearn/linear_model/tests/test_coordinate_descent.py +++ b/sklearn/linear_model/tests/test_coordinate_descent.py @@ -706,17 +706,17 @@ def test_overrided_gram_matrix(): clf.fit, X, y) -def test_lasso_non_float_y(): +@pytest.mark.parametrize('model', [ElasticNet, Lasso]) +def test_lasso_non_float_y(model): X = [[0, 0], [1, 1], [-1, -1]] y = [0, 1, 2] y_float = [0.0, 1.0, 2.0] - for model in [ElasticNet, Lasso]: - clf = model(fit_intercept=False) - clf.fit(X, y) - clf_float = model(fit_intercept=False) - clf_float.fit(X, y_float) - assert_array_equal(clf.coef_, clf_float.coef_) + clf = model(fit_intercept=False) + clf.fit(X, y) + clf_float = model(fit_intercept=False) + clf_float.fit(X, y_float) + assert_array_equal(clf.coef_, clf_float.coef_) def test_enet_float_precision(): diff --git a/sklearn/linear_model/tests/test_least_angle.py b/sklearn/linear_model/tests/test_least_angle.py index e41df9cce1178..630559fe4fef2 100644 --- a/sklearn/linear_model/tests/test_least_angle.py +++ b/sklearn/linear_model/tests/test_least_angle.py @@ -3,6 +3,8 @@ import numpy as np from scipy import linalg +import pytest + from sklearn.model_selection import train_test_split from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_array_almost_equal @@ -172,18 +174,20 @@ def test_no_path_all_precomputed(): assert_true(alpha_ == alphas_[-1]) -def test_lars_precompute(): +@pytest.mark.parametrize( + 'classifier', + [linear_model.Lars, linear_model.LarsCV, linear_model.LassoLarsIC]) +def test_lars_precompute(classifier): # Check for different values of precompute X, y = diabetes.data, diabetes.target G = np.dot(X.T, X) - for classifier in [linear_model.Lars, linear_model.LarsCV, - linear_model.LassoLarsIC]: - clf = classifier(precompute=G) - output_1 = ignore_warnings(clf.fit)(X, y).coef_ - for precompute in [True, False, 'auto', None]: - clf = classifier(precompute=precompute) - output_2 = clf.fit(X, y).coef_ - assert_array_almost_equal(output_1, output_2, decimal=8) + + clf = classifier(precompute=G) + output_1 = ignore_warnings(clf.fit)(X, y).coef_ + for precompute in [True, False, 'auto', None]: + clf = classifier(precompute=precompute) + output_2 = clf.fit(X, y).coef_ + assert_array_almost_equal(output_1, output_2, decimal=8) def test_singular_matrix(): diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py index a179c89e199a3..e363fed2abb9d 100644 --- a/sklearn/linear_model/tests/test_logistic.py +++ b/sklearn/linear_model/tests/test_logistic.py @@ -1,6 +1,9 @@ import numpy as np import scipy.sparse as sp from scipy import linalg, optimize, sparse + +import pytest + from sklearn.datasets import load_iris, make_classification from sklearn.metrics import log_loss from sklearn.model_selection import StratifiedKFold @@ -139,63 +142,63 @@ def test_predict_iris(): assert_greater(np.mean(pred == target), .95) -def test_multinomial_validation(): - for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']: - lr = LogisticRegression(C=-1, solver=solver, multi_class='multinomial') - assert_raises(ValueError, lr.fit, [[0, 1], [1, 0]], [0, 1]) +@pytest.mark.parametrize('solver', ['lbfgs', 'newton-cg', 'sag', 'saga']) +def test_multinomial_validation(solver): + lr = LogisticRegression(C=-1, solver=solver, multi_class='multinomial') + assert_raises(ValueError, lr.fit, [[0, 1], [1, 
0]], [0, 1]) -def test_check_solver_option(): +@pytest.mark.parametrize('LR', [LogisticRegression, LogisticRegressionCV]) +def test_check_solver_option(LR): X, y = iris.data, iris.target - for LR in [LogisticRegression, LogisticRegressionCV]: - msg = ('Logistic Regression supports only liblinear, newton-cg, ' - 'lbfgs, sag and saga solvers, got wrong_name') - lr = LR(solver="wrong_name") + msg = ('Logistic Regression supports only liblinear, newton-cg, ' + 'lbfgs, sag and saga solvers, got wrong_name') + lr = LR(solver="wrong_name") + assert_raise_message(ValueError, msg, lr.fit, X, y) + + msg = "multi_class should be either multinomial or ovr, got wrong_name" + lr = LR(solver='newton-cg', multi_class="wrong_name") + assert_raise_message(ValueError, msg, lr.fit, X, y) + + # only 'liblinear' solver + msg = "Solver liblinear does not support a multinomial backend." + lr = LR(solver='liblinear', multi_class='multinomial') + assert_raise_message(ValueError, msg, lr.fit, X, y) + + # all solvers except 'liblinear' + for solver in ['newton-cg', 'lbfgs', 'sag']: + msg = ("Solver %s supports only l2 penalties, got l1 penalty." % + solver) + lr = LR(solver=solver, penalty='l1') assert_raise_message(ValueError, msg, lr.fit, X, y) - - msg = "multi_class should be either multinomial or ovr, got wrong_name" - lr = LR(solver='newton-cg', multi_class="wrong_name") + for solver in ['newton-cg', 'lbfgs', 'sag', 'saga']: + msg = ("Solver %s supports only dual=False, got dual=True" % + solver) + lr = LR(solver=solver, dual=True) assert_raise_message(ValueError, msg, lr.fit, X, y) - # only 'liblinear' solver - msg = "Solver liblinear does not support a multinomial backend." - lr = LR(solver='liblinear', multi_class='multinomial') - assert_raise_message(ValueError, msg, lr.fit, X, y) - # all solvers except 'liblinear' - for solver in ['newton-cg', 'lbfgs', 'sag']: - msg = ("Solver %s supports only l2 penalties, got l1 penalty." % - solver) - lr = LR(solver=solver, penalty='l1') - assert_raise_message(ValueError, msg, lr.fit, X, y) - for solver in ['newton-cg', 'lbfgs', 'sag', 'saga']: - msg = ("Solver %s supports only dual=False, got dual=True" % - solver) - lr = LR(solver=solver, dual=True) - assert_raise_message(ValueError, msg, lr.fit, X, y) - - -def test_multinomial_binary(): +@pytest.mark.parametrize('solver', ['lbfgs', 'newton-cg', 'sag', 'saga']) +def test_multinomial_binary(solver): # Test multinomial LR on a binary problem. 
target = (iris.target > 0).astype(np.intp) target = np.array(["setosa", "not-setosa"])[target] - for solver in ['lbfgs', 'newton-cg', 'sag', 'saga']: - clf = LogisticRegression(solver=solver, multi_class='multinomial', - random_state=42, max_iter=2000) - clf.fit(iris.data, target) + clf = LogisticRegression(solver=solver, multi_class='multinomial', + random_state=42, max_iter=2000) + clf.fit(iris.data, target) - assert_equal(clf.coef_.shape, (1, iris.data.shape[1])) - assert_equal(clf.intercept_.shape, (1,)) - assert_array_equal(clf.predict(iris.data), target) + assert_equal(clf.coef_.shape, (1, iris.data.shape[1])) + assert_equal(clf.intercept_.shape, (1,)) + assert_array_equal(clf.predict(iris.data), target) - mlr = LogisticRegression(solver=solver, multi_class='multinomial', - random_state=42, fit_intercept=False) - mlr.fit(iris.data, target) - pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data), - axis=1)] - assert_greater(np.mean(pred == target), .9) + mlr = LogisticRegression(solver=solver, multi_class='multinomial', + random_state=42, fit_intercept=False) + mlr.fit(iris.data, target) + pred = clf.classes_[np.argmax(clf.predict_log_proba(iris.data), + axis=1)] + assert_greater(np.mean(pred == target), .9) def test_multinomial_binary_probabilities(): @@ -1043,7 +1046,9 @@ def test_max_iter(): assert_equal(lr.n_iter_[0], max_iter) -def test_n_iter(): +@pytest.mark.parametrize('solver', + ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs']) +def test_n_iter(solver): # Test that self.n_iter_ has the correct format. X, y = iris.data, iris.target y_bin = y.copy() @@ -1052,76 +1057,73 @@ def test_n_iter(): n_Cs = 4 n_cv_fold = 2 - for solver in ['newton-cg', 'liblinear', 'sag', 'saga', 'lbfgs']: - # OvR case - n_classes = 1 if solver == 'liblinear' else np.unique(y).shape[0] - clf = LogisticRegression(tol=1e-2, multi_class='ovr', - solver=solver, C=1., - random_state=42, max_iter=100) - clf.fit(X, y) - assert_equal(clf.n_iter_.shape, (n_classes,)) + # OvR case + n_classes = 1 if solver == 'liblinear' else np.unique(y).shape[0] + clf = LogisticRegression(tol=1e-2, multi_class='ovr', + solver=solver, C=1., + random_state=42, max_iter=100) + clf.fit(X, y) + assert_equal(clf.n_iter_.shape, (n_classes,)) - n_classes = np.unique(y).shape[0] - clf = LogisticRegressionCV(tol=1e-2, multi_class='ovr', - solver=solver, Cs=n_Cs, cv=n_cv_fold, - random_state=42, max_iter=100) - clf.fit(X, y) - assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) - clf.fit(X, y_bin) - assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs)) - - # multinomial case - n_classes = 1 - if solver in ('liblinear', 'sag', 'saga'): - break - - clf = LogisticRegression(tol=1e-2, multi_class='multinomial', - solver=solver, C=1., - random_state=42, max_iter=100) - clf.fit(X, y) - assert_equal(clf.n_iter_.shape, (n_classes,)) + n_classes = np.unique(y).shape[0] + clf = LogisticRegressionCV(tol=1e-2, multi_class='ovr', + solver=solver, Cs=n_Cs, cv=n_cv_fold, + random_state=42, max_iter=100) + clf.fit(X, y) + assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) + clf.fit(X, y_bin) + assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs)) + + # multinomial case + n_classes = 1 + if solver in ('liblinear', 'sag', 'saga'): + return + + clf = LogisticRegression(tol=1e-2, multi_class='multinomial', + solver=solver, C=1., + random_state=42, max_iter=100) + clf.fit(X, y) + assert_equal(clf.n_iter_.shape, (n_classes,)) - clf = LogisticRegressionCV(tol=1e-2, multi_class='multinomial', - solver=solver, Cs=n_Cs, 
cv=n_cv_fold, - random_state=42, max_iter=100) - clf.fit(X, y) - assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) - clf.fit(X, y_bin) - assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs)) + clf = LogisticRegressionCV(tol=1e-2, multi_class='multinomial', + solver=solver, Cs=n_Cs, cv=n_cv_fold, + random_state=42, max_iter=100) + clf.fit(X, y) + assert_equal(clf.n_iter_.shape, (n_classes, n_cv_fold, n_Cs)) + clf.fit(X, y_bin) + assert_equal(clf.n_iter_.shape, (1, n_cv_fold, n_Cs)) -def test_warm_start(): +@pytest.mark.parametrize('solver', ('newton-cg', 'sag', 'saga', 'lbfgs')) +@pytest.mark.parametrize('warm_start', (True, False)) +@pytest.mark.parametrize('fit_intercept', (True, False)) +@pytest.mark.parametrize('multi_class', ['ovr', 'multinomial']) +def test_warm_start(solver, warm_start, fit_intercept, multi_class): # A 1-iteration second fit on same data should give almost same result # with warm starting, and quite different result without warm starting. # Warm starting does not work with liblinear solver. X, y = iris.data, iris.target - solvers = ['newton-cg', 'sag', 'saga', 'lbfgs'] - - for warm_start in [True, False]: - for fit_intercept in [True, False]: - for solver in solvers: - for multi_class in ['ovr', 'multinomial']: - clf = LogisticRegression(tol=1e-4, multi_class=multi_class, - warm_start=warm_start, - solver=solver, - random_state=42, max_iter=100, - fit_intercept=fit_intercept) - with ignore_warnings(category=ConvergenceWarning): - clf.fit(X, y) - coef_1 = clf.coef_ - - clf.max_iter = 1 - clf.fit(X, y) - cum_diff = np.sum(np.abs(coef_1 - clf.coef_)) - msg = ("Warm starting issue with %s solver in %s mode " - "with fit_intercept=%s and warm_start=%s" - % (solver, multi_class, str(fit_intercept), - str(warm_start))) - if warm_start: - assert_greater(2.0, cum_diff, msg) - else: - assert_greater(cum_diff, 2.0, msg) + clf = LogisticRegression(tol=1e-4, multi_class=multi_class, + warm_start=warm_start, + solver=solver, + random_state=42, max_iter=100, + fit_intercept=fit_intercept) + with ignore_warnings(category=ConvergenceWarning): + clf.fit(X, y) + coef_1 = clf.coef_ + + clf.max_iter = 1 + clf.fit(X, y) + cum_diff = np.sum(np.abs(coef_1 - clf.coef_)) + msg = ("Warm starting issue with %s solver in %s mode " + "with fit_intercept=%s and warm_start=%s" + % (solver, multi_class, str(fit_intercept), + str(warm_start))) + if warm_start: + assert_greater(2.0, cum_diff, msg) + else: + assert_greater(cum_diff, 2.0, msg) def test_saga_vs_liblinear(): diff --git a/sklearn/linear_model/tests/test_passive_aggressive.py b/sklearn/linear_model/tests/test_passive_aggressive.py index 5620c29e18374..ee519b7390c5b 100644 --- a/sklearn/linear_model/tests/test_passive_aggressive.py +++ b/sklearn/linear_model/tests/test_passive_aggressive.py @@ -2,6 +2,8 @@ import numpy as np import scipy.sparse as sp +import pytest + from sklearn.utils.testing import assert_less from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal, assert_array_equal @@ -111,23 +113,22 @@ def test_classifier_refit(): assert_array_equal(clf.classes_, iris.target_names) -def test_classifier_correctness(): +@pytest.mark.parametrize('loss', ("hinge", "squared_hinge")) +def test_classifier_correctness(loss): y_bin = y.copy() y_bin[y != 1] = -1 - for loss in ("hinge", "squared_hinge"): - - clf1 = MyPassiveAggressive( - C=1.0, loss=loss, fit_intercept=True, n_iter=2) - clf1.fit(X, y_bin) + clf1 = MyPassiveAggressive( + C=1.0, loss=loss, fit_intercept=True, 
n_iter=2) + clf1.fit(X, y_bin) - for data in (X, X_csr): - clf2 = PassiveAggressiveClassifier( - C=1.0, loss=loss, fit_intercept=True, max_iter=2, - shuffle=False, tol=None) - clf2.fit(data, y_bin) + for data in (X, X_csr): + clf2 = PassiveAggressiveClassifier( + C=1.0, loss=loss, fit_intercept=True, max_iter=2, + shuffle=False, tol=None) + clf2.fit(data, y_bin) - assert_array_almost_equal(clf1.w, clf2.coef_.ravel(), decimal=2) + assert_array_almost_equal(clf1.w, clf2.coef_.ravel(), decimal=2) def test_classifier_undefined_methods(): @@ -248,22 +249,24 @@ def test_regressor_partial_fit(): assert_true(hasattr(reg, 'standard_coef_')) -def test_regressor_correctness(): +@pytest.mark.parametrize( + 'loss', + ("epsilon_insensitive", "squared_epsilon_insensitive")) +def test_regressor_correctness(loss): y_bin = y.copy() y_bin[y != 1] = -1 - for loss in ("epsilon_insensitive", "squared_epsilon_insensitive"): - reg1 = MyPassiveAggressive( - C=1.0, loss=loss, fit_intercept=True, n_iter=2) - reg1.fit(X, y_bin) + reg1 = MyPassiveAggressive( + C=1.0, loss=loss, fit_intercept=True, n_iter=2) + reg1.fit(X, y_bin) - for data in (X, X_csr): - reg2 = PassiveAggressiveRegressor( - C=1.0, tol=None, loss=loss, fit_intercept=True, max_iter=2, - shuffle=False) - reg2.fit(data, y_bin) + for data in (X, X_csr): + reg2 = PassiveAggressiveRegressor( + C=1.0, tol=None, loss=loss, fit_intercept=True, max_iter=2, + shuffle=False) + reg2.fit(data, y_bin) - assert_array_almost_equal(reg1.w, reg2.coef_.ravel(), decimal=2) + assert_array_almost_equal(reg1.w, reg2.coef_.ravel(), decimal=2) def test_regressor_undefined_methods(): diff --git a/sklearn/linear_model/tests/test_ridge.py b/sklearn/linear_model/tests/test_ridge.py index a2f2a135b3ae4..2f574b88ba7b5 100644 --- a/sklearn/linear_model/tests/test_ridge.py +++ b/sklearn/linear_model/tests/test_ridge.py @@ -3,6 +3,8 @@ from scipy import linalg from itertools import product +import pytest + from sklearn.utils.testing import assert_true from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_almost_equal @@ -57,41 +59,42 @@ SPARSE_FILTER = lambda X: sp.csr_matrix(X) -def test_ridge(): +@pytest.mark.parametrize('solver', + ("svd", "sparse_cg", "cholesky", "lsqr", "sag")) +def test_ridge(solver): # Ridge regression convergence test using score # TODO: for this test to be robust, we should use a dataset instead # of np.random. rng = np.random.RandomState(0) alpha = 1.0 - for solver in ("svd", "sparse_cg", "cholesky", "lsqr", "sag"): - # With more samples than features - n_samples, n_features = 6, 5 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) + # With more samples than features + n_samples, n_features = 6, 5 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) - ridge = Ridge(alpha=alpha, solver=solver) - ridge.fit(X, y) - assert_equal(ridge.coef_.shape, (X.shape[1], )) - assert_greater(ridge.score(X, y), 0.47) + ridge = Ridge(alpha=alpha, solver=solver) + ridge.fit(X, y) + assert_equal(ridge.coef_.shape, (X.shape[1], )) + assert_greater(ridge.score(X, y), 0.47) - if solver in ("cholesky", "sag"): - # Currently the only solvers to support sample_weight. - ridge.fit(X, y, sample_weight=np.ones(n_samples)) - assert_greater(ridge.score(X, y), 0.47) + if solver in ("cholesky", "sag"): + # Currently the only solvers to support sample_weight. 
+ ridge.fit(X, y, sample_weight=np.ones(n_samples)) + assert_greater(ridge.score(X, y), 0.47) - # With more features than samples - n_samples, n_features = 5, 10 - y = rng.randn(n_samples) - X = rng.randn(n_samples, n_features) - ridge = Ridge(alpha=alpha, solver=solver) - ridge.fit(X, y) - assert_greater(ridge.score(X, y), .9) + # With more features than samples + n_samples, n_features = 5, 10 + y = rng.randn(n_samples) + X = rng.randn(n_samples, n_features) + ridge = Ridge(alpha=alpha, solver=solver) + ridge.fit(X, y) + assert_greater(ridge.score(X, y), .9) - if solver in ("cholesky", "sag"): - # Currently the only solvers to support sample_weight. - ridge.fit(X, y, sample_weight=np.ones(n_samples)) - assert_greater(ridge.score(X, y), 0.9) + if solver in ("cholesky", "sag"): + # Currently the only solvers to support sample_weight. + ridge.fit(X, y, sample_weight=np.ones(n_samples)) + assert_greater(ridge.score(X, y), 0.9) def test_primal_dual_relationship(): @@ -153,6 +156,8 @@ def test_ridge_regression_convergence_fail(): def test_ridge_sample_weights(): # TODO: loop over sparse data as well + # Note: parametrizing this test with pytest results in failed + # assertions, meaning that it is not extremely robust rng = np.random.RandomState(0) param_grid = product((1.0, 1e-2), (True, False), @@ -483,15 +488,13 @@ def check_dense_sparse(test_func): assert_array_almost_equal(ret_dense, ret_sparse, decimal=3) -def test_dense_sparse(): - for test_func in (_test_ridge_loo, - _test_ridge_cv, - _test_ridge_cv_normalize, - _test_ridge_diabetes, - _test_multi_ridge_diabetes, - _test_ridge_classifiers, - _test_tolerance): - yield check_dense_sparse, test_func +@pytest.mark.parametrize( + 'test_func', + (_test_ridge_loo, _test_ridge_cv, _test_ridge_cv_normalize, + _test_ridge_diabetes, _test_multi_ridge_diabetes, + _test_ridge_classifiers, _test_tolerance)) +def test_dense_sparse(test_func): + check_dense_sparse(test_func) def test_ridge_cv_sparse_svd(): @@ -543,33 +546,33 @@ def test_class_weights(): assert_array_almost_equal(reg.intercept_, rega.intercept_) -def test_class_weight_vs_sample_weight(): +@pytest.mark.parametrize('reg', (RidgeClassifier, RidgeClassifierCV)) +def test_class_weight_vs_sample_weight(reg): """Check class_weights resemble sample_weights behavior.""" - for reg in (RidgeClassifier, RidgeClassifierCV): - - # Iris is balanced, so no effect expected for using 'balanced' weights - reg1 = reg() - reg1.fit(iris.data, iris.target) - reg2 = reg(class_weight='balanced') - reg2.fit(iris.data, iris.target) - assert_almost_equal(reg1.coef_, reg2.coef_) - - # Inflate importance of class 1, check against user-defined weights - sample_weight = np.ones(iris.target.shape) - sample_weight[iris.target == 1] *= 100 - class_weight = {0: 1., 1: 100., 2: 1.} - reg1 = reg() - reg1.fit(iris.data, iris.target, sample_weight) - reg2 = reg(class_weight=class_weight) - reg2.fit(iris.data, iris.target) - assert_almost_equal(reg1.coef_, reg2.coef_) - - # Check that sample_weight and class_weight are multiplicative - reg1 = reg() - reg1.fit(iris.data, iris.target, sample_weight ** 2) - reg2 = reg(class_weight=class_weight) - reg2.fit(iris.data, iris.target, sample_weight) - assert_almost_equal(reg1.coef_, reg2.coef_) + + # Iris is balanced, so no effect expected for using 'balanced' weights + reg1 = reg() + reg1.fit(iris.data, iris.target) + reg2 = reg(class_weight='balanced') + reg2.fit(iris.data, iris.target) + assert_almost_equal(reg1.coef_, reg2.coef_) + + # Inflate importance of class 1, check against
user-defined weights + sample_weight = np.ones(iris.target.shape) + sample_weight[iris.target == 1] *= 100 + class_weight = {0: 1., 1: 100., 2: 1.} + reg1 = reg() + reg1.fit(iris.data, iris.target, sample_weight) + reg2 = reg(class_weight=class_weight) + reg2.fit(iris.data, iris.target) + assert_almost_equal(reg1.coef_, reg2.coef_) + + # Check that sample_weight and class_weight are multiplicative + reg1 = reg() + reg1.fit(iris.data, iris.target, sample_weight ** 2) + reg2 = reg(class_weight=class_weight) + reg2.fit(iris.data, iris.target, sample_weight) + assert_almost_equal(reg1.coef_, reg2.coef_) def test_class_weights_cv(): diff --git a/sklearn/linear_model/tests/test_sgd.py b/sklearn/linear_model/tests/test_sgd.py index 9f372f706ca71..18bc073139650 100644 --- a/sklearn/linear_model/tests/test_sgd.py +++ b/sklearn/linear_model/tests/test_sgd.py @@ -1174,16 +1174,16 @@ def test_numerical_stability_large_gradient(): assert_true(np.isfinite(model.coef_).all()) -def test_large_regularization(): +@pytest.mark.parametrize('penalty', ['l2', 'l1', 'elasticnet']) +def test_large_regularization(penalty): # Non regression tests for numerical stability issues caused by large # regularization parameters - for penalty in ['l2', 'l1', 'elasticnet']: - model = SGDClassifier(alpha=1e5, learning_rate='constant', eta0=0.1, - penalty=penalty, shuffle=False, - tol=None, max_iter=6) - with np.errstate(all='raise'): - model.fit(iris.data, iris.target) - assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_)) + model = SGDClassifier(alpha=1e5, learning_rate='constant', eta0=0.1, + penalty=penalty, shuffle=False, + tol=None, max_iter=6) + with np.errstate(all='raise'): + model.fit(iris.data, iris.target) + assert_array_almost_equal(model.coef_, np.zeros_like(model.coef_)) def test_tol_parameter(): diff --git a/sklearn/manifold/tests/test_t_sne.py b/sklearn/manifold/tests/test_t_sne.py index 6b1d87bb18bf5..cc692ae0d0cd0 100644 --- a/sklearn/manifold/tests/test_t_sne.py +++ b/sklearn/manifold/tests/test_t_sne.py @@ -3,6 +3,8 @@ import numpy as np import scipy.sparse as sp +import pytest + from sklearn.neighbors import BallTree from sklearn.neighbors import NearestNeighbors from sklearn.utils.testing import assert_less_equal @@ -596,35 +598,35 @@ def test_no_sparse_on_barnes_hut(): tsne.fit_transform, X_csr) -def test_64bit(): +@pytest.mark.parametrize('method', ['barnes_hut', 'exact']) +@pytest.mark.parametrize('dt', [np.float32, np.float64]) +def test_64bit(method, dt): # Ensure 64bit arrays are handled correctly. 
random_state = check_random_state(0) - methods = ['barnes_hut', 'exact'] - for method in methods: - for dt in [np.float32, np.float64]: - X = random_state.randn(50, 2).astype(dt) - tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, - random_state=0, method=method, verbose=0) - X_embedded = tsne.fit_transform(X) - effective_type = X_embedded.dtype - # tsne cython code is only single precision, so the output will - # always be single precision, irrespectively of the input dtype - assert effective_type == np.float32 + X = random_state.randn(50, 2).astype(dt) + tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, + random_state=0, method=method, verbose=0) + X_embedded = tsne.fit_transform(X) + effective_type = X_embedded.dtype + # tsne cython code is only single precision, so the output will + # always be single precision, irrespectively of the input dtype + assert effective_type == np.float32 -def test_kl_divergence_not_nan(): + +@pytest.mark.parametrize('method', ['barnes_hut', 'exact']) +def test_kl_divergence_not_nan(method): # Ensure kl_divergence_ is computed at last iteration # even though n_iter % n_iter_check != 0, i.e. 1003 % 50 != 0 random_state = check_random_state(0) - methods = ['barnes_hut', 'exact'] - for method in methods: - X = random_state.randn(50, 2) - tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, - random_state=0, method=method, verbose=0, n_iter=1003) - tsne.fit_transform(X) - assert not np.isnan(tsne.kl_divergence_) + X = random_state.randn(50, 2) + tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0, + random_state=0, method=method, verbose=0, n_iter=1003) + tsne.fit_transform(X) + + assert not np.isnan(tsne.kl_divergence_) def test_barnes_hut_angle(): @@ -807,9 +809,9 @@ def assert_uniform_grid(Y, try_name=None): assert_less(largest_to_mean, 2, msg=try_name) -def test_uniform_grid(): - for method in ['barnes_hut', 'exact']: - yield check_uniform_grid, method +@pytest.mark.parametrize('method', ['barnes_hut', 'exact']) +def test_uniform_grid(method): + check_uniform_grid(method) def test_bh_match_exact(): diff --git a/sklearn/metrics/cluster/tests/test_common.py b/sklearn/metrics/cluster/tests/test_common.py index 71534380fe6ec..a7e54d22cc7c8 100644 --- a/sklearn/metrics/cluster/tests/test_common.py +++ b/sklearn/metrics/cluster/tests/test_common.py @@ -101,10 +101,7 @@ def test_non_symmetry(metric_name, y1, y2): assert metric(y1, y2) != pytest.approx(metric(y2, y1)) -@pytest.mark.parametrize( - "metric_name", - [name for name in NORMALIZED_METRICS] -) +@pytest.mark.parametrize("metric_name", NORMALIZED_METRICS) def test_normalized_output(metric_name): upper_bound_1 = [0, 0, 0, 1, 1, 1] upper_bound_2 = [0, 0, 0, 1, 1, 1] @@ -126,7 +123,7 @@ def test_normalized_output(metric_name): # that is when 0 and 1 exchanged. 
@pytest.mark.parametrize( "metric_name", - [name for name in dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS)] + dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) ) def test_permute_labels(metric_name): y_label = np.array([0, 0, 0, 1, 1, 0, 1]) @@ -147,7 +144,7 @@ def test_permute_labels(metric_name): # For all clustering metrics Input parameters can be both @pytest.mark.parametrize( "metric_name", - [name for name in dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS)] + dict(SUPERVISED_METRICS, **UNSUPERVISED_METRICS) ) # in the form of arrays lists, positive, negetive or string def test_format_invariance(metric_name): diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py index 1705510cc1ea4..923f60994dac1 100644 --- a/sklearn/metrics/tests/test_classification.py +++ b/sklearn/metrics/tests/test_classification.py @@ -6,6 +6,8 @@ from itertools import product import warnings +import pytest + from sklearn import datasets from sklearn import svm @@ -520,7 +522,8 @@ def test_matthews_corrcoef_multiclass(): assert_almost_equal(mcc, 0.) -def test_matthews_corrcoef_overflow(): +@pytest.mark.parametrize('n_points', [100, 10000, 1000000]) +def test_matthews_corrcoef_overflow(n_points): # https://github.com/scikit-learn/scikit-learn/issues/9622 rng = np.random.RandomState(20170906) @@ -543,16 +546,15 @@ def random_ys(n_points): # binary y_pred = (x_pred > 0.5) return y_true, y_pred - for n_points in [100, 10000, 1000000]: - arr = np.repeat([0., 1.], n_points) # binary - assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) - arr = np.repeat([0., 1., 2.], n_points) # multiclass - assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) + arr = np.repeat([0., 1.], n_points) # binary + assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) + arr = np.repeat([0., 1., 2.], n_points) # multiclass + assert_almost_equal(matthews_corrcoef(arr, arr), 1.0) - y_true, y_pred = random_ys(n_points) - assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) - assert_almost_equal(matthews_corrcoef(y_true, y_pred), - mcc_safe(y_true, y_pred)) + y_true, y_pred = random_ys(n_points) + assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0) + assert_almost_equal(matthews_corrcoef(y_true, y_pred), + mcc_safe(y_true, y_pred)) def test_precision_recall_f1_score_multiclass(): @@ -610,18 +612,19 @@ def test_precision_recall_f1_score_multiclass(): assert_array_equal(s, [24, 20, 31]) -def test_precision_refcall_f1_score_multilabel_unordered_labels(): +@pytest.mark.parametrize('average', + ['samples', 'micro', 'macro', 'weighted', None]) +def test_precision_refcall_f1_score_multilabel_unordered_labels(average): # test that labels need not be sorted in the multilabel case y_true = np.array([[1, 1, 0, 0]]) y_pred = np.array([[0, 0, 1, 1]]) - for average in ['samples', 'micro', 'macro', 'weighted', None]: - p, r, f, s = precision_recall_fscore_support( - y_true, y_pred, labels=[3, 0, 1, 2], warn_for=[], average=average) - assert_array_equal(p, 0) - assert_array_equal(r, 0) - assert_array_equal(f, 0) - if average is None: - assert_array_equal(s, [0, 1, 1, 0]) + p, r, f, s = precision_recall_fscore_support( + y_true, y_pred, labels=[3, 0, 1, 2], warn_for=[], average=average) + assert_array_equal(p, 0) + assert_array_equal(r, 0) + assert_array_equal(f, 0) + if average is None: + assert_array_equal(s, [0, 1, 1, 0]) def test_precision_recall_f1_score_binary_averaged(): @@ -1207,10 +1210,33 @@ def test_precision_recall_f1_score_with_an_empty_prediction(): 0.333, 2) -def 
test_precision_recall_f1_no_labels(): +@pytest.mark.parametrize('beta', [1]) +@pytest.mark.parametrize('average', ["macro", "micro", "weighted", "samples"]) +def test_precision_recall_f1_no_labels(beta, average): + y_true = np.zeros((20, 3)) + y_pred = np.zeros_like(y_true) + + p, r, f, s = assert_warns(UndefinedMetricWarning, + precision_recall_fscore_support, + y_true, y_pred, average=average, + beta=beta) + assert_almost_equal(p, 0) + assert_almost_equal(r, 0) + assert_almost_equal(f, 0) + assert_equal(s, None) + + fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, + y_true, y_pred, + beta=beta, average=average) + assert_almost_equal(fbeta, 0) + + +def test_precision_recall_f1_no_labels_average_none(): y_true = np.zeros((20, 3)) y_pred = np.zeros_like(y_true) + beta = 1 + # tp = [0, 0, 0] # fn = [0, 0, 0] # fp = [0, 0, 0] @@ -1219,33 +1245,17 @@ def test_precision_recall_f1_no_labels(): # |y_i| = [0, 0, 0] # |y_hat_i| = [0, 0, 0] - for beta in [1]: - p, r, f, s = assert_warns(UndefinedMetricWarning, - precision_recall_fscore_support, - y_true, y_pred, average=None, beta=beta) - assert_array_almost_equal(p, [0, 0, 0], 2) - assert_array_almost_equal(r, [0, 0, 0], 2) - assert_array_almost_equal(f, [0, 0, 0], 2) - assert_array_almost_equal(s, [0, 0, 0], 2) - - fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, - y_true, y_pred, beta=beta, average=None) - assert_array_almost_equal(fbeta, [0, 0, 0], 2) - - for average in ["macro", "micro", "weighted", "samples"]: - p, r, f, s = assert_warns(UndefinedMetricWarning, - precision_recall_fscore_support, - y_true, y_pred, average=average, - beta=beta) - assert_almost_equal(p, 0) - assert_almost_equal(r, 0) - assert_almost_equal(f, 0) - assert_equal(s, None) - - fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, - y_true, y_pred, - beta=beta, average=average) - assert_almost_equal(fbeta, 0) + p, r, f, s = assert_warns(UndefinedMetricWarning, + precision_recall_fscore_support, + y_true, y_pred, average=None, beta=beta) + assert_array_almost_equal(p, [0, 0, 0], 2) + assert_array_almost_equal(r, [0, 0, 0], 2) + assert_array_almost_equal(f, [0, 0, 0], 2) + assert_array_almost_equal(s, [0, 0, 0], 2) + + fbeta = assert_warns(UndefinedMetricWarning, fbeta_score, + y_true, y_pred, beta=beta, average=None) + assert_array_almost_equal(fbeta, [0, 0, 0], 2) def test_prf_warnings(): diff --git a/sklearn/metrics/tests/test_common.py b/sklearn/metrics/tests/test_common.py index 680b78c3dd43d..f835fdd507761 100644 --- a/sklearn/metrics/tests/test_common.py +++ b/sklearn/metrics/tests/test_common.py @@ -2,10 +2,13 @@ from functools import partial from itertools import product +from itertools import chain import numpy as np import scipy.sparse as sp +import pytest + from sklearn.datasets import make_multilabel_classification from sklearn.preprocessing import LabelBinarizer from sklearn.utils.multiclass import type_of_target @@ -193,7 +196,7 @@ # is already written. 
# Those metrics don't support binary inputs -METRIC_UNDEFINED_BINARY = [ +METRIC_UNDEFINED_BINARY = { "samples_f0.5_score", "samples_f1_score", "samples_f2_score", @@ -209,10 +212,10 @@ "label_ranking_loss", "label_ranking_average_precision_score", -] +} # Those metrics don't support multiclass inputs -METRIC_UNDEFINED_MULTICLASS = [ +METRIC_UNDEFINED_MULTICLASS = { "brier_score_loss", "balanced_accuracy_score", @@ -229,24 +232,24 @@ "f1_score", "f2_score", "f0.5_score", -] +} # Metric undefined with "binary" or "multiclass" input -METRIC_UNDEFINED_BINARY_MULTICLASS = set(METRIC_UNDEFINED_BINARY).union( - set(METRIC_UNDEFINED_MULTICLASS)) +METRIC_UNDEFINED_BINARY_MULTICLASS = METRIC_UNDEFINED_BINARY.union( + METRIC_UNDEFINED_MULTICLASS) # Metrics with an "average" argument -METRICS_WITH_AVERAGING = [ +METRICS_WITH_AVERAGING = { "precision_score", "recall_score", "f1_score", "f2_score", "f0.5_score" -] +} # Threshold-based metrics with an "average" argument -THRESHOLDED_METRICS_WITH_AVERAGING = [ +THRESHOLDED_METRICS_WITH_AVERAGING = { "roc_auc_score", "average_precision_score", "partial_roc_auc", -] +} # Metrics with a "pos_label" argument -METRICS_WITH_POS_LABEL = [ +METRICS_WITH_POS_LABEL = { "roc_curve", "brier_score_loss", @@ -262,12 +265,12 @@ "macro_f0.5_score", "macro_f1_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", -] +} # Metrics with a "labels" argument # TODO: Handle multi_class metrics that has a labels argument as well as a # decision function argument. e.g hinge_loss -METRICS_WITH_LABELS = [ +METRICS_WITH_LABELS = { "confusion_matrix", "hamming_loss", @@ -284,17 +287,17 @@ "macro_precision_score", "macro_recall_score", "cohen_kappa_score", -] +} # Metrics with a "normalize" option -METRICS_WITH_NORMALIZE_OPTION = [ +METRICS_WITH_NORMALIZE_OPTION = { "accuracy_score", "jaccard_similarity_score", "zero_one_loss", -] +} # Threshold-based metrics with "multilabel-indicator" format support -THRESHOLDED_MULTILABEL_METRICS = [ +THRESHOLDED_MULTILABEL_METRICS = { "log_loss", "unnormalized_log_loss", @@ -307,10 +310,10 @@ "coverage_error", "label_ranking_loss", "label_ranking_average_precision_score", -] +} # Classification metrics with "multilabel-indicator" format -MULTILABELS_METRICS = [ +MULTILABELS_METRICS = { "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", "jaccard_similarity_score", "unnormalized_jaccard_similarity_score", @@ -327,17 +330,17 @@ "samples_f0.5_score", "samples_f1_score", "samples_f2_score", "samples_precision_score", "samples_recall_score", -] +} # Regression metrics with "multioutput-continuous" format support -MULTIOUTPUT_METRICS = [ +MULTIOUTPUT_METRICS = { "mean_absolute_error", "mean_squared_error", "r2_score", "explained_variance_score" -] +} # Symmetric with respect to their input arguments y_true and y_pred # metric(y_true, y_pred) == metric(y_pred, y_true). -SYMMETRIC_METRICS = [ +SYMMETRIC_METRICS = { "accuracy_score", "unnormalized_accuracy_score", "hamming_loss", "jaccard_similarity_score", "unnormalized_jaccard_similarity_score", @@ -353,11 +356,11 @@ "median_absolute_error", "cohen_kappa_score", -] +} # Asymmetric with respect to their input arguments y_true and y_pred # metric(y_true, y_pred) != metric(y_pred, y_true). 
-NOT_SYMMETRIC_METRICS = [ +NOT_SYMMETRIC_METRICS = { "balanced_accuracy_score", "explained_variance_score", "r2_score", @@ -370,18 +373,18 @@ "macro_f0.5_score", "macro_f2_score", "macro_precision_score", "macro_recall_score", "log_loss", "hinge_loss" -] +} # No Sample weight support -METRICS_WITHOUT_SAMPLE_WEIGHT = [ +METRICS_WITHOUT_SAMPLE_WEIGHT = { "confusion_matrix", # Left this one here because the tests in this file do # not work for confusion_matrix, as its output is a # matrix instead of a number. Testing of # confusion_matrix with sample_weight is in # test_classification.py "median_absolute_error", -] +} @ignore_warnings @@ -392,13 +395,13 @@ def test_symmetry(): y_pred = random_state.randint(0, 2, size=(20, )) # We shouldn't forget any metrics - assert_equal(set(SYMMETRIC_METRICS).union( - NOT_SYMMETRIC_METRICS, THRESHOLDED_METRICS, + assert_equal(SYMMETRIC_METRICS.union( + NOT_SYMMETRIC_METRICS, set(THRESHOLDED_METRICS), METRIC_UNDEFINED_BINARY_MULTICLASS), set(ALL_METRICS)) assert_equal( - set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS)), + SYMMETRIC_METRICS.intersection(NOT_SYMMETRIC_METRICS), set([])) # Symmetric metric @@ -415,17 +418,17 @@ def test_symmetry(): msg="%s seems to be symmetric" % name) -@ignore_warnings -def test_sample_order_invariance(): +@pytest.mark.parametrize( + 'name', + set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +def test_sample_order_invariance(name): random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(20, )) y_pred = random_state.randint(0, 2, size=(20, )) y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0) - for name, metric in ALL_METRICS.items(): - if name in METRIC_UNDEFINED_BINARY_MULTICLASS: - continue - + with ignore_warnings(): + metric = ALL_METRICS[name] assert_almost_equal(metric(y_true, y_pred), metric(y_true_shuffle, y_pred_shuffle), err_msg="%s is not sample order invariant" @@ -472,8 +475,10 @@ def test_sample_order_invariance_multilabel_and_multioutput(): % name) -@ignore_warnings -def test_format_invariance_with_1d_vectors(): +@pytest.mark.parametrize( + 'name', + set(ALL_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +def test_format_invariance_with_1d_vectors(name): random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20, )) y2 = random_state.randint(0, 2, size=(20, )) @@ -489,9 +494,8 @@ def test_format_invariance_with_1d_vectors(): y1_row = np.reshape(y1_1d, (1, -1)) y2_row = np.reshape(y2_1d, (1, -1)) - for name, metric in ALL_METRICS.items(): - if name in METRIC_UNDEFINED_BINARY_MULTICLASS: - continue + with ignore_warnings(): + metric = ALL_METRICS[name] measure = metric(y1, y2) @@ -546,14 +550,16 @@ def test_format_invariance_with_1d_vectors(): # NB: We do not test for y1_row, y2_row as these may be # interpreted as multilabel or multioutput data. 
- if (name not in (MULTIOUTPUT_METRICS + THRESHOLDED_MULTILABEL_METRICS + + if (name not in (MULTIOUTPUT_METRICS | THRESHOLDED_MULTILABEL_METRICS | MULTILABELS_METRICS)): assert_raises(ValueError, metric, y1_row, y2_row) -@ignore_warnings -def test_invariance_string_vs_numbers_labels(): - # Ensure that classification metrics with string labels +@pytest.mark.parametrize( + 'name', + set(CLASSIFICATION_METRICS) - METRIC_UNDEFINED_BINARY_MULTICLASS) +def test_classification_invariance_string_vs_numbers_labels(name): + # Ensure that classification metrics with string labels are invariant random_state = check_random_state(0) y1 = random_state.randint(0, 2, size=(20, )) y2 = random_state.randint(0, 2, size=(20, )) @@ -564,10 +570,8 @@ def test_invariance_string_vs_numbers_labels(): pos_label_str = "spam" labels_str = ["eggs", "spam"] - for name, metric in CLASSIFICATION_METRICS.items(): - if name in METRIC_UNDEFINED_BINARY_MULTICLASS: - continue - + with ignore_warnings(): + metric = CLASSIFICATION_METRICS[name] measure_with_number = metric(y1, y2) # Ugly, but handle case with a pos_label and label @@ -600,7 +604,20 @@ def test_invariance_string_vs_numbers_labels(): err_msg="{0} failed string vs number " "invariance test".format(name)) - for name, metric in THRESHOLDED_METRICS.items(): + +@pytest.mark.parametrize('name', THRESHOLDED_METRICS) +def test_thresholded_invariance_string_vs_numbers_labels(name): + # Ensure that thresholded metrics with string labels are invariant + random_state = check_random_state(0) + y1 = random_state.randint(0, 2, size=(20, )) + y2 = random_state.randint(0, 2, size=(20, )) + + y1_str = np.array(["eggs", "spam"])[y1] + + pos_label_str = "spam" + + with ignore_warnings(): + metric = THRESHOLDED_METRICS[name] if name not in METRIC_UNDEFINED_BINARY: # Ugly, but handle case with a pos_label and label metric_str = metric @@ -623,28 +640,30 @@ def test_invariance_string_vs_numbers_labels(): assert_raises(ValueError, metric, y1_str.astype('O'), y2) -def test_inf_nan_input(): - invalids =[([0, 1], [np.inf, np.inf]), - ([0, 1], [np.nan, np.nan]), - ([0, 1], [np.nan, np.inf])] +invalids = [([0, 1], [np.inf, np.inf]), + ([0, 1], [np.nan, np.nan]), + ([0, 1], [np.nan, np.inf])] + + +@pytest.mark.parametrize( + 'metric', + chain(THRESHOLDED_METRICS.values(), REGRESSION_METRICS.values())) +def test_regression_thresholded_inf_nan_input(metric): - METRICS = dict() - METRICS.update(THRESHOLDED_METRICS) - METRICS.update(REGRESSION_METRICS) + for y_true, y_score in invalids: + assert_raise_message(ValueError, + "contains NaN, infinity", + metric, y_true, y_score) - for metric in METRICS.values(): - for y_true, y_score in invalids: - assert_raise_message(ValueError, - "contains NaN, infinity", - metric, y_true, y_score) +@pytest.mark.parametrize('metric', CLASSIFICATION_METRICS.values()) +def test_classification_inf_nan_input(metric): # Classification metrics all raise a mixed input exception - for metric in CLASSIFICATION_METRICS.values(): - for y_true, y_score in invalids: - assert_raise_message(ValueError, - "Classification metrics can't handle a mix " - "of binary and continuous targets", - metric, y_true, y_score) + for y_true, y_score in invalids: + assert_raise_message(ValueError, + "Classification metrics can't handle a mix " + "of binary and continuous targets", + metric, y_true, y_score) @ignore_warnings @@ -667,45 +686,47 @@ def check_single_sample_multioutput(name): metric(np.array([[i, j]]), np.array([[k, l]])) -def test_single_sample(): - for name in ALL_METRICS: - if 
(name in METRIC_UNDEFINED_BINARY_MULTICLASS or - name in THRESHOLDED_METRICS): - # Those metrics are not always defined with one sample - # or in multiclass classification - continue +@pytest.mark.parametrize( + 'name', + (set(ALL_METRICS) + # Those metrics are not always defined with one sample + # or in multiclass classification + - METRIC_UNDEFINED_BINARY_MULTICLASS + - set(THRESHOLDED_METRICS))) +def test_single_sample(name): + check_single_sample(name) - yield check_single_sample, name - for name in MULTIOUTPUT_METRICS + MULTILABELS_METRICS: - yield check_single_sample_multioutput, name +@pytest.mark.parametrize('name', MULTIOUTPUT_METRICS | MULTILABELS_METRICS) +def test_single_sample_multioutput(name): + check_single_sample_multioutput(name) -def test_multioutput_number_of_output_differ(): +@pytest.mark.parametrize('name', MULTIOUTPUT_METRICS) +def test_multioutput_number_of_output_differ(name): y_true = np.array([[1, 0, 0, 1], [0, 1, 1, 1], [1, 1, 0, 1]]) y_pred = np.array([[0, 0], [1, 0], [0, 0]]) - for name in MULTIOUTPUT_METRICS: - metric = ALL_METRICS[name] - assert_raises(ValueError, metric, y_true, y_pred) + metric = ALL_METRICS[name] + assert_raises(ValueError, metric, y_true, y_pred) -def test_multioutput_regression_invariance_to_dimension_shuffling(): +@pytest.mark.parametrize('name', MULTIOUTPUT_METRICS) +def test_multioutput_regression_invariance_to_dimension_shuffling(name): # test invariance to dimension shuffling random_state = check_random_state(0) y_true = random_state.uniform(0, 2, size=(20, 5)) y_pred = random_state.uniform(0, 2, size=(20, 5)) - for name in MULTIOUTPUT_METRICS: - metric = ALL_METRICS[name] - error = metric(y_true, y_pred) + metric = ALL_METRICS[name] + error = metric(y_true, y_pred) - for _ in range(3): - perm = random_state.permutation(y_true.shape[1]) - assert_almost_equal(metric(y_true[:, perm], y_pred[:, perm]), - error, - err_msg="%s is not dimension shuffling " - "invariant" % name) + for _ in range(3): + perm = random_state.permutation(y_true.shape[1]) + assert_almost_equal(metric(y_true[:, perm], y_pred[:, perm]), + error, + err_msg="%s is not dimension shuffling " + "invariant" % name) @ignore_warnings @@ -747,7 +768,8 @@ def test_multilabel_representation_invariance(): "formats." 
% name) -def test_raise_value_error_multilabel_sequences(): +@pytest.mark.parametrize('name', MULTILABELS_METRICS) +def test_raise_value_error_multilabel_sequences(name): # make sure the multilabel-sequence format raises ValueError multilabel_sequences = [ [[0, 1]], @@ -757,41 +779,41 @@ def test_raise_value_error_multilabel_sequences(): [()], np.array([[], [1, 2]], dtype='object')] - for name in MULTILABELS_METRICS: - metric = ALL_METRICS[name] - for seq in multilabel_sequences: - assert_raises(ValueError, metric, seq, seq) + metric = ALL_METRICS[name] + for seq in multilabel_sequences: + assert_raises(ValueError, metric, seq, seq) -def test_normalize_option_binary_classification(n_samples=20): +@pytest.mark.parametrize('name', METRICS_WITH_NORMALIZE_OPTION) +def test_normalize_option_binary_classification(name): # Test in the binary case + n_samples = 20 random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(n_samples, )) y_pred = random_state.randint(0, 2, size=(n_samples, )) - for name in METRICS_WITH_NORMALIZE_OPTION: - metrics = ALL_METRICS[name] - measure = metrics(y_true, y_pred, normalize=True) - assert_greater(measure, 0, - msg="We failed to test correctly the normalize option") - assert_almost_equal(metrics(y_true, y_pred, normalize=False) - / n_samples, measure) + metrics = ALL_METRICS[name] + measure = metrics(y_true, y_pred, normalize=True) + assert_greater(measure, 0, + msg="We failed to test correctly the normalize option") + assert_almost_equal(metrics(y_true, y_pred, normalize=False) + / n_samples, measure) -def test_normalize_option_multiclass_classification(): +@pytest.mark.parametrize('name', METRICS_WITH_NORMALIZE_OPTION) +def test_normalize_option_multiclass_classification(name): # Test in the multiclass case random_state = check_random_state(0) y_true = random_state.randint(0, 4, size=(20, )) y_pred = random_state.randint(0, 4, size=(20, )) n_samples = y_true.shape[0] - for name in METRICS_WITH_NORMALIZE_OPTION: - metrics = ALL_METRICS[name] - measure = metrics(y_true, y_pred, normalize=True) - assert_greater(measure, 0, - msg="We failed to test correctly the normalize option") - assert_almost_equal(metrics(y_true, y_pred, normalize=False) - / n_samples, measure) + metrics = ALL_METRICS[name] + measure = metrics(y_true, y_pred, normalize=True) + assert_greater(measure, 0, + msg="We failed to test correctly the normalize option") + assert_almost_equal(metrics(y_true, y_pred, normalize=False) + / n_samples, measure) def test_normalize_option_multilabel_classification(): @@ -886,7 +908,9 @@ def check_averaging(name, y_true, y_true_binarize, y_pred, y_pred_binarize, raise ValueError("Metric is not recorded as having an average option") -def test_averaging_multiclass(n_samples=50, n_classes=3): +@pytest.mark.parametrize('name', METRICS_WITH_AVERAGING) +def test_averaging_multiclass(name): + n_samples, n_classes = 50, 3 random_state = check_random_state(0) y_true = random_state.randint(0, n_classes, size=(n_samples, )) y_pred = random_state.randint(0, n_classes, size=(n_samples, )) @@ -896,12 +920,14 @@ def test_averaging_multiclass(n_samples=50, n_classes=3): y_true_binarize = lb.transform(y_true) y_pred_binarize = lb.transform(y_pred) - for name in METRICS_WITH_AVERAGING: - yield (check_averaging, name, y_true, y_true_binarize, - y_pred, y_pred_binarize, y_score) + check_averaging(name, y_true, y_true_binarize, + y_pred, y_pred_binarize, y_score) -def test_averaging_multilabel(n_classes=5, n_samples=40): +@pytest.mark.parametrize( + 'name', 
METRICS_WITH_AVERAGING | THRESHOLDED_METRICS_WITH_AVERAGING) +def test_averaging_multilabel(name): + n_samples, n_classes = 40, 5 _, y = make_multilabel_classification(n_features=1, n_classes=n_classes, random_state=5, n_samples=n_samples, allow_unlabeled=False) @@ -911,22 +937,27 @@ def test_averaging_multilabel(n_classes=5, n_samples=40): y_true_binarize = y_true y_pred_binarize = y_pred - for name in METRICS_WITH_AVERAGING + THRESHOLDED_METRICS_WITH_AVERAGING: - yield (check_averaging, name, y_true, y_true_binarize, - y_pred, y_pred_binarize, y_score) + check_averaging(name, y_true, y_true_binarize, + y_pred, y_pred_binarize, y_score) -def test_averaging_multilabel_all_zeroes(): +@pytest.mark.parametrize('name', METRICS_WITH_AVERAGING) +def test_averaging_multilabel_all_zeroes(name): y_true = np.zeros((20, 3)) y_pred = np.zeros((20, 3)) y_score = np.zeros((20, 3)) y_true_binarize = y_true y_pred_binarize = y_pred - for name in METRICS_WITH_AVERAGING: - yield (check_averaging, name, y_true, y_true_binarize, - y_pred, y_pred_binarize, y_score) + check_averaging(name, y_true, y_true_binarize, + y_pred, y_pred_binarize, y_score) + +def test_averaging_binary_multilabel_all_zeroes(): + y_true = np.zeros((20, 3)) + y_pred = np.zeros((20, 3)) + y_true_binarize = y_true + y_pred_binarize = y_pred # Test _average_binary_score for weight.sum() == 0 binary_metric = (lambda y_true, y_score, average="macro": _average_binary_score( @@ -935,16 +966,16 @@ def test_averaging_multilabel_all_zeroes(): y_pred_binarize, is_multilabel=True) -def test_averaging_multilabel_all_ones(): +@pytest.mark.parametrize('name', METRICS_WITH_AVERAGING) +def test_averaging_multilabel_all_ones(name): y_true = np.ones((20, 3)) y_pred = np.ones((20, 3)) y_score = np.ones((20, 3)) y_true_binarize = y_true y_pred_binarize = y_pred - for name in METRICS_WITH_AVERAGING: - yield (check_averaging, name, y_true, y_true_binarize, - y_pred, y_pred_binarize, y_score) + check_averaging(name, y_true, y_true_binarize, + y_pred, y_pred_binarize, y_score) @ignore_warnings @@ -1022,54 +1053,64 @@ def check_sample_weight_invariance(name, metric, y1, y2): sample_weight])) -def test_sample_weight_invariance(n_samples=50): +@pytest.mark.parametrize( + 'name', + (set(ALL_METRICS).intersection(set(REGRESSION_METRICS)) + - METRICS_WITHOUT_SAMPLE_WEIGHT)) +def test_regression_sample_weight_invariance(name): + n_samples = 50 random_state = check_random_state(0) # regression y_true = random_state.random_sample(size=(n_samples,)) y_pred = random_state.random_sample(size=(n_samples,)) - for name in ALL_METRICS: - if name not in REGRESSION_METRICS: - continue - if name in METRICS_WITHOUT_SAMPLE_WEIGHT: - continue - metric = ALL_METRICS[name] - yield check_sample_weight_invariance, name, metric, y_true, y_pred + metric = ALL_METRICS[name] + check_sample_weight_invariance(name, metric, y_true, y_pred) + +@pytest.mark.parametrize( + 'name', + (set(ALL_METRICS) - set(REGRESSION_METRICS) + - METRICS_WITHOUT_SAMPLE_WEIGHT - METRIC_UNDEFINED_BINARY)) +def test_binary_sample_weight_invariance(name): # binary + n_samples = 50 random_state = check_random_state(0) y_true = random_state.randint(0, 2, size=(n_samples, )) y_pred = random_state.randint(0, 2, size=(n_samples, )) y_score = random_state.random_sample(size=(n_samples,)) - for name in ALL_METRICS: - if name in REGRESSION_METRICS: - continue - if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or - name in METRIC_UNDEFINED_BINARY): - continue - metric = ALL_METRICS[name] - if name in THRESHOLDED_METRICS: - yield 
check_sample_weight_invariance, name, metric, y_true, y_score - else: - yield check_sample_weight_invariance, name, metric, y_true, y_pred + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + +@pytest.mark.parametrize( + 'name', + (set(ALL_METRICS) - set(REGRESSION_METRICS) + - METRICS_WITHOUT_SAMPLE_WEIGHT + - METRIC_UNDEFINED_BINARY_MULTICLASS)) +def test_multiclass_sample_weight_invariance(name): # multiclass + n_samples = 50 random_state = check_random_state(0) y_true = random_state.randint(0, 5, size=(n_samples, )) y_pred = random_state.randint(0, 5, size=(n_samples, )) y_score = random_state.random_sample(size=(n_samples, 5)) - for name in ALL_METRICS: - if name in REGRESSION_METRICS: - continue - if (name in METRICS_WITHOUT_SAMPLE_WEIGHT or - name in METRIC_UNDEFINED_BINARY_MULTICLASS): - continue - metric = ALL_METRICS[name] - if name in THRESHOLDED_METRICS: - yield check_sample_weight_invariance, name, metric, y_true, y_score - else: - yield check_sample_weight_invariance, name, metric, y_true, y_pred + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) + +@pytest.mark.parametrize( + 'name', + (MULTILABELS_METRICS | THRESHOLDED_MULTILABEL_METRICS | + MULTIOUTPUT_METRICS) - METRICS_WITHOUT_SAMPLE_WEIGHT) +def test_multilabel_sample_weight_invariance(name): # multilabel indicator + random_state = check_random_state(0) _, ya = make_multilabel_classification(n_features=1, n_classes=20, random_state=0, n_samples=100, allow_unlabeled=False) @@ -1080,18 +1121,11 @@ def test_sample_weight_invariance(n_samples=50): y_pred = np.vstack([ya, ya]) y_score = random_state.randint(1, 4, size=y_true.shape) - for name in (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS + - MULTIOUTPUT_METRICS): - if name in METRICS_WITHOUT_SAMPLE_WEIGHT: - continue - - metric = ALL_METRICS[name] - if name in THRESHOLDED_METRICS: - yield (check_sample_weight_invariance, name, metric, - y_true, y_score) - else: - yield (check_sample_weight_invariance, name, metric, - y_true, y_pred) + metric = ALL_METRICS[name] + if name in THRESHOLDED_METRICS: + check_sample_weight_invariance(name, metric, y_true, y_score) + else: + check_sample_weight_invariance(name, metric, y_true, y_pred) @ignore_warnings diff --git a/sklearn/metrics/tests/test_pairwise.py b/sklearn/metrics/tests/test_pairwise.py index 0ef089c7a3619..e63219a817bed 100644 --- a/sklearn/metrics/tests/test_pairwise.py +++ b/sklearn/metrics/tests/test_pairwise.py @@ -2,11 +2,12 @@ import numpy as np from numpy import linalg -import pytest from scipy.sparse import dok_matrix, csr_matrix, issparse from scipy.spatial.distance import cosine, cityblock, minkowski, wminkowski +import pytest + from sklearn.utils.testing import assert_greater from sklearn.utils.testing import assert_array_almost_equal from sklearn.utils.testing import assert_allclose @@ -129,52 +130,52 @@ def test_pairwise_distances(): assert_raises(ValueError, pairwise_distances, X, Y, metric="blah") -# ignore conversion to boolean in pairwise_distances -@ignore_warnings(category=DataConversionWarning) -def test_pairwise_boolean_distance(): +@pytest.mark.parametrize('metric', PAIRWISE_BOOLEAN_FUNCTIONS) +def test_pairwise_boolean_distance(metric): # test that we convert to boolean arrays for boolean distances rng = 
np.random.RandomState(0) X = rng.randn(5, 4) Y = X.copy() Y[0, 0] = 1 - Y[0, 0] - for metric in PAIRWISE_BOOLEAN_FUNCTIONS: + # ignore conversion to boolean in pairwise_distances + with ignore_warnings(category=DataConversionWarning): for Z in [Y, None]: res = pairwise_distances(X, Z, metric=metric) res[np.isnan(res)] = 0 assert_true(np.sum(res != 0) == 0) -def test_pairwise_precomputed(): - for func in [pairwise_distances, pairwise_kernels]: - # Test correct shape - assert_raises_regexp(ValueError, '.* shape .*', - func, np.zeros((5, 3)), metric='precomputed') - # with two args - assert_raises_regexp(ValueError, '.* shape .*', - func, np.zeros((5, 3)), np.zeros((4, 4)), - metric='precomputed') - # even if shape[1] agrees (although thus second arg is spurious) - assert_raises_regexp(ValueError, '.* shape .*', - func, np.zeros((5, 3)), np.zeros((4, 3)), - metric='precomputed') - - # Test not copied (if appropriate dtype) - S = np.zeros((5, 5)) - S2 = func(S, metric="precomputed") - assert_true(S is S2) - # with two args - S = np.zeros((5, 3)) - S2 = func(S, np.zeros((3, 3)), metric="precomputed") - assert_true(S is S2) - - # Test always returns float dtype - S = func(np.array([[1]], dtype='int'), metric='precomputed') - assert_equal('f', S.dtype.kind) - - # Test converts list to array-like - S = func([[1.]], metric='precomputed') - assert_true(isinstance(S, np.ndarray)) +@pytest.mark.parametrize('func', [pairwise_distances, pairwise_kernels]) +def test_pairwise_precomputed(func): + # Test correct shape + assert_raises_regexp(ValueError, '.* shape .*', + func, np.zeros((5, 3)), metric='precomputed') + # with two args + assert_raises_regexp(ValueError, '.* shape .*', + func, np.zeros((5, 3)), np.zeros((4, 4)), + metric='precomputed') + # even if shape[1] agrees (although thus second arg is spurious) + assert_raises_regexp(ValueError, '.* shape .*', + func, np.zeros((5, 3)), np.zeros((4, 3)), + metric='precomputed') + + # Test not copied (if appropriate dtype) + S = np.zeros((5, 5)) + S2 = func(S, metric="precomputed") + assert_true(S is S2) + # with two args + S = np.zeros((5, 3)) + S2 = func(S, np.zeros((3, 3)), metric="precomputed") + assert_true(S is S2) + + # Test always returns float dtype + S = func(np.array([[1]], dtype='int'), metric='precomputed') + assert_equal('f', S.dtype.kind) + + # Test converts list to array-like + S = func([[1.]], metric='precomputed') + assert_true(isinstance(S, np.ndarray)) def check_pairwise_parallel(func, metric, kwds): @@ -202,16 +203,24 @@ def check_pairwise_parallel(func, metric, kwds): assert_array_almost_equal(S, S2) -def test_pairwise_parallel(): - wminkowski_kwds = {'w': np.arange(1, 5).astype('double'), 'p': 1} - metrics = [(pairwise_distances, 'euclidean', {}), - (pairwise_distances, wminkowski, wminkowski_kwds), - (pairwise_distances, 'wminkowski', wminkowski_kwds), - (pairwise_kernels, 'polynomial', {'degree': 1}), - (pairwise_kernels, callable_rbf_kernel, {'gamma': .1}), - ] - for func, metric, kwds in metrics: - yield check_pairwise_parallel, func, metric, kwds +_wminkowski_kwds = {'w': np.arange(1, 5).astype('double'), 'p': 1} + + +def callable_rbf_kernel(x, y, **kwds): + # Callable version of pairwise.rbf_kernel. 
+ K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds) + return K + + +@pytest.mark.parametrize( + 'func, metric, kwds', + [(pairwise_distances, 'euclidean', {}), + (pairwise_distances, wminkowski, _wminkowski_kwds), + (pairwise_distances, 'wminkowski', _wminkowski_kwds), + (pairwise_kernels, 'polynomial', {'degree': 1}), + (pairwise_kernels, callable_rbf_kernel, {'gamma': .1})]) +def test_pairwise_parallel(func, metric, kwds): + check_pairwise_parallel(func, metric, kwds) def test_pairwise_callable_nonstrict_metric(): @@ -221,47 +230,51 @@ def test_pairwise_callable_nonstrict_metric(): assert_equal(pairwise_distances([[1.]], metric=lambda x, y: 5)[0, 0], 5) -def callable_rbf_kernel(x, y, **kwds): - # Callable version of pairwise.rbf_kernel. - K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds) - return K +# Test with all metrics that should be in PAIRWISE_KERNEL_FUNCTIONS. +@pytest.mark.parametrize( + 'metric', + ["rbf", "laplacian", "sigmoid", "polynomial", "linear", + "chi2", "additive_chi2"]) +def test_pairwise_kernels(metric): + # Test the pairwise_kernels helper function. + + rng = np.random.RandomState(0) + X = rng.random_sample((5, 4)) + Y = rng.random_sample((2, 4)) + function = PAIRWISE_KERNEL_FUNCTIONS[metric] + # Test with Y=None + K1 = pairwise_kernels(X, metric=metric) + K2 = function(X) + assert_array_almost_equal(K1, K2) + # Test with Y=Y + K1 = pairwise_kernels(X, Y=Y, metric=metric) + K2 = function(X, Y=Y) + assert_array_almost_equal(K1, K2) + # Test with tuples as X and Y + X_tuples = tuple([tuple([v for v in row]) for row in X]) + Y_tuples = tuple([tuple([v for v in row]) for row in Y]) + K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric) + assert_array_almost_equal(K1, K2) + # Test with sparse X and Y + X_sparse = csr_matrix(X) + Y_sparse = csr_matrix(Y) + if metric in ["chi2", "additive_chi2"]: + # these don't support sparse matrices yet + assert_raises(ValueError, pairwise_kernels, + X_sparse, Y=Y_sparse, metric=metric) + return + K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric) + assert_array_almost_equal(K1, K2) -def test_pairwise_kernels(): # Test the pairwise_kernels helper function. +def test_pairwise_kernels_callable(): + # Test the pairwise_kernels helper function + # with a callable function, with given keywords. rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) Y = rng.random_sample((2, 4)) - # Test with all metrics that should be in PAIRWISE_KERNEL_FUNCTIONS. 
- test_metrics = ["rbf", "laplacian", "sigmoid", "polynomial", "linear", - "chi2", "additive_chi2"] - for metric in test_metrics: - function = PAIRWISE_KERNEL_FUNCTIONS[metric] - # Test with Y=None - K1 = pairwise_kernels(X, metric=metric) - K2 = function(X) - assert_array_almost_equal(K1, K2) - # Test with Y=Y - K1 = pairwise_kernels(X, Y=Y, metric=metric) - K2 = function(X, Y=Y) - assert_array_almost_equal(K1, K2) - # Test with tuples as X and Y - X_tuples = tuple([tuple([v for v in row]) for row in X]) - Y_tuples = tuple([tuple([v for v in row]) for row in Y]) - K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric) - assert_array_almost_equal(K1, K2) - # Test with sparse X and Y - X_sparse = csr_matrix(X) - Y_sparse = csr_matrix(Y) - if metric in ["chi2", "additive_chi2"]: - # these don't support sparse matrices yet - assert_raises(ValueError, pairwise_kernels, - X_sparse, Y=Y_sparse, metric=metric) - continue - K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric) - assert_array_almost_equal(K1, K2) - # Test with a callable function, with given keywords. metric = callable_rbf_kernel kwds = {'gamma': 0.1} K1 = pairwise_kernels(X, Y=Y, metric=metric, **kwds) @@ -286,27 +299,37 @@ def test_pairwise_kernels_filter_param(): assert_raises(TypeError, pairwise_kernels, X, Y, "rbf", **params) -def test_paired_distances(): +@pytest.mark.parametrize('metric, func', iteritems(PAIRED_DISTANCES)) +def test_paired_distances(metric, func): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) # Euclidean distance, with Y != X. Y = rng.random_sample((5, 4)) - for metric, func in iteritems(PAIRED_DISTANCES): - S = paired_distances(X, Y, metric=metric) - S2 = func(X, Y) - assert_array_almost_equal(S, S2) - S3 = func(csr_matrix(X), csr_matrix(Y)) - assert_array_almost_equal(S, S3) - if metric in PAIRWISE_DISTANCE_FUNCTIONS: - # Check the pairwise_distances implementation - # gives the same value - distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y) - distances = np.diag(distances) - assert_array_almost_equal(distances, S) - - # Check the callable implementation + + S = paired_distances(X, Y, metric=metric) + S2 = func(X, Y) + assert_array_almost_equal(S, S2) + S3 = func(csr_matrix(X), csr_matrix(Y)) + assert_array_almost_equal(S, S3) + if metric in PAIRWISE_DISTANCE_FUNCTIONS: + # Check the pairwise_distances implementation + # gives the same value + distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y) + distances = np.diag(distances) + assert_array_almost_equal(distances, S) + + +def test_paired_distances_callable(): + # Test the pairwise_distance helper function + # with the callable implementation + rng = np.random.RandomState(0) + # Euclidean distance should be equivalent to calling the function. + X = rng.random_sample((5, 4)) + # Euclidean distance, with Y != X. 
+ Y = rng.random_sample((5, 4)) + S = paired_distances(X, Y, metric='manhattan') S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0)) assert_array_almost_equal(S, S2) @@ -637,25 +660,29 @@ def test_chi_square_kernel(): csr_matrix(X), csr_matrix(Y)) -def test_kernel_symmetry(): +@pytest.mark.parametrize( + 'kernel', + (linear_kernel, polynomial_kernel, rbf_kernel, + laplacian_kernel, sigmoid_kernel, cosine_similarity)) +def test_kernel_symmetry(kernel): # Valid kernels should be symmetric rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) - for kernel in (linear_kernel, polynomial_kernel, rbf_kernel, - laplacian_kernel, sigmoid_kernel, cosine_similarity): - K = kernel(X, X) - assert_array_almost_equal(K, K.T, 15) + K = kernel(X, X) + assert_array_almost_equal(K, K.T, 15) -def test_kernel_sparse(): +@pytest.mark.parametrize( + 'kernel', + (linear_kernel, polynomial_kernel, rbf_kernel, + laplacian_kernel, sigmoid_kernel, cosine_similarity)) +def test_kernel_sparse(kernel): rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) X_sparse = csr_matrix(X) - for kernel in (linear_kernel, polynomial_kernel, rbf_kernel, - laplacian_kernel, sigmoid_kernel, cosine_similarity): - K = kernel(X, X) - K2 = kernel(X_sparse, X_sparse) - assert_array_almost_equal(K, K2) + K = kernel(X, X) + K2 = kernel(X_sparse, X_sparse) + assert_array_almost_equal(K, K2) def test_linear_kernel(): diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py index 07c35c609358d..28b79e9b8474c 100644 --- a/sklearn/metrics/tests/test_ranking.py +++ b/sklearn/metrics/tests/test_ranking.py @@ -2,7 +2,6 @@ import pytest import numpy as np -from itertools import product import warnings from scipy.sparse import csr_matrix @@ -177,19 +176,19 @@ def _partial_roc(y_true, y_predict, max_fpr): return 0.5 * (1 + (partial_auc - min_area) / (max_area - min_area)) -def test_roc_curve(): +@pytest.mark.parametrize('drop', [True, False]) +def test_roc_curve(drop): # Test Area under Receiver Operating Characteristic (ROC) curve y_true, _, probas_pred = make_prediction(binary=True) expected_auc = _auc(y_true, probas_pred) - for drop in [True, False]: - fpr, tpr, thresholds = roc_curve(y_true, probas_pred, - drop_intermediate=drop) - roc_auc = auc(fpr, tpr) - assert_array_almost_equal(roc_auc, expected_auc, decimal=2) - assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred)) - assert_equal(fpr.shape, tpr.shape) - assert_equal(fpr.shape, thresholds.shape) + fpr, tpr, thresholds = roc_curve(y_true, probas_pred, + drop_intermediate=drop) + roc_auc = auc(fpr, tpr) + assert_array_almost_equal(roc_auc, expected_auc, decimal=2) + assert_almost_equal(roc_auc, roc_auc_score(y_true, probas_pred)) + assert_equal(fpr.shape, tpr.shape) + assert_equal(fpr.shape, thresholds.shape) def test_roc_curve_end_points(): @@ -923,18 +922,29 @@ def check_alternative_lrap_implementation(lrap_score, n_classes=5, assert_almost_equal(score_lrap, score_my_lrap) -def test_label_ranking_avp(): - for fn in [label_ranking_average_precision_score, _my_lrap]: - yield check_lrap_toy, fn - yield check_lrap_without_tie_and_increasing_score, fn - yield check_lrap_only_ties, fn - yield check_zero_or_all_relevant_labels, fn - yield check_lrap_error_raised, label_ranking_average_precision_score +@pytest.mark.parametrize( + 'check', + (check_lrap_toy, + check_lrap_without_tie_and_increasing_score, + check_lrap_only_ties, + check_zero_or_all_relevant_labels)) +@pytest.mark.parametrize( + 'func', + 
(label_ranking_average_precision_score, _my_lrap)) +def test_label_ranking_avp(check, func): + check(func) + + +def test_lrap_error_raised(): + check_lrap_error_raised(label_ranking_average_precision_score) + + +@pytest.mark.parametrize('n_samples', (1, 2, 8, 20)) +@pytest.mark.parametrize('n_classes', (2, 5, 10)) +@pytest.mark.parametrize('random_state', range(1)) +def test_alternative_lrap_implementation(n_samples, n_classes, random_state): - for n_samples, n_classes, random_state in product((1, 2, 8, 20), - (2, 5, 10), - range(1)): - yield (check_alternative_lrap_implementation, + check_alternative_lrap_implementation( label_ranking_average_precision_score, n_classes, n_samples, random_state) diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py index 6af6418635d59..8bb3c3c137dcc 100644 --- a/sklearn/metrics/tests/test_score_objects.py +++ b/sklearn/metrics/tests/test_score_objects.py @@ -6,6 +6,8 @@ import numpy as np +import pytest + from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal from sklearn.utils.testing import assert_equal @@ -491,12 +493,12 @@ def check_scorer_memmap(scorer_name): assert isinstance(score, numbers.Number), scorer_name -def test_scorer_memmap_input(): +@pytest.mark.parametrize('name', SCORERS) +def test_scorer_memmap_input(name): # Non-regression test for #6147: some score functions would # return singleton memmap when computed on memmap data instead of scalar # float values. - for name in SCORERS.keys(): - yield check_scorer_memmap, name + check_scorer_memmap(name) def test_deprecated_names(): diff --git a/sklearn/mixture/tests/test_gmm.py b/sklearn/mixture/tests/test_gmm.py index 137703adfcad4..134c0493cf558 100644 --- a/sklearn/mixture/tests/test_gmm.py +++ b/sklearn/mixture/tests/test_gmm.py @@ -8,6 +8,8 @@ import copy import sys +import pytest + import numpy as np from numpy.testing import assert_array_equal, assert_array_almost_equal @@ -160,7 +162,6 @@ def test_GMM_attributes(): assert_raises(ValueError, g._set_covars, []) assert_raises(ValueError, g._set_covars, np.zeros((n_components - 2, n_features))) - assert_raises(ValueError, mixture.GMM, n_components=20, covariance_type='badcovariance_type') @@ -496,10 +497,11 @@ def check_positive_definite_covars(covariance_type): assert_greater(np.linalg.det(c), 0) -def test_positive_definite_covars(): +@pytest.mark.parametrize('covariance_type', + ["full", "tied", "diag", "spherical"]) +def test_positive_definite_covars(covariance_type): # Check positive definiteness for all covariance types - for covariance_type in ["full", "tied", "diag", "spherical"]: - yield check_positive_definite_covars, covariance_type + check_positive_definite_covars(covariance_type) # This function tests the deprecated old GMM class diff --git a/sklearn/model_selection/tests/test_validation.py b/sklearn/model_selection/tests/test_validation.py index 2929916619769..a537b9f53518a 100644 --- a/sklearn/model_selection/tests/test_validation.py +++ b/sklearn/model_selection/tests/test_validation.py @@ -387,8 +387,8 @@ def test_cross_validate(): scores = (train_mse_scores, test_mse_scores, train_r2_scores, test_r2_scores, fitted_estimators) - yield check_cross_validate_single_metric, est, X, y, scores - yield check_cross_validate_multi_metric, est, X, y, scores + check_cross_validate_single_metric(est, X, y, scores) + check_cross_validate_multi_metric(est, X, y, scores) def test_cross_validate_return_train_score_warn(): diff --git 
a/sklearn/neighbors/tests/test_ball_tree.py b/sklearn/neighbors/tests/test_ball_tree.py index a91e4ac4edd27..de0d166fb8891 100644 --- a/sklearn/neighbors/tests/test_ball_tree.py +++ b/sklearn/neighbors/tests/test_ball_tree.py @@ -1,12 +1,15 @@ import pickle +import itertools + import numpy as np +import pytest from numpy.testing import assert_array_almost_equal from sklearn.neighbors.ball_tree import (BallTree, NeighborsHeap, simultaneous_sort, kernel_norm, nodeheap_sort, DTYPE, ITYPE) from sklearn.neighbors.dist_metrics import DistanceMetric from sklearn.utils import check_random_state -from sklearn.utils.testing import SkipTest, assert_allclose +from sklearn.utils.testing import assert_allclose rng = np.random.RandomState(10) V_mahalanobis = rng.rand(3, 3) @@ -42,60 +45,44 @@ def brute_force_neighbors(X, Y, k, metric, **kwargs): return dist, ind -def test_ball_tree_query(): +@pytest.mark.parametrize('metric', METRICS) +@pytest.mark.parametrize('k', (1, 3, 5)) +@pytest.mark.parametrize('dualtree', (True, False)) +@pytest.mark.parametrize('breadth_first', (True, False)) +def test_ball_tree_query(metric, k, dualtree, breadth_first): rng = check_random_state(0) X = rng.random_sample((40, DIMENSION)) Y = rng.random_sample((10, DIMENSION)) - def check_neighbors(dualtree, breadth_first, k, metric, kwargs): - bt = BallTree(X, leaf_size=1, metric=metric, **kwargs) - dist1, ind1 = bt.query(Y, k, dualtree=dualtree, - breadth_first=breadth_first) - dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs) + kwargs = METRICS[metric] - # don't check indices here: if there are any duplicate distances, - # the indices may not match. Distances should not have this problem. - assert_array_almost_equal(dist1, dist2) + bt = BallTree(X, leaf_size=1, metric=metric, **kwargs) + dist1, ind1 = bt.query(Y, k, dualtree=dualtree, + breadth_first=breadth_first) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs) - for (metric, kwargs) in METRICS.items(): - for k in (1, 3, 5): - for dualtree in (True, False): - for breadth_first in (True, False): - yield (check_neighbors, - dualtree, breadth_first, - k, metric, kwargs) + # don't check indices here: if there are any duplicate distances, + # the indices may not match. Distances should not have this problem. 
+ assert_array_almost_equal(dist1, dist2) -def test_ball_tree_query_boolean_metrics(): +@pytest.mark.parametrize('metric', + itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS)) +def test_ball_tree_query_metrics(metric): rng = check_random_state(0) - X = rng.random_sample((40, 10)).round(0) - Y = rng.random_sample((10, 10)).round(0) - k = 5 - - def check_neighbors(metric): - bt = BallTree(X, leaf_size=1, metric=metric) - dist1, ind1 = bt.query(Y, k) - dist2, ind2 = brute_force_neighbors(X, Y, k, metric) - assert_array_almost_equal(dist1, dist2) + if metric in BOOLEAN_METRICS: + X = rng.random_sample((40, 10)).round(0) + Y = rng.random_sample((10, 10)).round(0) + elif metric in DISCRETE_METRICS: + X = (4 * rng.random_sample((40, 10))).round(0) + Y = (4 * rng.random_sample((10, 10))).round(0) - for metric in BOOLEAN_METRICS: - yield check_neighbors, metric - - -def test_ball_tree_query_discrete_metrics(): - rng = check_random_state(0) - X = (4 * rng.random_sample((40, 10))).round(0) - Y = (4 * rng.random_sample((10, 10))).round(0) k = 5 - def check_neighbors(metric): - bt = BallTree(X, leaf_size=1, metric=metric) - dist1, ind1 = bt.query(Y, k) - dist2, ind2 = brute_force_neighbors(X, Y, k, metric) - assert_array_almost_equal(dist1, dist2) - - for metric in DISCRETE_METRICS: - yield check_neighbors, metric + bt = BallTree(X, leaf_size=1, metric=metric) + dist1, ind1 = bt.query(Y, k) + dist2, ind2 = brute_force_neighbors(X, Y, k, metric) + assert_array_almost_equal(dist1, dist2) def test_ball_tree_query_radius(n_samples=100, n_features=10): @@ -157,7 +144,21 @@ def compute_kernel_slow(Y, X, kernel, h): raise ValueError('kernel not recognized') -def check_results(kernel, h, atol, rtol, breadth_first, bt, Y, dens_true): +@pytest.mark.parametrize("kernel", ['gaussian', 'tophat', 'epanechnikov', + 'exponential', 'linear', 'cosine']) +@pytest.mark.parametrize("h", [0.01, 0.1, 1]) +@pytest.mark.parametrize("rtol", [0, 1E-5]) +@pytest.mark.parametrize("atol", [1E-6, 1E-2]) +@pytest.mark.parametrize("breadth_first", [True, False]) +def test_ball_tree_kde(kernel, h, rtol, atol, breadth_first, n_samples=100, + n_features=3): + np.random.seed(0) + X = np.random.random((n_samples, n_features)) + Y = np.random.random((n_samples, n_features)) + bt = BallTree(X, leaf_size=10) + + dens_true = compute_kernel_slow(Y, X, kernel, h) + dens = bt.kernel_density(Y, h, atol=atol, rtol=rtol, kernel=kernel, breadth_first=breadth_first) @@ -165,24 +166,6 @@ def check_results(kernel, h, atol, rtol, breadth_first, bt, Y, dens_true): atol=atol, rtol=max(rtol, 1e-7)) -def test_ball_tree_kde(n_samples=100, n_features=3): - rng = check_random_state(0) - X = rng.random_sample((n_samples, n_features)) - Y = rng.random_sample((n_samples, n_features)) - bt = BallTree(X, leaf_size=10) - - for kernel in ['gaussian', 'tophat', 'epanechnikov', - 'exponential', 'linear', 'cosine']: - for h in [0.01, 0.1, 1]: - dens_true = compute_kernel_slow(Y, X, kernel, h) - - for rtol in [0, 1E-5]: - for atol in [1E-6, 1E-2]: - for breadth_first in (True, False): - yield (check_results, kernel, h, atol, rtol, - breadth_first, bt, Y, dens_true) - - def test_gaussian_kde(n_samples=1000): # Compare gaussian KDE results to scipy.stats.gaussian_kde from scipy.stats import gaussian_kde @@ -215,7 +198,7 @@ def check_two_point(r, dualtree): assert_array_almost_equal(counts, counts_true) for dualtree in (True, False): - yield check_two_point, r, dualtree + check_two_point(r, dualtree) def test_ball_tree_pickle(): @@ -246,7 +229,7 @@ def 
check_pickle_protocol(protocol): assert_array_almost_equal(dist1_pyfunc, dist2_pyfunc) for protocol in (0, 1, 2): - yield check_pickle_protocol, protocol + check_pickle_protocol(protocol) def test_neighbors_heap(n_pts=5, n_nbrs=10): diff --git a/sklearn/neighbors/tests/test_dist_metrics.py b/sklearn/neighbors/tests/test_dist_metrics.py index 23b7656cb313b..f4d6dc3e74c5e 100644 --- a/sklearn/neighbors/tests/test_dist_metrics.py +++ b/sklearn/neighbors/tests/test_dist_metrics.py @@ -4,6 +4,8 @@ import numpy as np from numpy.testing import assert_array_almost_equal +import pytest + from scipy.spatial.distance import cdist from sklearn.neighbors.dist_metrics import DistanceMetric from sklearn.neighbors import BallTree @@ -15,107 +17,117 @@ def dist_func(x1, x2, p): return np.sum((x1 - x2) ** p) ** (1. / p) -class TestMetrics(object): - n1 = 20 - n2 = 25 - d = 4 - zero_frac = 0.5 - rseed = 0 - dtype = np.float64 - rng = check_random_state(rseed) - X1 = rng.random_sample((n1, d)).astype(dtype) - X2 = rng.random_sample((n2, d)).astype(dtype) - - # make boolean arrays: ones and zeros - X1_bool = X1.round(0) - X2_bool = X2.round(0) - - V = rng.random_sample((d, d)) - VI = np.dot(V, V.T) - - metrics = {'euclidean': {}, - 'cityblock': {}, - 'minkowski': dict(p=(1, 1.5, 2, 3)), - 'chebyshev': {}, - 'seuclidean': dict(V=(rng.random_sample(d),)), - 'wminkowski': dict(p=(1, 1.5, 3), - w=(rng.random_sample(d),)), - 'mahalanobis': dict(VI=(VI,)), - 'hamming': {}, - 'canberra': {}, - 'braycurtis': {}} - - bool_metrics = ['matching', 'jaccard', 'dice', - 'kulsinski', 'rogerstanimoto', 'russellrao', - 'sokalmichener', 'sokalsneath'] - - def test_cdist(self): - for metric, argdict in self.metrics.items(): - keys = argdict.keys() - for vals in itertools.product(*argdict.values()): - kwargs = dict(zip(keys, vals)) - D_true = cdist(self.X1, self.X2, metric, **kwargs) - yield self.check_cdist, metric, kwargs, D_true - - for metric in self.bool_metrics: - D_true = cdist(self.X1_bool, self.X2_bool, metric) - yield self.check_cdist_bool, metric, D_true - - def check_cdist(self, metric, kwargs, D_true): - dm = DistanceMetric.get_metric(metric, **kwargs) - D12 = dm.pairwise(self.X1, self.X2) - assert_array_almost_equal(D12, D_true) - - def check_cdist_bool(self, metric, D_true): - dm = DistanceMetric.get_metric(metric) - D12 = dm.pairwise(self.X1_bool, self.X2_bool) - assert_array_almost_equal(D12, D_true) - - def test_pdist(self): - for metric, argdict in self.metrics.items(): - keys = argdict.keys() - for vals in itertools.product(*argdict.values()): - kwargs = dict(zip(keys, vals)) - D_true = cdist(self.X1, self.X1, metric, **kwargs) - yield self.check_pdist, metric, kwargs, D_true - - for metric in self.bool_metrics: - D_true = cdist(self.X1_bool, self.X1_bool, metric) - yield self.check_pdist_bool, metric, D_true - - def check_pdist(self, metric, kwargs, D_true): - dm = DistanceMetric.get_metric(metric, **kwargs) - D12 = dm.pairwise(self.X1) - assert_array_almost_equal(D12, D_true) - - def check_pdist_bool(self, metric, D_true): - dm = DistanceMetric.get_metric(metric) - D12 = dm.pairwise(self.X1_bool) - assert_array_almost_equal(D12, D_true) - - def test_pickle(self): - for metric, argdict in self.metrics.items(): - keys = argdict.keys() - for vals in itertools.product(*argdict.values()): - kwargs = dict(zip(keys, vals)) - yield self.check_pickle, metric, kwargs - - for metric in self.bool_metrics: - yield self.check_pickle_bool, metric - - def check_pickle_bool(self, metric): - dm = 
DistanceMetric.get_metric(metric) - D1 = dm.pairwise(self.X1_bool) - dm2 = pickle.loads(pickle.dumps(dm)) - D2 = dm2.pairwise(self.X1_bool) - assert_array_almost_equal(D1, D2) - - def check_pickle(self, metric, kwargs): - dm = DistanceMetric.get_metric(metric, **kwargs) - D1 = dm.pairwise(self.X1) - dm2 = pickle.loads(pickle.dumps(dm)) - D2 = dm2.pairwise(self.X1) - assert_array_almost_equal(D1, D2) +rng = check_random_state(0) +d = 4 +n1 = 20 +n2 = 25 +X1 = rng.random_sample((n1, d)).astype('float64') +X2 = rng.random_sample((n2, d)).astype('float64') + +# make boolean arrays: ones and zeros +X1_bool = X1.round(0) +X2_bool = X2.round(0) + +V = rng.random_sample((d, d)) +VI = np.dot(V, V.T) + +BOOL_METRICS = ['matching', 'jaccard', 'dice', + 'kulsinski', 'rogerstanimoto', 'russellrao', + 'sokalmichener', 'sokalsneath'] + +METRICS_DEFAULT_PARAMS = {'euclidean': {}, + 'cityblock': {}, + 'minkowski': dict(p=(1, 1.5, 2, 3)), + 'chebyshev': {}, + 'seuclidean': dict(V=(rng.random_sample(d),)), + 'wminkowski': dict(p=(1, 1.5, 3), + w=(rng.random_sample(d),)), + 'mahalanobis': dict(VI=(VI,)), + 'hamming': {}, + 'canberra': {}, + 'braycurtis': {}} + + +@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) +def test_cdist(metric): + argdict = METRICS_DEFAULT_PARAMS[metric] + keys = argdict.keys() + for vals in itertools.product(*argdict.values()): + kwargs = dict(zip(keys, vals)) + D_true = cdist(X1, X2, metric, **kwargs) + check_cdist(metric, kwargs, D_true) + + +@pytest.mark.parametrize('metric', BOOL_METRICS) +def test_cdist_bool_metric(metric): + D_true = cdist(X1_bool, X2_bool, metric) + check_cdist_bool(metric, D_true) + + +def check_cdist(metric, kwargs, D_true): + dm = DistanceMetric.get_metric(metric, **kwargs) + D12 = dm.pairwise(X1, X2) + assert_array_almost_equal(D12, D_true) + + +def check_cdist_bool(metric, D_true): + dm = DistanceMetric.get_metric(metric) + D12 = dm.pairwise(X1_bool, X2_bool) + assert_array_almost_equal(D12, D_true) + + +@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) +def test_pdist(metric): + argdict = METRICS_DEFAULT_PARAMS[metric] + keys = argdict.keys() + for vals in itertools.product(*argdict.values()): + kwargs = dict(zip(keys, vals)) + D_true = cdist(X1, X1, metric, **kwargs) + check_pdist(metric, kwargs, D_true) + + +@pytest.mark.parametrize('metric', BOOL_METRICS) +def test_pdist_bool_metrics(metric): + D_true = cdist(X1_bool, X1_bool, metric) + check_pdist_bool(metric, D_true) + + +def check_pdist(metric, kwargs, D_true): + dm = DistanceMetric.get_metric(metric, **kwargs) + D12 = dm.pairwise(X1) + assert_array_almost_equal(D12, D_true) + + +def check_pdist_bool(metric, D_true): + dm = DistanceMetric.get_metric(metric) + D12 = dm.pairwise(X1_bool) + assert_array_almost_equal(D12, D_true) + + +@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS) +def test_pickle(metric): + argdict = METRICS_DEFAULT_PARAMS[metric] + keys = argdict.keys() + for vals in itertools.product(*argdict.values()): + kwargs = dict(zip(keys, vals)) + check_pickle(metric, kwargs) + + +@pytest.mark.parametrize('metric', BOOL_METRICS) +def test_pickle_bool_metrics(metric): + dm = DistanceMetric.get_metric(metric) + D1 = dm.pairwise(X1_bool) + dm2 = pickle.loads(pickle.dumps(dm)) + D2 = dm2.pairwise(X1_bool) + assert_array_almost_equal(D1, D2) + + +def check_pickle(metric, kwargs): + dm = DistanceMetric.get_metric(metric, **kwargs) + D1 = dm.pairwise(X1) + dm2 = pickle.loads(pickle.dumps(dm)) + D2 = dm2.pairwise(X1) + assert_array_almost_equal(D1, D2) def 
test_haversine_metric(): diff --git a/sklearn/neighbors/tests/test_kd_tree.py b/sklearn/neighbors/tests/test_kd_tree.py index e1b7cb1965987..46cddc711e769 100644 --- a/sklearn/neighbors/tests/test_kd_tree.py +++ b/sklearn/neighbors/tests/test_kd_tree.py @@ -1,5 +1,8 @@ import numpy as np from numpy.testing import assert_array_almost_equal + +import pytest + from sklearn.neighbors.kd_tree import (KDTree, NeighborsHeap, simultaneous_sort, kernel_norm, nodeheap_sort, DTYPE, ITYPE) @@ -37,18 +40,17 @@ def check_neighbors(dualtree, breadth_first, k, metric, X, Y, kwargs): assert_array_almost_equal(dist1, dist2) -def test_kd_tree_query(): +@pytest.mark.parametrize('metric', METRICS) +@pytest.mark.parametrize('k', (1, 3, 5)) +@pytest.mark.parametrize('dualtree', (True, False)) +@pytest.mark.parametrize('breadth_first', (True, False)) +def test_kd_tree_query(metric, k, dualtree, breadth_first): rng = check_random_state(0) X = rng.random_sample((40, DIMENSION)) Y = rng.random_sample((10, DIMENSION)) - for (metric, kwargs) in METRICS.items(): - for k in (1, 3, 5): - for dualtree in (True, False): - for breadth_first in (True, False): - yield (check_neighbors, - dualtree, breadth_first, - k, metric, X, Y, kwargs) + kwargs = METRICS[metric] + check_neighbors(dualtree, breadth_first, k, metric, X, Y, kwargs) def test_kd_tree_query_radius(n_samples=100, n_features=10): @@ -118,22 +120,24 @@ def check_results(kernel, h, atol, rtol, breadth_first, Y, kdt, dens_true): rtol=max(rtol, 1e-7)) -def test_kd_tree_kde(n_samples=100, n_features=3): +@pytest.mark.parametrize('kernel', + ['gaussian', 'tophat', 'epanechnikov', + 'exponential', 'linear', 'cosine']) +@pytest.mark.parametrize('h', [0.01, 0.1, 1]) +def test_kd_tree_kde(kernel, h): + n_samples, n_features = (100, 3) rng = check_random_state(0) X = rng.random_sample((n_samples, n_features)) Y = rng.random_sample((n_samples, n_features)) kdt = KDTree(X, leaf_size=10) - for kernel in ['gaussian', 'tophat', 'epanechnikov', - 'exponential', 'linear', 'cosine']: - for h in [0.01, 0.1, 1]: - dens_true = compute_kernel_slow(Y, X, kernel, h) + dens_true = compute_kernel_slow(Y, X, kernel, h) - for rtol in [0, 1E-5]: - for atol in [1E-6, 1E-2]: - for breadth_first in (True, False): - yield (check_results, kernel, h, atol, rtol, - breadth_first, Y, kdt, dens_true) + for rtol in [0, 1E-5]: + for atol in [1E-6, 1E-2]: + for breadth_first in (True, False): + check_results(kernel, h, atol, rtol, + breadth_first, Y, kdt, dens_true) def test_gaussian_kde(n_samples=1000): @@ -153,7 +157,9 @@ def test_gaussian_kde(n_samples=1000): assert_array_almost_equal(dens_kdt, dens_gkde, decimal=3) -def test_kd_tree_two_point(n_samples=100, n_features=3): +@pytest.mark.parametrize('dualtree', (True, False)) +def test_kd_tree_two_point(dualtree): + n_samples, n_features = (100, 3) rng = check_random_state(0) X = rng.random_sample((n_samples, n_features)) Y = rng.random_sample((n_samples, n_features)) @@ -163,15 +169,12 @@ def test_kd_tree_two_point(n_samples=100, n_features=3): D = DistanceMetric.get_metric("euclidean").pairwise(Y, X) counts_true = [(D <= ri).sum() for ri in r] - def check_two_point(r, dualtree): - counts = kdt.two_point_correlation(Y, r=r, dualtree=dualtree) - assert_array_almost_equal(counts, counts_true) - - for dualtree in (True, False): - yield check_two_point, r, dualtree + counts = kdt.two_point_correlation(Y, r=r, dualtree=dualtree) + assert_array_almost_equal(counts, counts_true) -def test_kd_tree_pickle(): +@pytest.mark.parametrize('protocol', (0, 1, 2)) +def 
test_kd_tree_pickle(protocol): import pickle rng = check_random_state(0) X = rng.random_sample((10, 3)) @@ -185,8 +188,7 @@ def check_pickle_protocol(protocol): assert_array_almost_equal(ind1, ind2) assert_array_almost_equal(dist1, dist2) - for protocol in (0, 1, 2): - yield check_pickle_protocol, protocol + check_pickle_protocol(protocol) def test_neighbors_heap(n_pts=5, n_nbrs=10): diff --git a/sklearn/neighbors/tests/test_kde.py b/sklearn/neighbors/tests/test_kde.py index 60f294a3df0a9..caffb662608e0 100644 --- a/sklearn/neighbors/tests/test_kde.py +++ b/sklearn/neighbors/tests/test_kde.py @@ -1,4 +1,7 @@ import numpy as np + +import pytest + from sklearn.utils.testing import (assert_allclose, assert_raises, assert_equal) from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors @@ -40,21 +43,25 @@ def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true): atol=atol, rtol=max(1E-7, rtol)) -def test_kernel_density(n_samples=100, n_features=3): +@pytest.mark.parametrize( + 'kernel', + ['gaussian', 'tophat', 'epanechnikov', + 'exponential', 'linear', 'cosine']) +@pytest.mark.parametrize('bandwidth', [0.01, 0.1, 1]) +def test_kernel_density(kernel, bandwidth): + n_samples, n_features = (100, 3) + rng = np.random.RandomState(0) X = rng.randn(n_samples, n_features) Y = rng.randn(n_samples, n_features) - for kernel in ['gaussian', 'tophat', 'epanechnikov', - 'exponential', 'linear', 'cosine']: - for bandwidth in [0.01, 0.1, 1]: - dens_true = compute_kernel_slow(Y, X, kernel, bandwidth) + dens_true = compute_kernel_slow(Y, X, kernel, bandwidth) - for rtol in [0, 1E-5]: - for atol in [1E-6, 1E-2]: - for breadth_first in (True, False): - yield (check_results, kernel, bandwidth, atol, rtol, - X, Y, dens_true) + for rtol in [0, 1E-5]: + for atol in [1E-6, 1E-2]: + for breadth_first in (True, False): + check_results(kernel, bandwidth, atol, rtol, + X, Y, dens_true) def test_kernel_density_sampling(n_samples=100, n_features=3): @@ -91,23 +98,24 @@ def test_kernel_density_sampling(n_samples=100, n_features=3): assert_equal(kde.sample().shape, (1, 1)) -def test_kde_algorithm_metric_choice(): +@pytest.mark.parametrize('algorithm', ['auto', 'ball_tree', 'kd_tree']) +@pytest.mark.parametrize('metric', + ['euclidean', 'minkowski', 'manhattan', + 'chebyshev', 'haversine']) +def test_kde_algorithm_metric_choice(algorithm, metric): # Smoke test for various metrics and algorithms rng = np.random.RandomState(0) X = rng.randn(10, 2) # 2 features required for haversine dist. 
Y = rng.randn(10, 2) - for algorithm in ['auto', 'ball_tree', 'kd_tree']: - for metric in ['euclidean', 'minkowski', 'manhattan', - 'chebyshev', 'haversine']: - if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics: - assert_raises(ValueError, KernelDensity, - algorithm=algorithm, metric=metric) - else: - kde = KernelDensity(algorithm=algorithm, metric=metric) - kde.fit(X) - y_dens = kde.score_samples(Y) - assert_equal(y_dens.shape, Y.shape[:1]) + if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics: + assert_raises(ValueError, KernelDensity, + algorithm=algorithm, metric=metric) + else: + kde = KernelDensity(algorithm=algorithm, metric=metric) + kde.fit(X) + y_dens = kde.score_samples(Y) + assert_equal(y_dens.shape, Y.shape[:1]) def test_kde_score(n_samples=100, n_features=3): diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py index a95a906ad3cbd..e1acaa4c6f139 100644 --- a/sklearn/neighbors/tests/test_neighbors.py +++ b/sklearn/neighbors/tests/test_neighbors.py @@ -4,6 +4,8 @@ from scipy.sparse import (bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix, issparse) +import pytest + from sklearn import metrics from sklearn import neighbors, datasets from sklearn.exceptions import DataConversionWarning @@ -1260,63 +1262,57 @@ def test_include_self_neighbors_graph(): assert_array_equal(rng_not_self, [[0., 1.], [1., 0.]]) -def test_same_knn_parallel(): +@pytest.mark.parametrize('algorithm', ALGORITHMS) +def test_same_knn_parallel(algorithm): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y) - def check_same_knn_parallel(algorithm): - clf = neighbors.KNeighborsClassifier(n_neighbors=3, - algorithm=algorithm) - clf.fit(X_train, y_train) - y = clf.predict(X_test) - dist, ind = clf.kneighbors(X_test) - graph = clf.kneighbors_graph(X_test, mode='distance').toarray() - - clf.set_params(n_jobs=3) - clf.fit(X_train, y_train) - y_parallel = clf.predict(X_test) - dist_parallel, ind_parallel = clf.kneighbors(X_test) - graph_parallel = \ - clf.kneighbors_graph(X_test, mode='distance').toarray() - - assert_array_equal(y, y_parallel) - assert_array_almost_equal(dist, dist_parallel) - assert_array_equal(ind, ind_parallel) - assert_array_almost_equal(graph, graph_parallel) + clf = neighbors.KNeighborsClassifier(n_neighbors=3, + algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.kneighbors(X_test) + graph = clf.kneighbors_graph(X_test, mode='distance').toarray() - for algorithm in ALGORITHMS: - yield check_same_knn_parallel, algorithm + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.kneighbors(X_test) + graph_parallel = \ + clf.kneighbors_graph(X_test, mode='distance').toarray() + + assert_array_equal(y, y_parallel) + assert_array_almost_equal(dist, dist_parallel) + assert_array_equal(ind, ind_parallel) + assert_array_almost_equal(graph, graph_parallel) -def test_same_radius_neighbors_parallel(): +@pytest.mark.parametrize('algorithm', ALGORITHMS) +def test_same_radius_neighbors_parallel(algorithm): X, y = datasets.make_classification(n_samples=30, n_features=5, n_redundant=0, random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y) - def check_same_radius_neighbors_parallel(algorithm): - clf = neighbors.RadiusNeighborsClassifier(radius=10, - algorithm=algorithm) - clf.fit(X_train, 
y_train) - y = clf.predict(X_test) - dist, ind = clf.radius_neighbors(X_test) - graph = clf.radius_neighbors_graph(X_test, mode='distance').toarray() - - clf.set_params(n_jobs=3) - clf.fit(X_train, y_train) - y_parallel = clf.predict(X_test) - dist_parallel, ind_parallel = clf.radius_neighbors(X_test) - graph_parallel = \ - clf.radius_neighbors_graph(X_test, mode='distance').toarray() - - assert_array_equal(y, y_parallel) - for i in range(len(dist)): - assert_array_almost_equal(dist[i], dist_parallel[i]) - assert_array_equal(ind[i], ind_parallel[i]) - assert_array_almost_equal(graph, graph_parallel) - - for algorithm in ALGORITHMS: - yield check_same_radius_neighbors_parallel, algorithm + clf = neighbors.RadiusNeighborsClassifier(radius=10, + algorithm=algorithm) + clf.fit(X_train, y_train) + y = clf.predict(X_test) + dist, ind = clf.radius_neighbors(X_test) + graph = clf.radius_neighbors_graph(X_test, mode='distance').toarray() + + clf.set_params(n_jobs=3) + clf.fit(X_train, y_train) + y_parallel = clf.predict(X_test) + dist_parallel, ind_parallel = clf.radius_neighbors(X_test) + graph_parallel = \ + clf.radius_neighbors_graph(X_test, mode='distance').toarray() + + assert_array_equal(y, y_parallel) + for i in range(len(dist)): + assert_array_almost_equal(dist[i], dist_parallel[i]) + assert_array_equal(ind[i], ind_parallel[i]) + assert_array_almost_equal(graph, graph_parallel) def test_dtype_convert(): diff --git a/sklearn/neighbors/tests/test_quad_tree.py b/sklearn/neighbors/tests/test_quad_tree.py index 6cfa4bcc562e2..156bfc232a55d 100644 --- a/sklearn/neighbors/tests/test_quad_tree.py +++ b/sklearn/neighbors/tests/test_quad_tree.py @@ -1,5 +1,8 @@ import pickle import numpy as np + +import pytest + from sklearn.neighbors.quad_tree import _QuadTree from sklearn.utils import check_random_state @@ -58,50 +61,43 @@ def test_quadtree_similar_point(): tree._check_coherence() -def test_quad_tree_pickle(): +@pytest.mark.parametrize('n_dimensions', (2, 3)) +@pytest.mark.parametrize('protocol', (0, 1, 2)) +def test_quad_tree_pickle(n_dimensions, protocol): rng = check_random_state(0) - for n_dimensions in (2, 3): - X = rng.random_sample((10, n_dimensions)) - - tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) - tree.build_tree(X) + X = rng.random_sample((10, n_dimensions)) - def check_pickle_protocol(protocol): - s = pickle.dumps(tree, protocol=protocol) - bt2 = pickle.loads(s) + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) + tree.build_tree(X) - for x in X: - cell_x_tree = tree.get_cell(x) - cell_x_bt2 = bt2.get_cell(x) - assert cell_x_tree == cell_x_bt2 + s = pickle.dumps(tree, protocol=protocol) + bt2 = pickle.loads(s) - for protocol in (0, 1, 2): - yield check_pickle_protocol, protocol + for x in X: + cell_x_tree = tree.get_cell(x) + cell_x_bt2 = bt2.get_cell(x) + assert cell_x_tree == cell_x_bt2 -def test_qt_insert_duplicate(): +@pytest.mark.parametrize('n_dimensions', (2, 3)) +def test_qt_insert_duplicate(n_dimensions): rng = check_random_state(0) - def check_insert_duplicate(n_dimensions=2): - - X = rng.random_sample((10, n_dimensions)) - Xd = np.r_[X, X[:5]] - tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) - tree.build_tree(Xd) - - cumulative_size = tree.cumulative_size - leafs = tree.leafs + X = rng.random_sample((10, n_dimensions)) + Xd = np.r_[X, X[:5]] + tree = _QuadTree(n_dimensions=n_dimensions, verbose=0) + tree.build_tree(Xd) - # Assert that the first 5 are indeed duplicated and that the next - # ones are single point leaf - for i, x in enumerate(X): - 
cell_id = tree.get_cell(x) - assert leafs[cell_id] - assert cumulative_size[cell_id] == 1 + (i < 5) + cumulative_size = tree.cumulative_size + leafs = tree.leafs - for n_dimensions in (2, 3): - yield check_insert_duplicate, n_dimensions + # Assert that the first 5 are indeed duplicated and that the next + # ones are single point leaf + for i, x in enumerate(X): + cell_id = tree.get_cell(x) + assert leafs[cell_id] + assert cumulative_size[cell_id] == 1 + (i < 5) def test_summarize(): diff --git a/sklearn/preprocessing/tests/test_label.py b/sklearn/preprocessing/tests/test_label.py index 14788b14b5218..faa0cc3ce275b 100644 --- a/sklearn/preprocessing/tests/test_label.py +++ b/sklearn/preprocessing/tests/test_label.py @@ -482,7 +482,7 @@ def test_label_binarize_binary(): neg_label = -1 expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1)) - yield check_binarized_results, y, classes, pos_label, neg_label, expected + check_binarized_results(y, classes, pos_label, neg_label, expected) # Binary case where sparse_output = True will not result in a ValueError y = [0, 1, 0] @@ -491,7 +491,7 @@ def test_label_binarize_binary(): neg_label = 0 expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1)) - yield check_binarized_results, y, classes, pos_label, neg_label, expected + check_binarized_results(y, classes, pos_label, neg_label, expected) def test_label_binarize_multiclass(): @@ -501,7 +501,7 @@ def test_label_binarize_multiclass(): neg_label = 0 expected = 2 * np.eye(3) - yield check_binarized_results, y, classes, pos_label, neg_label, expected + check_binarized_results(y, classes, pos_label, neg_label, expected) assert_raises(ValueError, label_binarize, y, classes, neg_label=-1, pos_label=pos_label, sparse_output=True) @@ -518,8 +518,8 @@ def test_label_binarize_multilabel(): dok_matrix, lil_matrix]] for y in [y_ind] + y_sparse: - yield (check_binarized_results, y, classes, pos_label, neg_label, - expected) + check_binarized_results(y, classes, pos_label, neg_label, + expected) assert_raises(ValueError, label_binarize, y, classes, neg_label=-1, pos_label=pos_label, sparse_output=True) diff --git a/sklearn/svm/tests/test_bounds.py b/sklearn/svm/tests/test_bounds.py index e46dbb92df44a..d02c53b05d8b7 100644 --- a/sklearn/svm/tests/test_bounds.py +++ b/sklearn/svm/tests/test_bounds.py @@ -1,6 +1,8 @@ import numpy as np from scipy import sparse as sp +import pytest + from sklearn.svm.bounds import l1_min_c from sklearn.svm import LinearSVC from sklearn.linear_model.logistic import LogisticRegression @@ -16,25 +18,24 @@ Y2 = [2, 1, 0, 0] -def test_l1_min_c(): - losses = ['squared_hinge', 'log'] +@pytest.mark.parametrize('loss', ['squared_hinge', 'log']) +@pytest.mark.parametrize('X_label', ['sparse', 'dense']) +@pytest.mark.parametrize('Y_label', ['two-classes', 'multi-class']) +@pytest.mark.parametrize('intercept_label', ['no-intercept', 'fit-intercept']) +def test_l1_min_c(loss, X_label, Y_label, intercept_label): Xs = {'sparse': sparse_X, 'dense': dense_X} Ys = {'two-classes': Y1, 'multi-class': Y2} intercepts = {'no-intercept': {'fit_intercept': False}, 'fit-intercept': {'fit_intercept': True, 'intercept_scaling': 10}} - for loss in losses: - for X_label, X in Xs.items(): - for Y_label, Y in Ys.items(): - for intercept_label, intercept_params in intercepts.items(): - check = lambda: check_l1_min_c(X, Y, loss, - **intercept_params) - check.description = ('Test l1_min_c loss=%r %s %s %s' % - (loss, X_label, Y_label, - intercept_label)) - yield check + X = Xs[X_label] + 
Y = Ys[Y_label] + intercept_params = intercepts[intercept_label] + check_l1_min_c(X, Y, loss, **intercept_params) + +def test_l1_min_c_l2_loss(): # loss='l2' should raise ValueError assert_raise_message(ValueError, "loss type not in", l1_min_c, dense_X, Y1, "l2") diff --git a/sklearn/tests/test_naive_bayes.py b/sklearn/tests/test_naive_bayes.py index 2f15163d09dda..6b090ce4684f9 100644 --- a/sklearn/tests/test_naive_bayes.py +++ b/sklearn/tests/test_naive_bayes.py @@ -4,6 +4,7 @@ from io import BytesIO import numpy as np import scipy.sparse +import pytest from sklearn.datasets import load_digits, load_iris @@ -177,51 +178,56 @@ def test_discrete_prior(): clf.class_log_prior_, 8) -def test_mnnb(): +@pytest.mark.parametrize('kind', ('dense', 'sparse')) +def test_mnnb(kind): # Test Multinomial Naive Bayes classification. # This checks that MultinomialNB implements fit and predict and returns # correct values for a simple toy dataset. - for X in [X2, scipy.sparse.csr_matrix(X2)]: - # Check the ability to predict the learning set. - clf = MultinomialNB() - assert_raises(ValueError, clf.fit, -X, y2) - y_pred = clf.fit(X, y2).predict(X) + if kind == 'dense': + X = X2 + elif kind == 'sparse': + X = scipy.sparse.csr_matrix(X2) - assert_array_equal(y_pred, y2) + # Check the ability to predict the learning set. + clf = MultinomialNB() + assert_raises(ValueError, clf.fit, -X, y2) + y_pred = clf.fit(X, y2).predict(X) + + assert_array_equal(y_pred, y2) - # Verify that np.log(clf.predict_proba(X)) gives the same results as - # clf.predict_log_proba(X) - y_pred_proba = clf.predict_proba(X) - y_pred_log_proba = clf.predict_log_proba(X) - assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) + # Verify that np.log(clf.predict_proba(X)) gives the same results as + # clf.predict_log_proba(X) + y_pred_proba = clf.predict_proba(X) + y_pred_log_proba = clf.predict_log_proba(X) + assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) - # Check that incremental fitting yields the same results - clf2 = MultinomialNB() - clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2)) - clf2.partial_fit(X[2:5], y2[2:5]) - clf2.partial_fit(X[5:], y2[5:]) + # Check that incremental fitting yields the same results + clf2 = MultinomialNB() + clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2)) + clf2.partial_fit(X[2:5], y2[2:5]) + clf2.partial_fit(X[5:], y2[5:]) - y_pred2 = clf2.predict(X) - assert_array_equal(y_pred2, y2) + y_pred2 = clf2.predict(X) + assert_array_equal(y_pred2, y2) - y_pred_proba2 = clf2.predict_proba(X) - y_pred_log_proba2 = clf2.predict_log_proba(X) - assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8) - assert_array_almost_equal(y_pred_proba2, y_pred_proba) - assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba) + y_pred_proba2 = clf2.predict_proba(X) + y_pred_log_proba2 = clf2.predict_log_proba(X) + assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8) + assert_array_almost_equal(y_pred_proba2, y_pred_proba) + assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba) - # Partial fit on the whole data at once should be the same as fit too - clf3 = MultinomialNB() - clf3.partial_fit(X, y2, classes=np.unique(y2)) + # Partial fit on the whole data at once should be the same as fit too + clf3 = MultinomialNB() + clf3.partial_fit(X, y2, classes=np.unique(y2)) - y_pred3 = clf3.predict(X) - assert_array_equal(y_pred3, y2) - y_pred_proba3 = clf3.predict_proba(X) - y_pred_log_proba3 = clf3.predict_log_proba(X) - 
assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8) - assert_array_almost_equal(y_pred_proba3, y_pred_proba) - assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba) + y_pred3 = clf3.predict(X) + assert_array_equal(y_pred3, y2) + y_pred_proba3 = clf3.predict_proba(X) + y_pred_log_proba3 = clf3.predict_log_proba(X) + assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8) + assert_array_almost_equal(y_pred_proba3, y_pred_proba) + assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba) def check_partial_fit(cls): @@ -240,9 +246,9 @@ def check_partial_fit(cls): assert_array_equal(clf1.feature_count_, clf3.feature_count_) -def test_discretenb_partial_fit(): - for cls in [MultinomialNB, BernoulliNB]: - yield check_partial_fit, cls +@pytest.mark.parametrize("cls", [MultinomialNB, BernoulliNB]) +def test_discretenb_partial_fit(cls): + check_partial_fit(cls) def test_gnb_partial_fit(): @@ -259,62 +265,63 @@ def test_gnb_partial_fit(): assert_array_almost_equal(clf.class_prior_, clf_pf2.class_prior_) -def test_discretenb_pickle(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, GaussianNB]) +def test_discretenb_pickle(cls): # Test picklability of discrete naive Bayes classifiers - for cls in [BernoulliNB, MultinomialNB, GaussianNB]: - clf = cls().fit(X2, y2) - y_pred = clf.predict(X2) + clf = cls().fit(X2, y2) + y_pred = clf.predict(X2) - store = BytesIO() - pickle.dump(clf, store) - clf = pickle.load(BytesIO(store.getvalue())) + store = BytesIO() + pickle.dump(clf, store) + clf = pickle.load(BytesIO(store.getvalue())) - assert_array_equal(y_pred, clf.predict(X2)) + assert_array_equal(y_pred, clf.predict(X2)) - if cls is not GaussianNB: - # TODO re-enable me when partial_fit is implemented for GaussianNB + if cls is not GaussianNB: + # TODO re-enable me when partial_fit is implemented for GaussianNB - # Test pickling of estimator trained with partial_fit - clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2)) - clf2.partial_fit(X2[3:], y2[3:]) - store = BytesIO() - pickle.dump(clf2, store) - clf2 = pickle.load(BytesIO(store.getvalue())) - assert_array_equal(y_pred, clf2.predict(X2)) + # Test pickling of estimator trained with partial_fit + clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2)) + clf2.partial_fit(X2[3:], y2[3:]) + store = BytesIO() + pickle.dump(clf2, store) + clf2 = pickle.load(BytesIO(store.getvalue())) + assert_array_equal(y_pred, clf2.predict(X2)) -def test_input_check_fit(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB, GaussianNB]) +def test_input_check_fit(cls): # Test input checks for the fit method - for cls in [BernoulliNB, MultinomialNB, GaussianNB]: - # check shape consistency for number of samples at fit time - assert_raises(ValueError, cls().fit, X2, y2[:-1]) - # check shape consistency for number of input features at predict time - clf = cls().fit(X2, y2) - assert_raises(ValueError, clf.predict, X2[:, :-1]) + # check shape consistency for number of samples at fit time + assert_raises(ValueError, cls().fit, X2, y2[:-1]) + # check shape consistency for number of input features at predict time + clf = cls().fit(X2, y2) + assert_raises(ValueError, clf.predict, X2[:, :-1]) -def test_input_check_partial_fit(): - for cls in [BernoulliNB, MultinomialNB]: - # check shape consistency - assert_raises(ValueError, cls().partial_fit, X2, y2[:-1], - classes=np.unique(y2)) - # classes is required for first call to partial fit - assert_raises(ValueError, cls().partial_fit, X2, y2) 
+@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_input_check_partial_fit(cls): + # check shape consistency + assert_raises(ValueError, cls().partial_fit, X2, y2[:-1], + classes=np.unique(y2)) + + # classes is required for first call to partial fit + assert_raises(ValueError, cls().partial_fit, X2, y2) - # check consistency of consecutive classes values - clf = cls() - clf.partial_fit(X2, y2, classes=np.unique(y2)) - assert_raises(ValueError, clf.partial_fit, X2, y2, - classes=np.arange(42)) + # check consistency of consecutive classes values + clf = cls() + clf.partial_fit(X2, y2, classes=np.unique(y2)) + assert_raises(ValueError, clf.partial_fit, X2, y2, + classes=np.arange(42)) - # check consistency of input shape for partial_fit - assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2) + # check consistency of input shape for partial_fit + assert_raises(ValueError, clf.partial_fit, X2[:, :-1], y2) - # check consistency of input shape for predict - assert_raises(ValueError, clf.predict, X2[:, :-1]) + # check consistency of input shape for predict + assert_raises(ValueError, clf.predict, X2[:, :-1]) def test_discretenb_predict_proba(): @@ -348,34 +355,35 @@ def test_discretenb_predict_proba(): assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1) -def test_discretenb_uniform_prior(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_discretenb_uniform_prior(cls): # Test whether discrete NB classes fit a uniform prior # when fit_prior=False and class_prior=None - for cls in [BernoulliNB, MultinomialNB]: - clf = cls() - clf.set_params(fit_prior=False) - clf.fit([[0], [0], [1]], [0, 0, 1]) - prior = np.exp(clf.class_log_prior_) - assert_array_almost_equal(prior, np.array([.5, .5])) + clf = cls() + clf.set_params(fit_prior=False) + clf.fit([[0], [0], [1]], [0, 0, 1]) + prior = np.exp(clf.class_log_prior_) + assert_array_almost_equal(prior, np.array([.5, .5])) -def test_discretenb_provide_prior(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_discretenb_provide_prior(cls): # Test whether discrete NB classes use provided prior - for cls in [BernoulliNB, MultinomialNB]: - clf = cls(class_prior=[0.5, 0.5]) - clf.fit([[0], [0], [1]], [0, 0, 1]) - prior = np.exp(clf.class_log_prior_) - assert_array_almost_equal(prior, np.array([.5, .5])) + clf = cls(class_prior=[0.5, 0.5]) + clf.fit([[0], [0], [1]], [0, 0, 1]) + prior = np.exp(clf.class_log_prior_) + assert_array_almost_equal(prior, np.array([.5, .5])) - # Inconsistent number of classes with prior - assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2]) - assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1], - classes=[0, 1, 1]) + # Inconsistent number of classes with prior + assert_raises(ValueError, clf.fit, [[0], [1], [2]], [0, 1, 2]) + assert_raises(ValueError, clf.partial_fit, [[0], [1]], [0, 1], + classes=[0, 1, 1]) -def test_discretenb_provide_prior_with_partial_fit(): +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_discretenb_provide_prior_with_partial_fit(cls): # Test whether discrete NB classes use provided prior # when using partial_fit @@ -383,22 +391,21 @@ def test_discretenb_provide_prior_with_partial_fit(): iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split( iris.data, iris.target, test_size=0.4, random_state=415) - for cls in [BernoulliNB, MultinomialNB]: - for prior in [None, [0.3, 0.3, 0.4]]: - clf_full = cls(class_prior=prior) - clf_full.fit(iris.data, iris.target) - clf_partial = 
cls(class_prior=prior) - clf_partial.partial_fit(iris_data1, iris_target1, - classes=[0, 1, 2]) - clf_partial.partial_fit(iris_data2, iris_target2) - assert_array_almost_equal(clf_full.class_log_prior_, - clf_partial.class_log_prior_) - - -def test_sample_weight_multiclass(): - for cls in [BernoulliNB, MultinomialNB]: - # check shape consistency for number of samples at fit time - yield check_sample_weight_multiclass, cls + for prior in [None, [0.3, 0.3, 0.4]]: + clf_full = cls(class_prior=prior) + clf_full.fit(iris.data, iris.target) + clf_partial = cls(class_prior=prior) + clf_partial.partial_fit(iris_data1, iris_target1, + classes=[0, 1, 2]) + clf_partial.partial_fit(iris_data2, iris_target2) + assert_array_almost_equal(clf_full.class_log_prior_, + clf_partial.class_log_prior_) + + +@pytest.mark.parametrize('cls', [BernoulliNB, MultinomialNB]) +def test_sample_weight_multiclass(cls): + # check shape consistency for number of samples at fit time + check_sample_weight_multiclass(cls) def check_sample_weight_multiclass(cls): diff --git a/sklearn/tests/test_random_projection.py b/sklearn/tests/test_random_projection.py index dcbe97c7d6d7f..975922a341163 100644 --- a/sklearn/tests/test_random_projection.py +++ b/sklearn/tests/test_random_projection.py @@ -1,7 +1,10 @@ from __future__ import division +import functools + import numpy as np import scipy.sparse as sp +import pytest from sklearn.metrics import euclidean_distances @@ -113,21 +116,21 @@ def check_input_with_sparse_random_matrix(random_matrix): random_matrix, n_components, n_features, density=density) -def test_basic_property_of_random_matrix(): +@pytest.mark.parametrize("random_matrix", all_random_matrix) +def test_basic_property_of_random_matrix(random_matrix): # Check basic properties of random matrix generation - for random_matrix in all_random_matrix: - yield check_input_size_random_matrix, random_matrix - yield check_size_generated, random_matrix - yield check_zero_mean_and_unit_norm, random_matrix - - for random_matrix in all_sparse_random_matrix: - yield check_input_with_sparse_random_matrix, random_matrix - - random_matrix_dense = \ - lambda n_components, n_features, random_state: random_matrix( - n_components, n_features, random_state=random_state, - density=1.0) - yield check_zero_mean_and_unit_norm, random_matrix_dense + check_input_size_random_matrix(random_matrix) + check_size_generated(random_matrix) + check_zero_mean_and_unit_norm(random_matrix) + + +@pytest.mark.parametrize("random_matrix", all_sparse_random_matrix) +def test_basic_property_of_sparse_random_matrix(random_matrix): + check_input_with_sparse_random_matrix(random_matrix) + + random_matrix_dense = functools.partial(random_matrix, density=1.0) + + check_zero_mean_and_unit_norm(random_matrix_dense) def test_gaussian_random_matrix(): diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index f85493543b1ef..bb117d8a29863 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -7,6 +7,7 @@ from itertools import product import struct +import pytest import numpy as np from scipy.sparse import csc_matrix from scipy.sparse import csr_matrix @@ -701,14 +702,14 @@ def check_min_weight_fraction_leaf(name, datasets, sparse=False): name, est.min_weight_fraction_leaf)) -def test_min_weight_fraction_leaf(): - # Check on dense input - for name in ALL_TREES: - yield check_min_weight_fraction_leaf, name, "iris" +@pytest.mark.parametrize("name", ALL_TREES) +def test_min_weight_fraction_leaf_on_dense_input(name): + 
check_min_weight_fraction_leaf(name, "iris") - # Check on sparse input - for name in SPARSE_TREES: - yield check_min_weight_fraction_leaf, name, "multilabel", True + +@pytest.mark.parametrize("name", SPARSE_TREES) +def test_min_weight_fraction_leaf_on_sparse_input(name): + check_min_weight_fraction_leaf(name, "multilabel", True) def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets, @@ -775,16 +776,15 @@ def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets, est.min_samples_leaf)) -def test_min_weight_fraction_leaf_with_min_samples_leaf(): - # Check on dense input - for name in ALL_TREES: - yield (check_min_weight_fraction_leaf_with_min_samples_leaf, - name, "iris") +@pytest.mark.parametrize("name", ALL_TREES) +def test_min_weight_fraction_leaf_with_min_samples_leaf_on_dense_input(name): + check_min_weight_fraction_leaf_with_min_samples_leaf(name, "iris") + - # Check on sparse input - for name in SPARSE_TREES: - yield (check_min_weight_fraction_leaf_with_min_samples_leaf, - name, "multilabel", True) +@pytest.mark.parametrize("name", SPARSE_TREES) +def test_min_weight_fraction_leaf_with_min_samples_leaf_on_sparse_input(name): + check_min_weight_fraction_leaf_with_min_samples_leaf( + name, "multilabel", True) def test_min_impurity_split(): @@ -1178,9 +1178,9 @@ def check_class_weights(name): assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_) -def test_class_weights(): - for name in CLF_TREES: - yield check_class_weights, name +@pytest.mark.parametrize("name", CLF_TREES) +def test_class_weights(name): + check_class_weights(name) def check_class_weight_errors(name): @@ -1202,9 +1202,9 @@ def check_class_weight_errors(name): assert_raises(ValueError, clf.fit, X, _y) -def test_class_weight_errors(): - for name in CLF_TREES: - yield check_class_weight_errors, name +@pytest.mark.parametrize("name", CLF_TREES) +def test_class_weight_errors(name): + check_class_weight_errors(name) def test_max_leaf_nodes(): @@ -1364,20 +1364,25 @@ def check_sparse_input(tree, dataset, max_depth=None): y_log_proba) -def test_sparse_input(): - for tree_type, dataset in product(SPARSE_TREES, ("clf_small", "toy", - "digits", "multilabel", - "sparse-pos", - "sparse-neg", - "sparse-mix", "zeros")): - max_depth = 3 if dataset == "digits" else None - yield (check_sparse_input, tree_type, dataset, max_depth) +@pytest.mark.parametrize("tree_type", SPARSE_TREES) +@pytest.mark.parametrize( + "dataset", + ("clf_small", "toy", "digits", "multilabel", + "sparse-pos", "sparse-neg", "sparse-mix", + "zeros") +) +def test_sparse_input(tree_type, dataset): + max_depth = 3 if dataset == "digits" else None + check_sparse_input(tree_type, dataset, max_depth) + +@pytest.mark.parametrize("tree_type", + set(SPARSE_TREES).intersection(REG_TREES)) +@pytest.mark.parametrize("dataset", ["boston", "reg_small"]) +def test_sparse_input_reg_trees(tree_type, dataset): # Due to numerical instability of MSE and too strict test, we limit the # maximal depth - for tree_type, dataset in product(SPARSE_TREES, ["boston", "reg_small"]): - if tree_type in REG_TREES: - yield (check_sparse_input, tree_type, dataset, 2) + check_sparse_input(tree_type, dataset, 2) def check_sparse_parameters(tree, dataset): @@ -1424,13 +1429,6 @@ def check_sparse_parameters(tree, dataset): assert_array_almost_equal(s.predict(X), d.predict(X)) -def test_sparse_parameters(): - for tree_type, dataset in product(SPARSE_TREES, ["sparse-pos", - "sparse-neg", - "sparse-mix", "zeros"]): - yield (check_sparse_parameters, 
tree_type, dataset) - - def check_sparse_criterion(tree, dataset): TreeEstimator = ALL_TREES[tree] X = DATASETS[dataset]["X"] @@ -1451,11 +1449,13 @@ def check_sparse_criterion(tree, dataset): assert_array_almost_equal(s.predict(X), d.predict(X)) -def test_sparse_criterion(): - for tree_type, dataset in product(SPARSE_TREES, ["sparse-pos", - "sparse-neg", - "sparse-mix", "zeros"]): - yield (check_sparse_criterion, tree_type, dataset) +@pytest.mark.parametrize("tree_type", SPARSE_TREES) +@pytest.mark.parametrize("dataset", + ["sparse-pos", "sparse-neg", "sparse-mix", "zeros"]) +@pytest.mark.parametrize("check", + [check_sparse_parameters, check_sparse_criterion]) +def test_sparse(tree_type, dataset, check): + check(tree_type, dataset) def check_explicit_sparse_zeros(tree, max_depth=3, @@ -1527,9 +1527,9 @@ def check_explicit_sparse_zeros(tree, max_depth=3, d.predict_proba(X2)) -def test_explicit_sparse_zeros(): - for tree_type in SPARSE_TREES: - yield (check_explicit_sparse_zeros, tree_type) +@pytest.mark.parametrize("tree_type", SPARSE_TREES) +def test_explicit_sparse_zeros(tree_type): + check_explicit_sparse_zeros(tree_type) @ignore_warnings @@ -1547,10 +1547,10 @@ def check_raise_error_on_1d_input(name): assert_raises(ValueError, est.predict, [X]) -@ignore_warnings -def test_1d_input(): - for name in ALL_TREES: - yield check_raise_error_on_1d_input, name +@pytest.mark.parametrize("name", ALL_TREES) +def test_1d_input(name): + with ignore_warnings(): + check_raise_error_on_1d_input(name) def _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight): @@ -1576,9 +1576,9 @@ def check_min_weight_leaf_split_level(name): sample_weight) -def test_min_weight_leaf_split_level(): - for name in ALL_TREES: - yield check_min_weight_leaf_split_level, name +@pytest.mark.parametrize("name", ALL_TREES) +def test_min_weight_leaf_split_level(name): + check_min_weight_leaf_split_level(name) def check_public_apply(name): @@ -1599,12 +1599,14 @@ def check_public_apply_sparse(name): est.tree_.apply(X_small32)) -def test_public_apply(): - for name in ALL_TREES: - yield (check_public_apply, name) +@pytest.mark.parametrize("name", ALL_TREES) +def test_public_apply_all_trees(name): + check_public_apply(name) - for name in SPARSE_TREES: - yield (check_public_apply_sparse, name) + +@pytest.mark.parametrize("name", SPARSE_TREES) +def test_public_apply_sparse_trees(name): + check_public_apply_sparse(name) def check_presort_sparse(est, X, y): @@ -1623,19 +1625,18 @@ def test_presort_sparse(): y = y[:, 0] for est, sparse_matrix in product(ests, sparse_matrices): - yield check_presort_sparse, est, sparse_matrix(X), y + check_presort_sparse(est, sparse_matrix(X), y) -def test_invalid_presort(): - classes = (DecisionTreeRegressor, DecisionTreeClassifier) +@pytest.mark.parametrize('cls', + (DecisionTreeRegressor, DecisionTreeClassifier)) +def test_invalid_presort(cls): allowed_presort = ('auto', True, False) invalid_presort = 'invalid' msg = ("'presort' should be in {}. 
" "Got {!r} instead.".format(allowed_presort, invalid_presort)) - for cls in classes: - est = cls(presort=invalid_presort) - assert_raise_message(ValueError, msg, - est.fit, X, y) + est = cls(presort=invalid_presort) + assert_raise_message(ValueError, msg, est.fit, X, y) def test_decision_path_hardcoded(): @@ -1674,9 +1675,9 @@ def check_decision_path(name): assert_less_equal(est.tree_.max_depth, max_depth) -def test_decision_path(): - for name in ALL_TREES: - yield (check_decision_path, name) +@pytest.mark.parametrize("name", ALL_TREES) +def test_decision_path(name): + check_decision_path(name) def check_no_sparse_y_support(name): @@ -1685,10 +1686,10 @@ def check_no_sparse_y_support(name): assert_raises(TypeError, TreeEstimator(random_state=0).fit, X, y) -def test_no_sparse_y_support(): +@pytest.mark.parametrize("name", ALL_TREES) +def test_no_sparse_y_support(name): # Currently we don't support sparse y - for name in ALL_TREES: - yield (check_no_sparse_y_support, name) + check_no_sparse_y_support(name) def test_mae(): diff --git a/sklearn/utils/tests/test_extmath.py b/sklearn/utils/tests/test_extmath.py index f53b814c70084..d89e2a1aa1223 100644 --- a/sklearn/utils/tests/test_extmath.py +++ b/sklearn/utils/tests/test_extmath.py @@ -9,6 +9,8 @@ from scipy import linalg from scipy import stats +import pytest + from sklearn.utils.testing import assert_equal from sklearn.utils.testing import assert_almost_equal from sklearn.utils.testing import assert_array_equal @@ -170,9 +172,10 @@ def check_randomized_svd_low_rank(dtype): assert_almost_equal(s[:rank], sa[:rank], decimal=decimal) -def test_randomized_svd_low_rank_all_dtypes(): - for dtype in (np.int32, np.int64, np.float32, np.float64): - yield check_randomized_svd_low_rank, dtype +@pytest.mark.parametrize('dtype', + (np.int32, np.int64, np.float32, np.float64)) +def test_randomized_svd_low_rank_all_dtypes(dtype): + check_randomized_svd_low_rank(dtype) @ignore_warnings # extmath.norm is deprecated to be removed in 0.21 @@ -191,34 +194,35 @@ def test_norm_squared_norm(): squared_norm, X.astype(int)) -def test_row_norms(): +@pytest.mark.parametrize('dtype', + (np.float32, np.float64)) +def test_row_norms(dtype): X = np.random.RandomState(42).randn(100, 100) - for dtype in (np.float32, np.float64): - if dtype is np.float32: - precision = 4 - else: - precision = 5 - - X = X.astype(dtype) - sq_norm = (X ** 2).sum(axis=1) - - assert_array_almost_equal(sq_norm, row_norms(X, squared=True), + if dtype is np.float32: + precision = 4 + else: + precision = 5 + + X = X.astype(dtype) + sq_norm = (X ** 2).sum(axis=1) + + assert_array_almost_equal(sq_norm, row_norms(X, squared=True), + precision) + assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision) + + for csr_index_dtype in [np.int32, np.int64]: + Xcsr = sparse.csr_matrix(X, dtype=dtype) + # csr_matrix will use int32 indices by default, + # up-casting those to int64 when necessary + if csr_index_dtype is np.int64: + Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype) + Xcsr.indices = Xcsr.indices.astype(csr_index_dtype) + assert Xcsr.indices.dtype == csr_index_dtype + assert Xcsr.indptr.dtype == csr_index_dtype + assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), + precision) + assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision) - assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision) - - for csr_index_dtype in [np.int32, np.int64]: - Xcsr = sparse.csr_matrix(X, dtype=dtype) - # csr_matrix will use int32 indices by default, - # up-casting 
those to int64 when necessary - if csr_index_dtype is np.int64: - Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype) - Xcsr.indices = Xcsr.indices.astype(csr_index_dtype) - assert Xcsr.indices.dtype == csr_index_dtype - assert Xcsr.indptr.dtype == csr_index_dtype - assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), - precision) - assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), - precision) def test_randomized_svd_low_rank_with_noise(): diff --git a/sklearn/utils/tests/test_stats.py b/sklearn/utils/tests/test_stats.py index fbd05031c87b3..36e3bf72b609b 100644 --- a/sklearn/utils/tests/test_stats.py +++ b/sklearn/utils/tests/test_stats.py @@ -1,3 +1,4 @@ +import pytest from sklearn.utils.testing import assert_array_equal, ignore_warnings from sklearn.utils.stats import rankdata @@ -13,12 +14,10 @@ ) -@ignore_warnings # Test deprecated backport to be removed in 0.21 -def test_cases(): +@pytest.mark.parametrize("values, method, expected", _cases) +def test_cases_rankdata(values, method, expected): - def check_case(values, method, expected): + # Test deprecated backport to be removed in 0.21 + with ignore_warnings(): r = rankdata(values, method=method) assert_array_equal(r, expected) - - for values, method, expected in _cases: - yield check_case, values, method, expected
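
The hunks above all apply the same mechanical transformation: a nose-style generator test that yields a check function plus its arguments becomes a flat test function decorated with pytest.mark.parametrize, with one parameter per loop variable and stacked decorators when the original looped over a cartesian product (as in the test_sparse hunk of test_tree.py). The sketch below is only an illustration of that pattern, not part of the patch; the names ESTIMATORS, DATASETS and check_fit_idempotent are hypothetical stand-ins for the real fixtures and check_* helpers.

    import pytest

    ESTIMATORS = ["tree", "forest"]   # hypothetical parameter values
    DATASETS = ["dense", "sparse"]    # hypothetical parameter values

    def check_fit_idempotent(estimator, dataset):
        # stand-in for the real check_* helpers invoked by the tests above
        assert isinstance(estimator, str) and isinstance(dataset, str)

    # Old nose style: a single generator "test" yielding every combination.
    # def test_fit_idempotent():
    #     for estimator, dataset in itertools.product(ESTIMATORS, DATASETS):
    #         yield check_fit_idempotent, estimator, dataset

    # New pytest style: stacked parametrize decorators enumerate the same
    # cartesian product, but each combination is collected as its own test.
    @pytest.mark.parametrize("dataset", DATASETS)
    @pytest.mark.parametrize("estimator", ESTIMATORS)
    def test_fit_idempotent(estimator, dataset):
        check_fit_idempotent(estimator, dataset)

With parametrization each case gets its own test id (e.g. test_fit_idempotent[tree-dense]), so a failure reports the exact parameter combination and can be re-run in isolation, which the yield-based generator tests removed here could not do under pytest.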