
MAINT More test runtime optimizations #14136


Merged: 6 commits, Jun 22, 2019
6 changes: 6 additions & 0 deletions sklearn/feature_extraction/tests/test_image.py
@@ -63,6 +63,8 @@ def test_connect_regions():
# Newer versions of scipy have face in misc
from scipy import misc
face = misc.face(gray=True)
# subsample by 4 to reduce run time
face = face[::4, ::4]
for thr in (50, 150):
mask = face > thr
graph = img_to_graph(face, mask)
@@ -77,6 +79,10 @@ def test_connect_regions_with_grid():
# Newer versions of scipy have face in misc
from scipy import misc
face = misc.face(gray=True)

# subsample by 4 to reduce run time
face = face[::4, ::4]

mask = face > 50
graph = grid_to_graph(*face.shape, mask=mask)
assert_equal(ndimage.label(mask)[1], connected_components(graph)[0])
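
Side note on the change above: slicing with a step of 4 along both axes is plain NumPy striding, so the pixel count drops by roughly 16x before any graph is built. A minimal sketch, using a zeros array as a stand-in for scipy's 768 x 1024 grayscale face image:

# Minimal sketch (not part of the PR) of the subsampling used above.
# Keeping every 4th pixel along each axis cuts the pixel count ~16x,
# which shrinks the graphs built by img_to_graph / grid_to_graph.
import numpy as np

face = np.zeros((768, 1024))  # stand-in for misc.face(gray=True)
small = face[::4, ::4]
print(face.shape, small.shape)  # (768, 1024) (192, 256)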
34 changes: 17 additions & 17 deletions sklearn/feature_selection/tests/test_feature_select.py
@@ -6,6 +6,8 @@
import numpy as np
from scipy import stats, sparse

import pytest

from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_almost_equal
from sklearn.utils.testing import assert_raises
@@ -16,8 +18,6 @@
from sklearn.utils.testing import assert_warns
from sklearn.utils.testing import ignore_warnings
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import assert_greater
from sklearn.utils.testing import assert_greater_equal
from sklearn.utils import safe_mask

from sklearn.datasets.samples_generator import (make_classification,
@@ -408,7 +408,9 @@ def test_boundary_case_ch2():
assert_array_equal(support_fwe, np.array([True, False]))


def test_select_fdr_regression():
@pytest.mark.parametrize("alpha", [0.001, 0.01, 0.1])
@pytest.mark.parametrize("n_informative", [1, 5, 10])
def test_select_fdr_regression(alpha, n_informative):
# Test that fdr heuristic actually has low FDR.
def single_fdr(alpha, n_informative, random_state):
X, y = make_regression(n_samples=150, n_features=20,
@@ -434,20 +436,18 @@ def single_fdr(alpha, n_informative, random_state):
(num_true_positives + num_false_positives))
return false_discovery_rate

for alpha in [0.001, 0.01, 0.1]:
for n_informative in [1, 5, 10]:
# As per Benjamini-Hochberg, the expected false discovery rate
# should be lower than alpha:
# FDR = E(FP / (TP + FP)) <= alpha
false_discovery_rate = np.mean([single_fdr(alpha, n_informative,
random_state) for
random_state in range(100)])
assert_greater_equal(alpha, false_discovery_rate)

# Make sure that the empirical false discovery rate increases
# with alpha:
if false_discovery_rate != 0:
assert_greater(false_discovery_rate, alpha / 10)
# As per Benjamini-Hochberg, the expected false discovery rate
# should be lower than alpha:
# FDR = E(FP / (TP + FP)) <= alpha
false_discovery_rate = np.mean([single_fdr(alpha, n_informative,
random_state) for
random_state in range(100)])
assert alpha >= false_discovery_rate

# Make sure that the empirical false discovery rate increases
# with alpha:
if false_discovery_rate != 0:
assert false_discovery_rate > alpha / 10


def test_select_fwe_regression():
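
The conversion of test_select_fdr_regression above, from nested for-loops to stacked pytest.mark.parametrize decorators, is a general pattern. A hedged sketch with an illustrative test body (not the PR's code):

# Sketch of the loop-to-parametrize pattern; the body is illustrative.
import pytest

@pytest.mark.parametrize("alpha", [0.001, 0.01, 0.1])
@pytest.mark.parametrize("n_informative", [1, 5, 10])
def test_sketch(alpha, n_informative):
    # pytest expands stacked decorators into the full 3 x 3 cross
    # product, so each (alpha, n_informative) pair is collected,
    # timed, and reported as its own test case.
    assert 0 < alpha < 1 and n_informative >= 1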
4 changes: 2 additions & 2 deletions sklearn/gaussian_process/tests/test_gpc.py
@@ -113,12 +113,12 @@ def test_random_starts():
@pytest.mark.parametrize('kernel', non_fixed_kernels)
def test_custom_optimizer(kernel):
# Test that GPC can use externally defined optimizers.
# Define a dummy optimizer that simply tests 50 random hyperparameters
# Define a dummy optimizer that simply tests 10 random hyperparameters
def optimizer(obj_func, initial_theta, bounds):
rng = np.random.RandomState(0)
theta_opt, func_min = \
initial_theta, obj_func(initial_theta, eval_gradient=False)
for _ in range(50):
for _ in range(10):
theta = np.atleast_1d(rng.uniform(np.maximum(-2, bounds[:, 0]),
np.minimum(1, bounds[:, 1])))
f = obj_func(theta, eval_gradient=False)
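
For context, the optimizer callable exercised above follows the contract documented for GaussianProcessClassifier: it is called as optimizer(obj_func, initial_theta, bounds) and must return (theta_opt, func_min). A hedged usage sketch, with an illustrative dataset and a deliberately trivial optimizer:

# Hedged sketch of plugging a custom optimizer into GPC.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.gaussian_process import GaussianProcessClassifier

def optimizer(obj_func, initial_theta, bounds):
    # Trivial "optimizer": evaluate the objective once and keep the
    # initial hyperparameters (obj_func returns the negative
    # log-marginal likelihood when eval_gradient=False).
    return initial_theta, obj_func(initial_theta, eval_gradient=False)

X, y = make_classification(n_samples=30, random_state=0)
gpc = GaussianProcessClassifier(optimizer=optimizer, random_state=0)
gpc.fit(X, y)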
12 changes: 6 additions & 6 deletions sklearn/impute/tests/test_impute.py
@@ -796,7 +796,7 @@ def test_iterative_imputer_no_missing():

def test_iterative_imputer_rank_one():
rng = np.random.RandomState(0)
d = 100
d = 50
A = rng.rand(d, 1)
B = rng.rand(1, d)
X = np.dot(A, B)
@@ -808,7 +808,7 @@ def test_iterative_imputer_rank_one():
verbose=1,
random_state=rng)
X_filled = imputer.fit_transform(X_missing)
assert_allclose(X_filled, X, atol=0.01)
assert_allclose(X_filled, X, atol=0.02)


@pytest.mark.parametrize(
@@ -817,8 +817,8 @@ def test_iterative_imputer_rank_one():
)
def test_iterative_imputer_transform_recovery(rank):
rng = np.random.RandomState(0)
n = 100
d = 100
n = 70
d = 70
A = rng.rand(n, rank)
B = rng.rand(rank, d)
X_filled = np.dot(A, B)
@@ -832,7 +832,7 @@ def test_iterative_imputer_transform_recovery(rank):
X_test_filled = X_filled[n:]
X_test = X_missing[n:]

imputer = IterativeImputer(max_iter=10,
imputer = IterativeImputer(max_iter=5,
verbose=1,
random_state=rng).fit(X_train)
X_test_est = imputer.transform(X_test)
@@ -890,7 +890,7 @@ def test_iterative_imputer_early_stopping():
X_missing[nan_mask] = np.nan

imputer = IterativeImputer(max_iter=100,
tol=1e-3,
tol=1e-2,
sample_posterior=False,
verbose=1,
random_state=rng)
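
A note on why the looser atol above is reasonable: a rank-one matrix is highly redundant, so recovery stays easy even at smaller d, and the size reduction mainly buys runtime. A sketch of the underlying idea, with illustrative sizes and masking fraction (not the test's values):

# Sketch: a rank-one matrix is so redundant that IterativeImputer can
# reconstruct masked entries from the rest. Illustrative sizes only.
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

rng = np.random.RandomState(0)
d = 20
X = rng.rand(d, 1) @ rng.rand(1, d)       # rank-one matrix
X_missing = X.copy()
X_missing[rng.rand(d, d) < 0.3] = np.nan  # mask ~30% of entries
X_filled = IterativeImputer(max_iter=10, random_state=0).fit_transform(X_missing)
print(np.abs(X_filled - X).max())         # small reconstruction error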
11 changes: 7 additions & 4 deletions sklearn/inspection/tests/test_partial_dependence.py
@@ -37,12 +37,15 @@


# (X, y), n_targets <-- as expected in the output of partial_dep()
binary_classification_data = (make_classification(random_state=0), 1)
multiclass_classification_data = (make_classification(n_classes=3,
binary_classification_data = (make_classification(n_samples=50,
random_state=0), 1)
multiclass_classification_data = (make_classification(n_samples=50,
n_classes=3,
n_clusters_per_class=1,
random_state=0), 3)
regression_data = (make_regression(random_state=0), 1)
multioutput_regression_data = (make_regression(n_targets=2, random_state=0), 2)
regression_data = (make_regression(n_samples=50, random_state=0), 1)
multioutput_regression_data = (make_regression(n_samples=50, n_targets=2,
random_state=0), 2)


@pytest.mark.parametrize('Estimator, method, data', [
61 changes: 29 additions & 32 deletions sklearn/manifold/tests/test_t_sne.py
@@ -126,7 +126,7 @@ def test_binary_search_neighbors():
# Binary perplexity search approximation.
# Should be approximately equal to the slow method when we use
# all points as neighbors.
n_samples = 500
n_samples = 200
desired_perplexity = 25.0
random_state = check_random_state(0)
distances = random_state.randn(n_samples, 2).astype(np.float32)
@@ -239,21 +239,18 @@ def test_trustworthiness():
assert_almost_equal(trustworthiness(X, X_embedded, n_neighbors=1), 0.2)


def test_preserve_trustworthiness_approximately():
@pytest.mark.parametrize("method", ['exact', 'barnes_hut'])
@pytest.mark.parametrize("init", ('random', 'pca'))
def test_preserve_trustworthiness_approximately(method, init):
# Nearest neighbors should be preserved approximately.
random_state = check_random_state(0)
n_components = 2
methods = ['exact', 'barnes_hut']
X = random_state.randn(50, n_components).astype(np.float32)
for init in ('random', 'pca'):
for method in methods:
tsne = TSNE(n_components=n_components, init=init, random_state=0,
method=method)
X_embedded = tsne.fit_transform(X)
t = trustworthiness(X, X_embedded, n_neighbors=1)
assert_greater(t, 0.85, msg='Trustworthiness={:0.3f} < 0.85 '
'for method={} and '
'init={}'.format(t, method, init))
tsne = TSNE(n_components=n_components, init=init, random_state=0,
method=method, n_iter=700)
X_embedded = tsne.fit_transform(X)
t = trustworthiness(X, X_embedded, n_neighbors=1)
assert t > 0.85


def test_optimization_minimizes_kl_divergence():
@@ -273,11 +270,11 @@ def test_fit_csr_matrix():
def test_fit_csr_matrix():
# X can be a sparse matrix.
random_state = check_random_state(0)
X = random_state.randn(100, 2)
X[(np.random.randint(0, 100, 50), np.random.randint(0, 2, 50))] = 0.0
X = random_state.randn(50, 2)
X[(np.random.randint(0, 50, 25), np.random.randint(0, 2, 25))] = 0.0
X_csr = sp.csr_matrix(X)
tsne = TSNE(n_components=2, perplexity=10, learning_rate=100.0,
random_state=0, method='exact')
random_state=0, method='exact', n_iter=500)
X_embedded = tsne.fit_transform(X_csr)
assert_almost_equal(trustworthiness(X_csr, X_embedded, n_neighbors=1), 1.0,
decimal=1)
@@ -287,11 +284,11 @@ def test_preserve_trustworthiness_approximately_with_precomputed_distances():
# Nearest neighbors should be preserved approximately.
random_state = check_random_state(0)
for i in range(3):
X = random_state.randn(100, 2)
X = random_state.randn(80, 2)
D = squareform(pdist(X), "sqeuclidean")
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
early_exaggeration=2.0, metric="precomputed",
random_state=i, verbose=0)
random_state=i, verbose=0, n_iter=500)
X_embedded = tsne.fit_transform(D)
t = trustworthiness(D, X_embedded, n_neighbors=1, metric="precomputed")
assert t > .95
@@ -420,11 +417,11 @@ def test_early_exaggeration_used():
for method in methods:
tsne = TSNE(n_components=n_components, perplexity=1,
learning_rate=100.0, init="pca", random_state=0,
method=method, early_exaggeration=1.0)
method=method, early_exaggeration=1.0, n_iter=250)
X_embedded1 = tsne.fit_transform(X)
tsne = TSNE(n_components=n_components, perplexity=1,
learning_rate=100.0, init="pca", random_state=0,
method=method, early_exaggeration=10.0)
method=method, early_exaggeration=10.0, n_iter=250)
X_embedded2 = tsne.fit_transform(X)

assert not np.allclose(X_embedded1, X_embedded2)
@@ -586,9 +583,10 @@ def test_64bit(method, dt):
# Ensure 64bit arrays are handled correctly.
random_state = check_random_state(0)

X = random_state.randn(50, 2).astype(dt, copy=False)
X = random_state.randn(10, 2).astype(dt, copy=False)
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
random_state=0, method=method, verbose=0)
random_state=0, method=method, verbose=0,
n_iter=300)
X_embedded = tsne.fit_transform(X)
effective_type = X_embedded.dtype

@@ -605,7 +603,7 @@ def test_kl_divergence_not_nan(method):

X = random_state.randn(50, 2)
tsne = TSNE(n_components=2, perplexity=2, learning_rate=100.0,
random_state=0, method=method, verbose=0, n_iter=1003)
random_state=0, method=method, verbose=0, n_iter=503)
Member Author:
We just want to be sure that tsne.kl_divergence_ is computed when n_iter % n_iter_check != 0; cf. the comment in the code above. (A short sketch of this follows the file's diff.)

tsne.fit_transform(X)

assert not np.isnan(tsne.kl_divergence_)
@@ -722,9 +720,10 @@ def test_min_grad_norm():
def test_accessible_kl_divergence():
# Ensures that the accessible kl_divergence matches the computed value
random_state = check_random_state(0)
X = random_state.randn(100, 2)
X = random_state.randn(50, 2)
tsne = TSNE(n_iter_without_progress=2, verbose=2,
random_state=0, method='exact')
random_state=0, method='exact',
n_iter=500)

old_stdout = sys.stdout
sys.stdout = StringIO()
@@ -746,7 +745,8 @@ def test_accessible_kl_divergence():
assert_almost_equal(tsne.kl_divergence_, float(error), decimal=5)


def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000):
@pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
def test_uniform_grid(method):
"""Make sure that TSNE can approximately recover a uniform 2D grid

Due to ties in distances between point in X_2d_grid, this test is platform
@@ -758,6 +758,8 @@ def check_uniform_grid(method, seeds=[0, 1, 2], n_iter=1000):
we re-run t-SNE from the final point when the convergence is not good
enough.
"""
seeds = [0, 1, 2]
n_iter = 500
for seed in seeds:
tsne = TSNE(n_components=2, init='random', random_state=seed,
perplexity=20, n_iter=n_iter, method=method)
@@ -791,11 +793,6 @@ def assert_uniform_grid(Y, try_name=None):
assert_less(largest_to_mean, 2, msg=try_name)


@pytest.mark.parametrize('method', ['barnes_hut', 'exact'])
def test_uniform_grid(method):
check_uniform_grid(method)


def test_bh_match_exact():
# check that the ``barnes_hut`` method match the exact one when
# ``angle = 0`` and ``perplexity > n_samples / 3``
@@ -829,8 +826,8 @@ def test_tsne_with_different_distance_metrics():
for metric, dist_func in zip(metrics, dist_funcs):
X_transformed_tsne = TSNE(
metric=metric, n_components=n_components_embedding,
random_state=0).fit_transform(X)
random_state=0, n_iter=300).fit_transform(X)
X_transformed_tsne_precomputed = TSNE(
metric='precomputed', n_components=n_components_embedding,
random_state=0).fit_transform(dist_func(X))
random_state=0, n_iter=300).fit_transform(dist_func(X))
assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)
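
On the reviewer comment above about n_iter=503: TSNE's gradient descent only re-evaluates and stores the error every n_iter_check iterations, assumed here to be 50 per the code comment the author cites, so an n_iter that is not a multiple of that interval forces kl_divergence_ to be filled in after the final iteration. A minimal illustration under that assumption:

# Assuming n_iter_check = 50 (the interval the review comment above
# refers to): 503 is deliberately not a multiple of it, so the last
# gradient-descent iteration is not an error-check iteration and
# kl_divergence_ must still be computed at the end of fitting.
n_iter, n_iter_check = 503, 50
assert n_iter % n_iter_check != 0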
2 changes: 1 addition & 1 deletion sklearn/metrics/tests/test_classification.py
@@ -678,7 +678,7 @@ def test_matthews_corrcoef_multiclass():
assert_almost_equal(mcc, 0.)


@pytest.mark.parametrize('n_points', [100, 10000, 1000000])
Member Author:
The last case was taking 6s on my laptop, which is too long for a single test, and it doesn't seem to add much coverage compared to the other two.

@pytest.mark.parametrize('n_points', [100, 10000])
def test_matthews_corrcoef_overflow(n_points):
# https://github.com/scikit-learn/scikit-learn/issues/9622
rng = np.random.RandomState(20170906)
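
For context, the property checked above (see scikit-learn issue #9622) is that matthews_corrcoef stays near zero for random labels without overflowing integer intermediates as n_points grows. A hedged sketch at a small, illustrative size:

# Sketch of the property being tested: random labels give MCC ~ 0,
# and the computation must not overflow for large sample counts.
# The size here is small and illustrative.
import numpy as np
from sklearn.metrics import matthews_corrcoef

rng = np.random.RandomState(20170906)
y_true = rng.randint(0, 2, size=10000)
y_pred = rng.randint(0, 2, size=10000)
print(matthews_corrcoef(y_true, y_pred))  # close to 0.0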
8 changes: 4 additions & 4 deletions sklearn/metrics/tests/test_common.py
@@ -1197,11 +1197,11 @@ def test_multiclass_sample_weight_invariance(name):
def test_multilabel_sample_weight_invariance(name):
# multilabel indicator
random_state = check_random_state(0)
_, ya = make_multilabel_classification(n_features=1, n_classes=20,
random_state=0, n_samples=100,
_, ya = make_multilabel_classification(n_features=1, n_classes=10,
random_state=0, n_samples=50,
allow_unlabeled=False)
_, yb = make_multilabel_classification(n_features=1, n_classes=20,
random_state=1, n_samples=100,
_, yb = make_multilabel_classification(n_features=1, n_classes=10,
random_state=1, n_samples=50,
allow_unlabeled=False)
y_true = np.vstack([ya, yb])
y_pred = np.vstack([ya, ya])
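
The invariance exercised above, unchanged by the smaller dataset, is that repeating a sample k times must score the same as giving it sample_weight=k. A minimal sketch with accuracy_score standing in for the many metrics the test sweeps:

# Sketch of sample-weight invariance: repeating a sample k times
# should score the same as weighting it by k.
import numpy as np
from sklearn.metrics import accuracy_score

y_true = np.array([0, 1, 1])
y_pred = np.array([0, 1, 0])
repeated = accuracy_score(np.repeat(y_true, [1, 1, 2]),
                          np.repeat(y_pred, [1, 1, 2]))
weighted = accuracy_score(y_true, y_pred, sample_weight=[1, 1, 2])
assert repeated == weighted  # both 0.5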
10 changes: 5 additions & 5 deletions sklearn/metrics/tests/test_pairwise.py
@@ -558,7 +558,7 @@ def test_pairwise_distances_chunked():
# Test the pairwise_distance helper function.
rng = np.random.RandomState(0)
# Euclidean distance should be equivalent to calling the function.
X = rng.random_sample((400, 4))
X = rng.random_sample((200, 4))
check_pairwise_distances_chunked(X, None, working_memory=1,
metric='euclidean')
# Test small amounts of memory
@@ -569,7 +569,7 @@ def test_pairwise_distances_chunked():
check_pairwise_distances_chunked(X.tolist(), None, working_memory=1,
metric='euclidean')
# Euclidean distance, with Y != X.
Y = rng.random_sample((200, 4))
Y = rng.random_sample((100, 4))
check_pairwise_distances_chunked(X, Y, working_memory=1,
metric='euclidean')
check_pairwise_distances_chunked(X.tolist(), Y.tolist(), working_memory=1,
@@ -1103,9 +1103,9 @@ def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function,
y_is_x):
# check that pairwise_distances give the same result in sequential and
# parallel, when metric has data-derived parameters.
with config_context(working_memory=1): # to have more than 1 chunk
with config_context(working_memory=0.1): # to have more than 1 chunk
rng = np.random.RandomState(0)
X = rng.random_sample((1000, 10))
X = rng.random_sample((100, 10))

if y_is_x:
Y = X
@@ -1115,7 +1115,7 @@ def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function,
else:
params = {'VI': np.linalg.inv(np.cov(X.T)).T}
else:
Y = rng.random_sample((1000, 10))
Y = rng.random_sample((100, 10))
expected_dist_default_params = cdist(X, Y, metric=metric)
if metric == "seuclidean":
params = {'V': np.var(np.vstack([X, Y]), axis=0, ddof=1)}
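
On the working_memory change above: pairwise_distances_chunked sizes each yielded block of the distance matrix to fit the working-memory budget (in MiB), so shrinking the budget keeps the test covering the multi-chunk path even with the smaller X. A hedged sketch with an illustrative budget and shapes:

# Sketch: pairwise_distances_chunked yields the distance matrix in row
# blocks sized to the working_memory budget (MiB), so a tiny budget
# forces multiple chunks even for a small X.
import numpy as np
from sklearn import config_context
from sklearn.metrics import pairwise_distances_chunked

X = np.random.RandomState(0).random_sample((100, 10))
with config_context(working_memory=0.01):  # ~10 kB budget
    n_chunks = sum(1 for _ in pairwise_distances_chunked(X))
print(n_chunks)  # several chunks rather than one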