[MRG] Download and test datasets in cron job #16348

Merged · 20 commits · Mar 2, 2020
7 changes: 3 additions & 4 deletions build_tools/travis/test_script.sh
@@ -30,10 +30,9 @@ run_tests() {
cp setup.cfg $TEST_DIR
cd $TEST_DIR

# Skip tests that require large downloads over the network to save bandwidth
# usage as travis workers are stateless and therefore traditional local
# disk caching does not work.
export SKLEARN_SKIP_NETWORK_TESTS=1
# Tests that require large downloads over the network are skipped in CI.
# Here we make sure that they are still run on a regular basis.
export SKLEARN_SKIP_NETWORK_TESTS=0

if [[ "$COVERAGE" == "true" ]]; then
TEST_CMD="$TEST_CMD --cov sklearn"
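For context, a minimal, hypothetical sketch of how an environment flag like this typically gates a single test at runtime; the test name is made up, and this PR instead routes the flag through conftest fixtures (next file) rather than skipif markers:

import os
import pytest

# '1' (the default) means "skip network tests"; the cron build exports 0.
_skip_network = os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '1'

@pytest.mark.skipif(_skip_network,
                    reason='set SKLEARN_SKIP_NETWORK_TESTS=0 to run')
def test_requires_download():
    ...  # would fetch data over the network here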
61 changes: 61 additions & 0 deletions sklearn/datasets/tests/conftest.py
@@ -0,0 +1,61 @@
""" Network tests are only run, if data is already locally available,
or if download is specifically requested by environment variable."""
from os import environ
import pytest
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import fetch_covtype
from sklearn.datasets import fetch_kddcup99
from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_rcv1


def _wrapped_fetch(f, dataset_name):
""" Fetch dataset (download if missing and requested by environment) """
download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'

def wrapped(*args, **kwargs):
kwargs['download_if_missing'] = download_if_missing
try:
return f(*args, **kwargs)
except IOError:
pytest.skip("Download {} to run this test".format(dataset_name))
return wrapped


@pytest.fixture
def fetch_20newsgroups_fxt():
return _wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups')


@pytest.fixture
def fetch_20newsgroups_vectorized_fxt():
return _wrapped_fetch(fetch_20newsgroups_vectorized,
dataset_name='20newsgroups_vectorized')


@pytest.fixture
def fetch_california_housing_fxt():
return _wrapped_fetch(fetch_california_housing,
dataset_name='california_housing')


@pytest.fixture
def fetch_covtype_fxt():
return _wrapped_fetch(fetch_covtype, dataset_name='covtype')


@pytest.fixture
def fetch_kddcup99_fxt():
return _wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99')


@pytest.fixture
def fetch_olivetti_faces_fxt():
return _wrapped_fetch(fetch_olivetti_faces, dataset_name='olivetti_faces')


@pytest.fixture
def fetch_rcv1_fxt():
return _wrapped_fetch(fetch_rcv1, dataset_name='rcv1')
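A test consumes one of these fixtures simply by naming it as an argument. A condensed usage sketch, with the expected shape taken from the california_housing test further down:

def test_shape(fetch_california_housing_fxt):
    # Cached locally: loads from disk. Not cached and
    # SKLEARN_SKIP_NETWORK_TESTS=0: downloads. Otherwise the IOError
    # raised under download_if_missing=False is converted into a skip.
    data = fetch_california_housing_fxt()
    assert data.data.shape == (20640, 8)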
59 changes: 18 additions & 41 deletions sklearn/datasets/tests/test_20news.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,21 @@
"""Test the 20news downloader, if the data is available."""
"""Test the 20news downloader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""
from functools import partial

import numpy as np
import scipy.sparse as sp

from sklearn.utils._testing import SkipTest, assert_allclose_dense_sparse
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.datasets.tests.test_common import check_return_X_y

from sklearn import datasets
from sklearn.preprocessing import normalize


def test_20news():
try:
data = datasets.fetch_20newsgroups(
subset='all', download_if_missing=False, shuffle=False)
except IOError:
raise SkipTest("Download 20 newsgroups to run this test")
def test_20news(fetch_20newsgroups_fxt):
data = fetch_20newsgroups_fxt(subset='all', shuffle=False)

# Extract a reduced dataset
data2cats = datasets.fetch_20newsgroups(
data2cats = fetch_20newsgroups_fxt(
subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
# Check that the ordering of the target_names is the same
# as the ordering in the full dataset
@@ -40,72 +36,53 @@ def test_20news():
assert entry1 == entry2

# check that return_X_y option
X, y = datasets.fetch_20newsgroups(
subset='all', shuffle=False, return_X_y=True
)
X, y = fetch_20newsgroups_fxt(subset='all', shuffle=False, return_X_y=True)
assert len(X) == len(data.data)
assert y.shape == data.target.shape


def test_20news_length_consistency():
def test_20news_length_consistency(fetch_20newsgroups_fxt):
"""Checks the length consistencies within the bunch

This is a non-regression test for a bug present in 0.16.1.
"""
try:
data = datasets.fetch_20newsgroups(
subset='all', download_if_missing=False, shuffle=False)
except IOError:
raise SkipTest("Download 20 newsgroups to run this test")
# Extract the full dataset
data = datasets.fetch_20newsgroups(subset='all')
data = fetch_20newsgroups_fxt(subset='all')
assert len(data['data']) == len(data.data)
assert len(data['target']) == len(data.target)
assert len(data['filenames']) == len(data.filenames)


def test_20news_vectorized():
try:
datasets.fetch_20newsgroups(subset='all',
download_if_missing=False)
except IOError:
raise SkipTest("Download 20 newsgroups to run this test")

def test_20news_vectorized(fetch_20newsgroups_vectorized_fxt):
# test subset = train
bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
bunch = fetch_20newsgroups_vectorized_fxt(subset="train")
assert sp.isspmatrix_csr(bunch.data)
assert bunch.data.shape == (11314, 130107)
assert bunch.target.shape[0] == 11314
assert bunch.data.dtype == np.float64

# test subset = test
bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
bunch = fetch_20newsgroups_vectorized_fxt(subset="test")
assert sp.isspmatrix_csr(bunch.data)
assert bunch.data.shape == (7532, 130107)
assert bunch.target.shape[0] == 7532
assert bunch.data.dtype == np.float64

# test return_X_y option
fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
fetch_func = partial(fetch_20newsgroups_vectorized_fxt, subset='test')
check_return_X_y(bunch, fetch_func)

# test subset = all
bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
bunch = fetch_20newsgroups_vectorized_fxt(subset='all')
assert sp.isspmatrix_csr(bunch.data)
assert bunch.data.shape == (11314 + 7532, 130107)
assert bunch.target.shape[0] == 11314 + 7532
assert bunch.data.dtype == np.float64


def test_20news_normalization():
try:
X = datasets.fetch_20newsgroups_vectorized(normalize=False,
download_if_missing=False)
X_ = datasets.fetch_20newsgroups_vectorized(normalize=True,
download_if_missing=False)
except IOError:
raise SkipTest("Download 20 newsgroups to run this test")

def test_20news_normalization(fetch_20newsgroups_vectorized_fxt):
X = fetch_20newsgroups_vectorized_fxt(normalize=False)
X_ = fetch_20newsgroups_vectorized_fxt(normalize=True)
X_norm = X_['data'][:100]
X = X['data'][:100]

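Several of these tests delegate to check_return_X_y from sklearn/datasets/tests/test_common.py, whose body is not part of this diff. A plausible sketch of what such a helper verifies (an assumption, not the actual implementation):

def check_return_X_y(bunch, fetch_func_partial):
    # Fetch again with return_X_y=True and compare with the Bunch result.
    X, y = fetch_func_partial(return_X_y=True)
    assert X.shape == bunch.data.shape
    assert y.shape == bunch.target.shape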
47 changes: 10 additions & 37 deletions sklearn/datasets/tests/test_california_housing.py
@@ -1,60 +1,33 @@
"""Test the california_housing loader.

Skipped if california_housing is not already downloaded to data_home.
"""

"""Test the california_housing loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""
import pytest

from sklearn.datasets import fetch_california_housing
from sklearn.datasets.tests.test_common import check_return_X_y
from functools import partial


def fetch(*args, **kwargs):
return fetch_california_housing(*args, download_if_missing=False, **kwargs)


def _is_california_housing_dataset_not_available():
try:
fetch_california_housing(download_if_missing=False)
return False
except IOError:
return True


@pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_fetch():
data = fetch()
def test_fetch(fetch_california_housing_fxt):
data = fetch_california_housing_fxt()
assert((20640, 8) == data.data.shape)
assert((20640, ) == data.target.shape)

# test return_X_y option
fetch_func = partial(fetch)
fetch_func = partial(fetch_california_housing_fxt)
check_return_X_y(data, fetch_func)


@pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_fetch_asframe():
def test_fetch_asframe(fetch_california_housing_fxt):
pd = pytest.importorskip('pandas')
bunch = fetch(as_frame=True)
bunch = fetch_california_housing_fxt(as_frame=True)
frame = bunch.frame
assert hasattr(bunch, 'frame') is True
assert frame.shape == (20640, 9)
assert isinstance(bunch.data, pd.DataFrame)
assert isinstance(bunch.target, pd.Series)


@pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_pandas_dependency_message():
def test_pandas_dependency_message(fetch_california_housing_fxt):
try:
import pandas # noqa
pytest.skip("This test requires pandas to be not installed")
@@ -64,4 +37,4 @@ def test_pandas_dependency_message():
expected_msg = ('fetch_california_housing with as_frame=True'
' requires pandas')
with pytest.raises(ImportError, match=expected_msg):
fetch_california_housing(as_frame=True)
fetch_california_housing_fxt(as_frame=True)
25 changes: 7 additions & 18 deletions sklearn/datasets/tests/test_covtype.py
@@ -1,25 +1,14 @@
"""Test the covtype loader.
"""Test the covtype loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""

Skipped if covtype is not already downloaded to data_home.
"""

from sklearn.datasets import fetch_covtype
from sklearn.utils._testing import SkipTest
from sklearn.datasets.tests.test_common import check_return_X_y
from functools import partial


def fetch(*args, **kwargs):
return fetch_covtype(*args, download_if_missing=False, **kwargs)


def test_fetch():
try:
data1 = fetch(shuffle=True, random_state=42)
except IOError:
raise SkipTest("Covertype dataset can not be loaded.")

data2 = fetch(shuffle=True, random_state=37)
def test_fetch(fetch_covtype_fxt):
data1 = fetch_covtype_fxt(shuffle=True, random_state=42)
data2 = fetch_covtype_fxt(shuffle=True, random_state=37)

X1, X2 = data1['data'], data2['data']
assert (581012, 54) == X1.shape
@@ -32,5 +21,5 @@ def test_fetch():
assert (X1.shape[0],) == y2.shape

# test return_X_y option
fetch_func = partial(fetch)
fetch_func = partial(fetch_covtype_fxt)
check_return_X_y(data1, fetch_func)
41 changes: 16 additions & 25 deletions sklearn/datasets/tests/test_kddcup99.py
@@ -1,55 +1,46 @@
"""Test kddcup99 loader. Only 'percent10' mode is tested, as the full data
is too big to use in unit-testing.
"""Test kddcup99 loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job).

The test is skipped if the data wasn't previously fetched and saved to
scikit-learn data folder.
Only 'percent10' mode is tested, as the full data
is too big to use in unit-testing.
"""

from sklearn.datasets import fetch_kddcup99
from sklearn.datasets.tests.test_common import check_return_X_y
from sklearn.utils._testing import SkipTest
from functools import partial



def test_percent10():
try:
data = fetch_kddcup99(download_if_missing=False)
except IOError:
raise SkipTest("kddcup99 dataset can not be loaded.")
def test_percent10(fetch_kddcup99_fxt):
data = fetch_kddcup99_fxt()

assert data.data.shape == (494021, 41)
assert data.target.shape == (494021,)

data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
data_shuffled = fetch_kddcup99_fxt(shuffle=True, random_state=0)
assert data.data.shape == data_shuffled.data.shape
assert data.target.shape == data_shuffled.target.shape

data = fetch_kddcup99('SA')
data = fetch_kddcup99_fxt('SA')
assert data.data.shape == (100655, 41)
assert data.target.shape == (100655,)

data = fetch_kddcup99('SF')
data = fetch_kddcup99_fxt('SF')
assert data.data.shape == (73237, 4)
assert data.target.shape == (73237,)

data = fetch_kddcup99('http')
data = fetch_kddcup99_fxt('http')
assert data.data.shape == (58725, 3)
assert data.target.shape == (58725,)

data = fetch_kddcup99('smtp')
data = fetch_kddcup99_fxt('smtp')
assert data.data.shape == (9571, 3)
assert data.target.shape == (9571,)

fetch_func = partial(fetch_kddcup99, 'smtp')
fetch_func = partial(fetch_kddcup99_fxt, 'smtp')
check_return_X_y(data, fetch_func)


def test_shuffle():
try:
dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True,
percent10=True, download_if_missing=False)
except IOError:
raise SkipTest("kddcup99 dataset can not be loaded.")

def test_shuffle(fetch_kddcup99_fxt):
dataset = fetch_kddcup99_fxt(random_state=0, subset='SA', shuffle=True,
percent10=True)
assert(any(dataset.target[-100:] == b'normal.'))