[MRG] Download and test datasets in cron job #16348

Merged · 20 commits · Mar 2, 2020
7 changes: 3 additions & 4 deletions build_tools/travis/test_script.sh
@@ -30,10 +30,9 @@ run_tests() {
cp setup.cfg $TEST_DIR
cd $TEST_DIR

# Skip tests that require large downloads over the network to save bandwidth
# usage as travis workers are stateless and therefore traditional local
# disk caching does not work.
export SKLEARN_SKIP_NETWORK_TESTS=1
# Tests that require large downloads over the network are skipped in CI.
# Here we make sure that they are still run on a regular basis.
export SKLEARN_SKIP_NETWORK_TESTS=0

if [[ "$COVERAGE" == "true" ]]; then
TEST_CMD="$TEST_CMD --cov sklearn"
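For context, a minimal, hypothetical sketch of how an environment flag like this typically gates a single test at runtime; the test name is made up, and this PR instead routes the flag through conftest fixtures (next file) rather than skipif markers:

import os
import pytest

# '1' (the default) means "skip network tests"; the cron build exports 0.
_skip_network = os.environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '1'

@pytest.mark.skipif(_skip_network,
                    reason='set SKLEARN_SKIP_NETWORK_TESTS=0 to run')
def test_requires_download():
    ...  # would fetch data over the network here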
61 changes: 61 additions & 0 deletions sklearn/datasets/tests/conftest.py
@@ -0,0 +1,61 @@
""" Network tests are only run, if data is already locally available,
or if download is specifically requested by environment variable."""
from os import environ
import pytest
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import fetch_covtype
from sklearn.datasets import fetch_kddcup99
from sklearn.datasets import fetch_olivetti_faces
from sklearn.datasets import fetch_rcv1


def _wrapped_fetch(f, dataset_name):
""" Fetch dataset (download if missing and requested by environment) """
download_if_missing = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'

def wrapped(*args, **kwargs):
kwargs['download_if_missing'] = download_if_missing
try:
return f(*args, **kwargs)
except IOError:
pytest.skip("Download {} to run this test".format(dataset_name))
return wrapped


@pytest.fixture
def fetch_20newsgroups_fxt():
return _wrapped_fetch(fetch_20newsgroups, dataset_name='20newsgroups')


@pytest.fixture
def fetch_20newsgroups_vectorized_fxt():
return _wrapped_fetch(fetch_20newsgroups_vectorized,
dataset_name='20newsgroups_vectorized')


@pytest.fixture
def fetch_california_housing_fxt():
return _wrapped_fetch(fetch_california_housing,
dataset_name='california_housing')


@pytest.fixture
def fetch_covtype_fxt():
return _wrapped_fetch(fetch_covtype, dataset_name='covtype')


@pytest.fixture
def fetch_kddcup99_fxt():
return _wrapped_fetch(fetch_kddcup99, dataset_name='kddcup99')


@pytest.fixture
def fetch_olivetti_faces_fxt():
return _wrapped_fetch(fetch_olivetti_faces, dataset_name='olivetti_faces')


@pytest.fixture
def fetch_rcv1_fxt():
return _wrapped_fetch(fetch_rcv1, dataset_name='rcv1')
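A test consumes one of these fixtures simply by naming it as an argument. A condensed usage sketch, with the expected shape taken from the california_housing test further down:

def test_shape(fetch_california_housing_fxt):
    # Cached locally: loads from disk. Not cached and
    # SKLEARN_SKIP_NETWORK_TESTS=0: downloads. Otherwise the IOError
    # raised under download_if_missing=False is converted into a skip.
    data = fetch_california_housing_fxt()
    assert data.data.shape == (20640, 8)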
59 changes: 18 additions & 41 deletions sklearn/datasets/tests/test_20news.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,21 @@
"""Test the 20news downloader, if the data is available."""
"""Test the 20news downloader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""
from functools import partial

import numpy as np
import scipy.sparse as sp

from sklearn.utils._testing import SkipTest, assert_allclose_dense_sparse
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn.datasets.tests.test_common import check_return_X_y

from sklearn import datasets
from sklearn.preprocessing import normalize


def test_20news():
try:
data = datasets.fetch_20newsgroups(
subset='all', download_if_missing=False, shuffle=False)
except IOError:
raise SkipTest("Download 20 newsgroups to run this test")
def test_20news(fetch_20newsgroups_fxt):
data = fetch_20newsgroups_fxt(subset='all', shuffle=False)

# Extract a reduced dataset
data2cats = datasets.fetch_20newsgroups(
data2cats = fetch_20newsgroups_fxt(
subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
# Check that the ordering of the target_names is the same
# as the ordering in the full dataset
@@ -40,72 +36,53 @@ def test_20news():
assert entry1 == entry2

# check that return_X_y option
X, y = datasets.fetch_20newsgroups(
subset='all', shuffle=False, return_X_y=True
)
X, y = fetch_20newsgroups_fxt(subset='all', shuffle=False, return_X_y=True)
assert len(X) == len(data.data)
assert y.shape == data.target.shape


def test_20news_length_consistency():
def test_20news_length_consistency(fetch_20newsgroups_fxt):
"""Checks the length consistencies within the bunch

This is a non-regression test for a bug present in 0.16.1.
"""
try:
data = datasets.fetch_20newsgroups(
subset='all', download_if_missing=False, shuffle=False)
except IOError:
raise SkipTest("Download 20 newsgroups to run this test")
# Extract the full dataset
data = datasets.fetch_20newsgroups(subset='all')
data = fetch_20newsgroups_fxt(subset='all')
assert len(data['data']) == len(data.data)
assert len(data['target']) == len(data.target)
assert len(data['filenames']) == len(data.filenames)


def test_20news_vectorized():
try:
datasets.fetch_20newsgroups(subset='all',
download_if_missing=False)
except IOError:
raise SkipTest("Download 20 newsgroups to run this test")

def test_20news_vectorized(fetch_20newsgroups_vectorized_fxt):
# test subset = train
bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
bunch = fetch_20newsgroups_vectorized_fxt(subset="train")
assert sp.isspmatrix_csr(bunch.data)
assert bunch.data.shape == (11314, 130107)
assert bunch.target.shape[0] == 11314
assert bunch.data.dtype == np.float64

# test subset = test
bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
bunch = fetch_20newsgroups_vectorized_fxt(subset="test")
assert sp.isspmatrix_csr(bunch.data)
assert bunch.data.shape == (7532, 130107)
assert bunch.target.shape[0] == 7532
assert bunch.data.dtype == np.float64

# test return_X_y option
fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
fetch_func = partial(fetch_20newsgroups_vectorized_fxt, subset='test')
check_return_X_y(bunch, fetch_func)

# test subset = all
bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
bunch = fetch_20newsgroups_vectorized_fxt(subset='all')
assert sp.isspmatrix_csr(bunch.data)
assert bunch.data.shape == (11314 + 7532, 130107)
assert bunch.target.shape[0] == 11314 + 7532
assert bunch.data.dtype == np.float64


def test_20news_normalization():
try:
X = datasets.fetch_20newsgroups_vectorized(normalize=False,
download_if_missing=False)
X_ = datasets.fetch_20newsgroups_vectorized(normalize=True,
download_if_missing=False)
except IOError:
raise SkipTest("Download 20 newsgroups to run this test")

def test_20news_normalization(fetch_20newsgroups_vectorized_fxt):
X = fetch_20newsgroups_vectorized_fxt(normalize=False)
X_ = fetch_20newsgroups_vectorized_fxt(normalize=True)
X_norm = X_['data'][:100]
X = X['data'][:100]

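Several of these tests delegate to check_return_X_y from sklearn/datasets/tests/test_common.py, whose body is not part of this diff. A plausible sketch of what such a helper verifies (an assumption, not the actual implementation):

def check_return_X_y(bunch, fetch_func_partial):
    # Fetch again with return_X_y=True and compare with the Bunch result.
    X, y = fetch_func_partial(return_X_y=True)
    assert X.shape == bunch.data.shape
    assert y.shape == bunch.target.shape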
47 changes: 10 additions & 37 deletions sklearn/datasets/tests/test_california_housing.py
@@ -1,60 +1,33 @@
"""Test the california_housing loader.

Skipped if california_housing is not already downloaded to data_home.
"""

"""Test the california_housing loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""
import pytest

from sklearn.datasets import fetch_california_housing
from sklearn.datasets.tests.test_common import check_return_X_y
from functools import partial


def fetch(*args, **kwargs):
return fetch_california_housing(*args, download_if_missing=False, **kwargs)


def _is_california_housing_dataset_not_available():
try:
fetch_california_housing(download_if_missing=False)
return False
except IOError:
return True


@pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_fetch():
data = fetch()
def test_fetch(fetch_california_housing_fxt):
data = fetch_california_housing_fxt()
assert((20640, 8) == data.data.shape)
assert((20640, ) == data.target.shape)

# test return_X_y option
fetch_func = partial(fetch)
fetch_func = partial(fetch_california_housing_fxt)
check_return_X_y(data, fetch_func)


@pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_fetch_asframe():
def test_fetch_asframe(fetch_california_housing_fxt):
pd = pytest.importorskip('pandas')
bunch = fetch(as_frame=True)
bunch = fetch_california_housing_fxt(as_frame=True)
frame = bunch.frame
assert hasattr(bunch, 'frame') is True
assert frame.shape == (20640, 9)
assert isinstance(bunch.data, pd.DataFrame)
assert isinstance(bunch.target, pd.Series)


@pytest.mark.skipif(
_is_california_housing_dataset_not_available(),
reason='Download California Housing dataset to run this test'
)
def test_pandas_dependency_message():
def test_pandas_dependency_message(fetch_california_housing_fxt):
try:
import pandas # noqa
pytest.skip("This test requires pandas to be not installed")
@@ -64,4 +37,4 @@ def test_pandas_dependency_message():
expected_msg = ('fetch_california_housing with as_frame=True'
' requires pandas')
with pytest.raises(ImportError, match=expected_msg):
fetch_california_housing(as_frame=True)
fetch_california_housing_fxt(as_frame=True)
25 changes: 7 additions & 18 deletions sklearn/datasets/tests/test_covtype.py
@@ -1,25 +1,14 @@
"""Test the covtype loader.
"""Test the covtype loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job)."""

Skipped if covtype is not already downloaded to data_home.
"""

from sklearn.datasets import fetch_covtype
from sklearn.utils._testing import SkipTest
from sklearn.datasets.tests.test_common import check_return_X_y
from functools import partial


def fetch(*args, **kwargs):
return fetch_covtype(*args, download_if_missing=False, **kwargs)


def test_fetch():
try:
data1 = fetch(shuffle=True, random_state=42)
except IOError:
raise SkipTest("Covertype dataset can not be loaded.")

data2 = fetch(shuffle=True, random_state=37)
def test_fetch(fetch_covtype_fxt):
data1 = fetch_covtype_fxt(shuffle=True, random_state=42)
data2 = fetch_covtype_fxt(shuffle=True, random_state=37)

X1, X2 = data1['data'], data2['data']
assert (581012, 54) == X1.shape
@@ -32,5 +21,5 @@ def test_fetch():
assert (X1.shape[0],) == y2.shape

# test return_X_y option
fetch_func = partial(fetch)
fetch_func = partial(fetch_covtype_fxt)
check_return_X_y(data1, fetch_func)
41 changes: 16 additions & 25 deletions sklearn/datasets/tests/test_kddcup99.py
@@ -1,55 +1,46 @@
"""Test kddcup99 loader. Only 'percent10' mode is tested, as the full data
is too big to use in unit-testing.
"""Test kddcup99 loader, if the data is available,
or if specifically requested via environment variable
(e.g. for travis cron job).

The test is skipped if the data wasn't previously fetched and saved to
scikit-learn data folder.
Only 'percent10' mode is tested, as the full data
is too big to use in unit-testing.
"""

from sklearn.datasets import fetch_kddcup99
from sklearn.datasets.tests.test_common import check_return_X_y
from sklearn.utils._testing import SkipTest
from functools import partial



def test_percent10():
try:
data = fetch_kddcup99(download_if_missing=False)
except IOError:
raise SkipTest("kddcup99 dataset can not be loaded.")
def test_percent10(fetch_kddcup99_fxt):
data = fetch_kddcup99_fxt()

assert data.data.shape == (494021, 41)
assert data.target.shape == (494021,)

data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
data_shuffled = fetch_kddcup99_fxt(shuffle=True, random_state=0)
assert data.data.shape == data_shuffled.data.shape
assert data.target.shape == data_shuffled.target.shape

data = fetch_kddcup99('SA')
data = fetch_kddcup99_fxt('SA')
assert data.data.shape == (100655, 41)
assert data.target.shape == (100655,)

data = fetch_kddcup99('SF')
data = fetch_kddcup99_fxt('SF')
assert data.data.shape == (73237, 4)
assert data.target.shape == (73237,)

data = fetch_kddcup99('http')
data = fetch_kddcup99_fxt('http')
assert data.data.shape == (58725, 3)
assert data.target.shape == (58725,)

data = fetch_kddcup99('smtp')
data = fetch_kddcup99_fxt('smtp')
assert data.data.shape == (9571, 3)
assert data.target.shape == (9571,)

fetch_func = partial(fetch_kddcup99, 'smtp')
fetch_func = partial(fetch_kddcup99_fxt, 'smtp')
check_return_X_y(data, fetch_func)


def test_shuffle():
try:
dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True,
percent10=True, download_if_missing=False)
except IOError:
raise SkipTest("kddcup99 dataset can not be loaded.")

def test_shuffle(fetch_kddcup99_fxt):
dataset = fetch_kddcup99_fxt(random_state=0, subset='SA', shuffle=True,
percent10=True)
assert(any(dataset.target[-100:] == b'normal.'))