Skip to content

TST use global_random_seed in sklearn/ensemble/tests/test_iforest.py #22901

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
May 18, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 36 additions & 45 deletions sklearn/ensemble/tests/test_iforest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,24 +27,13 @@
from scipy.sparse import csc_matrix, csr_matrix
from unittest.mock import Mock, patch

rng = check_random_state(0)

# load the iris dataset
# and randomly permute it
# load iris & diabetes dataset
iris = load_iris()
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# also load the diabetes dataset
# and randomly permute it
diabetes = load_diabetes()
perm = rng.permutation(diabetes.target.size)
diabetes.data = diabetes.data[perm]
diabetes.target = diabetes.target[perm]


def test_iforest():
def test_iforest(global_random_seed):
"""Check Isolation Forest for various parameter settings."""
X_train = np.array([[0, 1], [1, 2]])
X_test = np.array([[2, 1], [1, 1]])
Expand All @@ -55,15 +44,15 @@ def test_iforest():

with ignore_warnings():
for params in grid:
IsolationForest(random_state=rng, **params).fit(X_train).predict(X_test)
IsolationForest(random_state=global_random_seed, **params).fit(
X_train
).predict(X_test)


def test_iforest_sparse():
def test_iforest_sparse(global_random_seed):
"""Check IForest for various parameter settings on sparse input."""
rng = check_random_state(0)
X_train, X_test, y_train, y_test = train_test_split(
diabetes.data[:50], diabetes.target[:50], random_state=rng
)
rng = check_random_state(global_random_seed)
X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng)
grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]})

for sparse_format in [csc_matrix, csr_matrix]:
Expand All @@ -73,13 +62,13 @@ def test_iforest_sparse():
for params in grid:
# Trained on sparse format
sparse_classifier = IsolationForest(
n_estimators=10, random_state=1, **params
n_estimators=10, random_state=global_random_seed, **params
).fit(X_train_sparse)
sparse_results = sparse_classifier.predict(X_test_sparse)

# Trained on dense format
dense_classifier = IsolationForest(
n_estimators=10, random_state=1, **params
n_estimators=10, random_state=global_random_seed, **params
).fit(X_train)
dense_results = dense_classifier.predict(X_test)

Expand Down Expand Up @@ -143,41 +132,39 @@ def test_max_samples_attribute():
assert clf.max_samples_ == 0.4 * X.shape[0]


def test_iforest_parallel_regression():
def test_iforest_parallel_regression(global_random_seed):
"""Check parallel regression."""
rng = check_random_state(0)
rng = check_random_state(global_random_seed)

X_train, X_test, y_train, y_test = train_test_split(
diabetes.data, diabetes.target, random_state=rng
)
X_train, X_test = train_test_split(diabetes.data, random_state=rng)

ensemble = IsolationForest(n_jobs=3, random_state=0).fit(X_train)
ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train)

ensemble.set_params(n_jobs=1)
y1 = ensemble.predict(X_test)
ensemble.set_params(n_jobs=2)
y2 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y2)

ensemble = IsolationForest(n_jobs=1, random_state=0).fit(X_train)
ensemble = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(X_train)

y3 = ensemble.predict(X_test)
assert_array_almost_equal(y1, y3)


def test_iforest_performance():
def test_iforest_performance(global_random_seed):
"""Test Isolation Forest performs well"""

# Generate train/test data
rng = check_random_state(2)
X = 0.3 * rng.randn(120, 2)
X_train = np.r_[X + 2, X - 2]
X_train = X[:100]
rng = check_random_state(global_random_seed)
X = 0.3 * rng.randn(600, 2)
X = rng.permutation(np.vstack((X + 2, X - 2)))
X_train = X[:1000]

# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
X_test = np.r_[X[100:], X_outliers]
y_test = np.array([0] * 20 + [1] * 20)
X_outliers = rng.uniform(low=-1, high=1, size=(200, 2))
X_test = np.vstack((X[1000:], X_outliers))
y_test = np.array([0] * 200 + [1] * 200)

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
Expand All @@ -190,12 +177,12 @@ def test_iforest_performance():


@pytest.mark.parametrize("contamination", [0.25, "auto"])
def test_iforest_works(contamination):
def test_iforest_works(contamination, global_random_seed):
# toy sample (the last two samples are outliers)
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]

# Test IsolationForest
clf = IsolationForest(random_state=rng, contamination=contamination)
clf = IsolationForest(random_state=global_random_seed, contamination=contamination)
clf.fit(X)
decision_func = -clf.decision_function(X)
pred = clf.predict(X)
Expand Down Expand Up @@ -281,25 +268,29 @@ def test_iforest_warm_start():


# mock get_chunk_n_rows to actually test more than one chunk (here one
# chunk = 3 rows:
# chunk has 3 rows):
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 3}),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
def test_iforest_chunks_works1(mocked_get_chunk, contamination, n_predict_calls):
test_iforest_works(contamination)
def test_iforest_chunks_works1(
mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
test_iforest_works(contamination, global_random_seed)
assert mocked_get_chunk.call_count == n_predict_calls


# idem with chunk_size = 5 rows
# idem with chunk_size = 10 rows
@patch(
"sklearn.ensemble._iforest.get_chunk_n_rows",
side_effect=Mock(**{"return_value": 10}),
)
@pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls):
test_iforest_works(contamination)
def test_iforest_chunks_works2(
mocked_get_chunk, contamination, n_predict_calls, global_random_seed
):
test_iforest_works(contamination, global_random_seed)
assert mocked_get_chunk.call_count == n_predict_calls


Expand Down