From 29d1282a05c6621c6836943d9efd339a396300e8 Mon Sep 17 00:00:00 2001 From: Maxwell Date: Sat, 19 Mar 2022 16:50:20 +0800 Subject: [PATCH 01/13] add global-random-seed in tests --- sklearn/ensemble/tests/test_iforest.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 55046527b13c3..33b7c55a34742 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -58,9 +58,9 @@ def test_iforest(): IsolationForest(random_state=rng, **params).fit(X_train).predict(X_test) -def test_iforest_sparse(): +def test_iforest_sparse(global_random_seed): """Check IForest for various parameter settings on sparse input.""" - rng = check_random_state(0) + rng = check_random_state(global_random_seed) X_train, X_test, y_train, y_test = train_test_split( diabetes.data[:50], diabetes.target[:50], random_state=rng ) @@ -143,9 +143,9 @@ def test_max_samples_attribute(): assert clf.max_samples_ == 0.4 * X.shape[0] -def test_iforest_parallel_regression(): +def test_iforest_parallel_regression(global_random_seed): """Check parallel regression.""" - rng = check_random_state(0) + rng = check_random_state(global_random_seed) X_train, X_test, y_train, y_test = train_test_split( diabetes.data, diabetes.target, random_state=rng @@ -165,19 +165,19 @@ def test_iforest_parallel_regression(): assert_array_almost_equal(y1, y3) -def test_iforest_performance(): +def test_iforest_performance(global_random_seed): """Test Isolation Forest performs well""" # Generate train/test data - rng = check_random_state(2) - X = 0.3 * rng.randn(120, 2) + rng = check_random_state(global_random_seed) + X = 0.3 * rng.randn(1200, 2) X_train = np.r_[X + 2, X - 2] - X_train = X[:100] + X_train = X[:1000] # Generate some abnormal novel observations - X_outliers = rng.uniform(low=-4, high=4, size=(20, 2)) - X_test = np.r_[X[100:], X_outliers] - y_test = 
np.array([0] * 20 + [1] * 20) + X_outliers = rng.uniform(low=-16, high=16, size=(200, 2)) + X_test = np.r_[X[1000:], X_outliers] + y_test = np.array([0] * 200 + [1] * 200) # fit the model clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train) @@ -303,7 +303,7 @@ def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls) assert mocked_get_chunk.call_count == n_predict_calls -def test_iforest_with_uniform_data(): +def test_iforest_with_uniform_data(global_random_seed): """Test whether iforest predicts inliers when using uniform data""" # 2-d array of all 1s @@ -311,7 +311,7 @@ def test_iforest_with_uniform_data(): iforest = IsolationForest() iforest.fit(X) - rng = np.random.RandomState(0) + rng = np.random.RandomState(global_random_seed) assert all(iforest.predict(X) == 1) assert all(iforest.predict(rng.randn(100, 10)) == 1) From d81556cef8d9128fa1d003012ed571c370c97393 Mon Sep 17 00:00:00 2001 From: Maxwell Date: Mon, 28 Mar 2022 19:54:42 +0800 Subject: [PATCH 02/13] remove global_random_seed for test_iforest_with_uniform_data --- sklearn/ensemble/tests/test_iforest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 33b7c55a34742..3eb3c39183876 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -303,7 +303,7 @@ def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls) assert mocked_get_chunk.call_count == n_predict_calls -def test_iforest_with_uniform_data(global_random_seed): +def test_iforest_with_uniform_data(): """Test whether iforest predicts inliers when using uniform data""" # 2-d array of all 1s @@ -311,7 +311,7 @@ def test_iforest_with_uniform_data(global_random_seed): iforest = IsolationForest() iforest.fit(X) - rng = np.random.RandomState(global_random_seed) + rng = np.random.RandomState(0) assert all(iforest.predict(X) == 1) assert 
all(iforest.predict(rng.randn(100, 10)) == 1) From 8c776c2e266de82ddcc01238e456a12e8f7d2b1d Mon Sep 17 00:00:00 2001 From: Maxwell Date: Mon, 28 Mar 2022 19:59:39 +0800 Subject: [PATCH 03/13] update test_iforest_performance --- sklearn/ensemble/tests/test_iforest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 3eb3c39183876..3a3b565906672 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -170,13 +170,13 @@ def test_iforest_performance(global_random_seed): # Generate train/test data rng = check_random_state(global_random_seed) - X = 0.3 * rng.randn(1200, 2) - X_train = np.r_[X + 2, X - 2] + X = 0.3 * rng.randn(600, 2) + X = rng.permutation(np.vstack((X + 2, X - 2))) X_train = X[:1000] # Generate some abnormal novel observations - X_outliers = rng.uniform(low=-16, high=16, size=(200, 2)) - X_test = np.r_[X[1000:], X_outliers] + X_outliers = rng.uniform(low=-1, high=1, size=(200, 2)) + X_test = np.vstack((X[1000:], X_outliers)) y_test = np.array([0] * 200 + [1] * 200) # fit the model From 768d4f24be32d6533ca546d2f8d75d290dd52093 Mon Sep 17 00:00:00 2001 From: jeremie du boisberranger Date: Wed, 4 May 2022 15:33:01 +0200 Subject: [PATCH 04/13] [all random seeds] test_iforest_sparse test_iforest_parallel_regression test_iforest_performance From 0b1c259b262e0671ff29a34e20920ff336aa75da Mon Sep 17 00:00:00 2001 From: Maxwell Date: Wed, 4 May 2022 22:32:31 +0800 Subject: [PATCH 05/13] use global_random_seed in iforest construction --- sklearn/ensemble/tests/test_iforest.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 3a3b565906672..b3081ec3c0a90 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -73,13 +73,13 @@ def 
test_iforest_sparse(global_random_seed): for params in grid: # Trained on sparse format sparse_classifier = IsolationForest( - n_estimators=10, random_state=1, **params + n_estimators=10, random_state=global_random_seed, **params ).fit(X_train_sparse) sparse_results = sparse_classifier.predict(X_test_sparse) # Trained on dense format dense_classifier = IsolationForest( - n_estimators=10, random_state=1, **params + n_estimators=10, random_state=global_random_seed, **params ).fit(X_train) dense_results = dense_classifier.predict(X_test) @@ -151,7 +151,7 @@ def test_iforest_parallel_regression(global_random_seed): diabetes.data, diabetes.target, random_state=rng ) - ensemble = IsolationForest(n_jobs=3, random_state=0).fit(X_train) + ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train) ensemble.set_params(n_jobs=1) y1 = ensemble.predict(X_test) @@ -159,7 +159,7 @@ def test_iforest_parallel_regression(global_random_seed): y2 = ensemble.predict(X_test) assert_array_almost_equal(y1, y2) - ensemble = IsolationForest(n_jobs=1, random_state=0).fit(X_train) + ensemble = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(X_train) y3 = ensemble.predict(X_test) assert_array_almost_equal(y1, y3) From 3a956c1d1892f3b451106abb1cea3775575ce35e Mon Sep 17 00:00:00 2001 From: Maxwell Date: Fri, 6 May 2022 21:27:09 +0800 Subject: [PATCH 06/13] does not use global random state in test functions --- sklearn/ensemble/tests/test_iforest.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index b3081ec3c0a90..74d854e7d6b19 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -44,7 +44,7 @@ diabetes.target = diabetes.target[perm] -def test_iforest(): +def test_iforest(global_random_seed): """Check Isolation Forest for various parameter settings.""" X_train = np.array([[0, 1], [1, 2]]) X_test = 
np.array([[2, 1], [1, 1]]) @@ -55,7 +55,7 @@ def test_iforest(): with ignore_warnings(): for params in grid: - IsolationForest(random_state=rng, **params).fit(X_train).predict(X_test) + IsolationForest(random_state=global_random_seed, **params).fit(X_train).predict(X_test) def test_iforest_sparse(global_random_seed): @@ -190,12 +190,12 @@ def test_iforest_performance(global_random_seed): @pytest.mark.parametrize("contamination", [0.25, "auto"]) -def test_iforest_works(contamination): +def test_iforest_works(contamination, global_random_seed): # toy sample (the last two samples are outliers) - X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]] + X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]] # Test IsolationForest - clf = IsolationForest(random_state=rng, contamination=contamination) + clf = IsolationForest(random_state=global_random_seed, contamination=contamination) clf.fit(X) decision_func = -clf.decision_function(X) pred = clf.predict(X) From 29842dd348eea8f5c902ed832e2d8e102b7483e2 Mon Sep 17 00:00:00 2001 From: Maxwell Date: Fri, 6 May 2022 21:30:11 +0800 Subject: [PATCH 07/13] clean up code in train_test_split Co-authored-by: Guillaume Lemaitre --- sklearn/ensemble/tests/test_iforest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 74d854e7d6b19..af14dbd77b379 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -61,9 +61,7 @@ def test_iforest(global_random_seed): def test_iforest_sparse(global_random_seed): """Check IForest for various parameter settings on sparse input.""" rng = check_random_state(global_random_seed) - X_train, X_test, y_train, y_test = train_test_split( - diabetes.data[:50], diabetes.target[:50], random_state=rng - ) + X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng) grid = ParameterGrid({"max_samples": [0.5, 
1.0], "bootstrap": [True, False]}) for sparse_format in [csc_matrix, csr_matrix]: From feee7e7ad17760857f2feb003470bdb5bf39e918 Mon Sep 17 00:00:00 2001 From: Maxwell Date: Fri, 6 May 2022 21:30:29 +0800 Subject: [PATCH 08/13] clean up code in train_test_split Co-authored-by: Guillaume Lemaitre --- sklearn/ensemble/tests/test_iforest.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index af14dbd77b379..4721f6bd38f08 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -145,9 +145,7 @@ def test_iforest_parallel_regression(global_random_seed): """Check parallel regression.""" rng = check_random_state(global_random_seed) - X_train, X_test, y_train, y_test = train_test_split( - diabetes.data, diabetes.target, random_state=rng - ) + X_train, X_test, y_train, y_test = train_test_split(diabetes.data, random_state=rng) ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train) From e21f7f731fb404b7232ca87dac6f34e430f69bb5 Mon Sep 17 00:00:00 2001 From: Maxwell Date: Fri, 6 May 2022 21:37:59 +0800 Subject: [PATCH 09/13] clean up train_test_split & fix failing tests --- sklearn/ensemble/tests/test_iforest.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 4721f6bd38f08..1d2c45e1878ce 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -55,7 +55,9 @@ def test_iforest(global_random_seed): with ignore_warnings(): for params in grid: - IsolationForest(random_state=global_random_seed, **params).fit(X_train).predict(X_test) + IsolationForest(random_state=global_random_seed, **params).fit( + X_train + ).predict(X_test) def test_iforest_sparse(global_random_seed): @@ -145,7 +147,7 @@ def test_iforest_parallel_regression(global_random_seed): """Check 
parallel regression.""" rng = check_random_state(global_random_seed) - X_train, X_test, y_train, y_test = train_test_split(diabetes.data, random_state=rng) + X_train, X_test = train_test_split(diabetes.data, random_state=rng) ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train) @@ -284,7 +286,7 @@ def test_iforest_warm_start(): ) @pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]) def test_iforest_chunks_works1(mocked_get_chunk, contamination, n_predict_calls): - test_iforest_works(contamination) + test_iforest_works(contamination, global_random_seed=0) assert mocked_get_chunk.call_count == n_predict_calls @@ -295,7 +297,7 @@ def test_iforest_chunks_works1(mocked_get_chunk, contamination, n_predict_calls) ) @pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]) def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls): - test_iforest_works(contamination) + test_iforest_works(contamination, global_random_seed=0) assert mocked_get_chunk.call_count == n_predict_calls From 66fb5efa080d926c6c39e88bccc81343c194d6f4 Mon Sep 17 00:00:00 2001 From: Maxwell Date: Sun, 8 May 2022 14:55:59 +0800 Subject: [PATCH 10/13] remove global random variable --- sklearn/ensemble/tests/test_iforest.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 1d2c45e1878ce..436721fc7a8ee 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -27,21 +27,10 @@ from scipy.sparse import csc_matrix, csr_matrix from unittest.mock import Mock, patch -rng = check_random_state(0) -# load the iris dataset -# and randomly permute it +# load iris & diabetes dataset iris = load_iris() -perm = rng.permutation(iris.target.size) -iris.data = iris.data[perm] -iris.target = iris.target[perm] - -# also load the diabetes dataset -# and randomly
permute it diabetes = load_diabetes() -perm = rng.permutation(diabetes.target.size) -diabetes.data = diabetes.data[perm] -diabetes.target = diabetes.target[perm] def test_iforest(global_random_seed): From 39c932c2c8dafb20b283a440e8e7d3488b3264aa Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 17 May 2022 10:34:10 +0200 Subject: [PATCH 11/13] Use the global_random_seed fixture also in test_iforest_chunks_works* --- sklearn/ensemble/tests/test_iforest.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py index 436721fc7a8ee..7be5f4aa80961 100644 --- a/sklearn/ensemble/tests/test_iforest.py +++ b/sklearn/ensemble/tests/test_iforest.py @@ -268,25 +268,29 @@ def test_iforest_warm_start(): # mock get_chunk_n_rows to actually test more than one chunk (here one -# chunk = 3 rows: +# chunk has 3 rows): @patch( "sklearn.ensemble._iforest.get_chunk_n_rows", side_effect=Mock(**{"return_value": 3}), ) @pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]) -def test_iforest_chunks_works1(mocked_get_chunk, contamination, n_predict_calls): - test_iforest_works(contamination, global_random_seed=0) +def test_iforest_chunks_works1( + mocked_get_chunk, contamination, n_predict_calls, global_random_seed +): + test_iforest_works(contamination, global_random_seed) assert mocked_get_chunk.call_count == n_predict_calls -# idem with chunk_size = 5 rows +# idem with chunk_size = 10 rows @patch( "sklearn.ensemble._iforest.get_chunk_n_rows", side_effect=Mock(**{"return_value": 10}), ) @pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)]) -def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls): - test_iforest_works(contamination, global_random_seed=0) +def test_iforest_chunks_works2( + mocked_get_chunk, contamination, n_predict_calls, global_random_seed +): + 
test_iforest_works(contamination, global_random_seed) assert mocked_get_chunk.call_count == n_predict_calls From 0fa49bc49bf05124291b11500ab73e9af367bc6c Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 17 May 2022 10:34:41 +0200 Subject: [PATCH 12/13] [all random seeds] From bca0de43cac349f73d6bc4240bb087387c4b98e9 Mon Sep 17 00:00:00 2001 From: Olivier Grisel Date: Tue, 17 May 2022 10:40:07 +0200 Subject: [PATCH 13/13] [all random seeds] test_iforest_sparse test_iforest_parallel_regression test_iforest_performance test_iforest_chunks_works1 test_iforest_chunks_works2