From 29d1282a05c6621c6836943d9efd339a396300e8 Mon Sep 17 00:00:00 2001
From: Maxwell <zl2480@columbia.edu>
Date: Sat, 19 Mar 2022 16:50:20 +0800
Subject: [PATCH 01/13] add global-random-seed in tests

---
 sklearn/ensemble/tests/test_iforest.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 55046527b13c3..33b7c55a34742 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -58,9 +58,9 @@ def test_iforest():
             IsolationForest(random_state=rng, **params).fit(X_train).predict(X_test)
 
 
-def test_iforest_sparse():
+def test_iforest_sparse(global_random_seed):
     """Check IForest for various parameter settings on sparse input."""
-    rng = check_random_state(0)
+    rng = check_random_state(global_random_seed)
     X_train, X_test, y_train, y_test = train_test_split(
         diabetes.data[:50], diabetes.target[:50], random_state=rng
     )
@@ -143,9 +143,9 @@ def test_max_samples_attribute():
     assert clf.max_samples_ == 0.4 * X.shape[0]
 
 
-def test_iforest_parallel_regression():
+def test_iforest_parallel_regression(global_random_seed):
     """Check parallel regression."""
-    rng = check_random_state(0)
+    rng = check_random_state(global_random_seed)
 
     X_train, X_test, y_train, y_test = train_test_split(
         diabetes.data, diabetes.target, random_state=rng
@@ -165,19 +165,19 @@ def test_iforest_parallel_regression():
     assert_array_almost_equal(y1, y3)
 
 
-def test_iforest_performance():
+def test_iforest_performance(global_random_seed):
     """Test Isolation Forest performs well"""
 
     # Generate train/test data
-    rng = check_random_state(2)
-    X = 0.3 * rng.randn(120, 2)
+    rng = check_random_state(global_random_seed)
+    X = 0.3 * rng.randn(1200, 2)
     X_train = np.r_[X + 2, X - 2]
-    X_train = X[:100]
+    X_train = X[:1000]
 
     # Generate some abnormal novel observations
-    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
-    X_test = np.r_[X[100:], X_outliers]
-    y_test = np.array([0] * 20 + [1] * 20)
+    X_outliers = rng.uniform(low=-16, high=16, size=(200, 2))
+    X_test = np.r_[X[1000:], X_outliers]
+    y_test = np.array([0] * 200 + [1] * 200)
 
     # fit the model
     clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)
@@ -303,7 +303,7 @@ def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls)
     assert mocked_get_chunk.call_count == n_predict_calls
 
 
-def test_iforest_with_uniform_data():
+def test_iforest_with_uniform_data(global_random_seed):
     """Test whether iforest predicts inliers when using uniform data"""
 
     # 2-d array of all 1s
@@ -311,7 +311,7 @@ def test_iforest_with_uniform_data():
     iforest = IsolationForest()
     iforest.fit(X)
 
-    rng = np.random.RandomState(0)
+    rng = np.random.RandomState(global_random_seed)
 
     assert all(iforest.predict(X) == 1)
     assert all(iforest.predict(rng.randn(100, 10)) == 1)

From d81556cef8d9128fa1d003012ed571c370c97393 Mon Sep 17 00:00:00 2001
From: Maxwell <zl2480@columbia.edu>
Date: Mon, 28 Mar 2022 19:54:42 +0800
Subject: [PATCH 02/13] remove global_random_seed for
 test_iforest_with_uniform_data

---
 sklearn/ensemble/tests/test_iforest.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 33b7c55a34742..3eb3c39183876 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -303,7 +303,7 @@ def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls)
     assert mocked_get_chunk.call_count == n_predict_calls
 
 
-def test_iforest_with_uniform_data(global_random_seed):
+def test_iforest_with_uniform_data():
     """Test whether iforest predicts inliers when using uniform data"""
 
     # 2-d array of all 1s
@@ -311,7 +311,7 @@ def test_iforest_with_uniform_data(global_random_seed):
     iforest = IsolationForest()
     iforest.fit(X)
 
-    rng = np.random.RandomState(global_random_seed)
+    rng = np.random.RandomState(0)
 
     assert all(iforest.predict(X) == 1)
     assert all(iforest.predict(rng.randn(100, 10)) == 1)

From 8c776c2e266de82ddcc01238e456a12e8f7d2b1d Mon Sep 17 00:00:00 2001
From: Maxwell <zl2480@columbia.edu>
Date: Mon, 28 Mar 2022 19:59:39 +0800
Subject: [PATCH 03/13] update test_iforest_performance

---
 sklearn/ensemble/tests/test_iforest.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 3eb3c39183876..3a3b565906672 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -170,13 +170,13 @@ def test_iforest_performance(global_random_seed):
 
     # Generate train/test data
     rng = check_random_state(global_random_seed)
-    X = 0.3 * rng.randn(1200, 2)
-    X_train = np.r_[X + 2, X - 2]
+    X = 0.3 * rng.randn(600, 2)
+    X = rng.permutation(np.vstack((X + 2, X - 2)))
     X_train = X[:1000]
 
     # Generate some abnormal novel observations
-    X_outliers = rng.uniform(low=-16, high=16, size=(200, 2))
-    X_test = np.r_[X[1000:], X_outliers]
+    X_outliers = rng.uniform(low=-1, high=1, size=(200, 2))
+    X_test = np.vstack((X[1000:], X_outliers))
     y_test = np.array([0] * 200 + [1] * 200)
 
     # fit the model

From 768d4f24be32d6533ca546d2f8d75d290dd52093 Mon Sep 17 00:00:00 2001
From: jeremie du boisberranger <jeremiedbb@yahoo.fr>
Date: Wed, 4 May 2022 15:33:01 +0200
Subject: [PATCH 04/13] [all random seeds] test_iforest_sparse
 test_iforest_parallel_regression test_iforest_performance


From 0b1c259b262e0671ff29a34e20920ff336aa75da Mon Sep 17 00:00:00 2001
From: Maxwell <zl2480@columbia.edu>
Date: Wed, 4 May 2022 22:32:31 +0800
Subject: [PATCH 05/13] use global_random_seed in iforest construction

---
 sklearn/ensemble/tests/test_iforest.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 3a3b565906672..b3081ec3c0a90 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -73,13 +73,13 @@ def test_iforest_sparse(global_random_seed):
         for params in grid:
             # Trained on sparse format
             sparse_classifier = IsolationForest(
-                n_estimators=10, random_state=1, **params
+                n_estimators=10, random_state=global_random_seed, **params
             ).fit(X_train_sparse)
             sparse_results = sparse_classifier.predict(X_test_sparse)
 
             # Trained on dense format
             dense_classifier = IsolationForest(
-                n_estimators=10, random_state=1, **params
+                n_estimators=10, random_state=global_random_seed, **params
             ).fit(X_train)
             dense_results = dense_classifier.predict(X_test)
 
@@ -151,7 +151,7 @@ def test_iforest_parallel_regression(global_random_seed):
         diabetes.data, diabetes.target, random_state=rng
     )
 
-    ensemble = IsolationForest(n_jobs=3, random_state=0).fit(X_train)
+    ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train)
 
     ensemble.set_params(n_jobs=1)
     y1 = ensemble.predict(X_test)
@@ -159,7 +159,7 @@ def test_iforest_parallel_regression(global_random_seed):
     y2 = ensemble.predict(X_test)
     assert_array_almost_equal(y1, y2)
 
-    ensemble = IsolationForest(n_jobs=1, random_state=0).fit(X_train)
+    ensemble = IsolationForest(n_jobs=1, random_state=global_random_seed).fit(X_train)
 
     y3 = ensemble.predict(X_test)
     assert_array_almost_equal(y1, y3)

From 3a956c1d1892f3b451106abb1cea3775575ce35e Mon Sep 17 00:00:00 2001
From: Maxwell <zl2480@columbia.edu>
Date: Fri, 6 May 2022 21:27:09 +0800
Subject: [PATCH 06/13] does not use global random state in test functions

---
 sklearn/ensemble/tests/test_iforest.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index b3081ec3c0a90..74d854e7d6b19 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -44,7 +44,7 @@
 diabetes.target = diabetes.target[perm]
 
 
-def test_iforest():
+def test_iforest(global_random_seed):
     """Check Isolation Forest for various parameter settings."""
     X_train = np.array([[0, 1], [1, 2]])
     X_test = np.array([[2, 1], [1, 1]])
@@ -55,7 +55,7 @@ def test_iforest():
 
     with ignore_warnings():
         for params in grid:
-            IsolationForest(random_state=rng, **params).fit(X_train).predict(X_test)
+            IsolationForest(random_state=global_random_seed, **params).fit(X_train).predict(X_test)
 
 
 def test_iforest_sparse(global_random_seed):
@@ -190,12 +190,12 @@ def test_iforest_performance(global_random_seed):
 
 
 @pytest.mark.parametrize("contamination", [0.25, "auto"])
-def test_iforest_works(contamination):
+def test_iforest_works(contamination, global_random_seed):
     # toy sample (the last two samples are outliers)
-    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]
+    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [7, 4], [-5, 9]]
 
     # Test IsolationForest
-    clf = IsolationForest(random_state=rng, contamination=contamination)
+    clf = IsolationForest(random_state=global_random_seed, contamination=contamination)
     clf.fit(X)
     decision_func = -clf.decision_function(X)
     pred = clf.predict(X)

From 29842dd348eea8f5c902ed832e2d8e102b7483e2 Mon Sep 17 00:00:00 2001
From: Maxwell <zl2480@columbia.edu>
Date: Fri, 6 May 2022 21:30:11 +0800
Subject: [PATCH 07/13] clean up code in train_test_split

Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
---
 sklearn/ensemble/tests/test_iforest.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 74d854e7d6b19..af14dbd77b379 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -61,9 +61,7 @@ def test_iforest(global_random_seed):
 def test_iforest_sparse(global_random_seed):
     """Check IForest for various parameter settings on sparse input."""
     rng = check_random_state(global_random_seed)
-    X_train, X_test, y_train, y_test = train_test_split(
-        diabetes.data[:50], diabetes.target[:50], random_state=rng
-    )
+    X_train, X_test = train_test_split(diabetes.data[:50], random_state=rng)
     grid = ParameterGrid({"max_samples": [0.5, 1.0], "bootstrap": [True, False]})
 
     for sparse_format in [csc_matrix, csr_matrix]:

From feee7e7ad17760857f2feb003470bdb5bf39e918 Mon Sep 17 00:00:00 2001
From: Maxwell <zl2480@columbia.edu>
Date: Fri, 6 May 2022 21:30:29 +0800
Subject: [PATCH 08/13] clean up code in train_test_split

Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
---
 sklearn/ensemble/tests/test_iforest.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index af14dbd77b379..4721f6bd38f08 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -145,9 +145,7 @@ def test_iforest_parallel_regression(global_random_seed):
     """Check parallel regression."""
     rng = check_random_state(global_random_seed)
 
-    X_train, X_test, y_train, y_test = train_test_split(
-        diabetes.data, diabetes.target, random_state=rng
-    )
+    X_train, X_test, y_train, y_test = train_test_split(diabetes.data, random_state=rng)
 
     ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train)
 

From e21f7f731fb404b7232ca87dac6f34e430f69bb5 Mon Sep 17 00:00:00 2001
From: Maxwell <zl2480@columbia.edu>
Date: Fri, 6 May 2022 21:37:59 +0800
Subject: [PATCH 09/13] clean up train_test_split & fix failing tests

---
 sklearn/ensemble/tests/test_iforest.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 4721f6bd38f08..1d2c45e1878ce 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -55,7 +55,9 @@ def test_iforest(global_random_seed):
 
     with ignore_warnings():
         for params in grid:
-            IsolationForest(random_state=global_random_seed, **params).fit(X_train).predict(X_test)
+            IsolationForest(random_state=global_random_seed, **params).fit(
+                X_train
+            ).predict(X_test)
 
 
 def test_iforest_sparse(global_random_seed):
@@ -145,7 +147,7 @@ def test_iforest_parallel_regression(global_random_seed):
     """Check parallel regression."""
     rng = check_random_state(global_random_seed)
 
-    X_train, X_test, y_train, y_test = train_test_split(diabetes.data, random_state=rng)
+    X_train, X_test = train_test_split(diabetes.data, random_state=rng)
 
     ensemble = IsolationForest(n_jobs=3, random_state=global_random_seed).fit(X_train)
 
@@ -284,7 +286,7 @@ def test_iforest_warm_start():
 )
 @pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
 def test_iforest_chunks_works1(mocked_get_chunk, contamination, n_predict_calls):
-    test_iforest_works(contamination)
+    test_iforest_works(contamination, global_random_seed=0)
     assert mocked_get_chunk.call_count == n_predict_calls
 
 
@@ -295,7 +297,7 @@ def test_iforest_chunks_works1(mocked_get_chunk, contamination, n_predict_calls)
 )
 @pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
 def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls):
-    test_iforest_works(contamination)
+    test_iforest_works(contamination, global_random_seed=0)
     assert mocked_get_chunk.call_count == n_predict_calls
 
 
From 66fb5efa080d926c6c39e88bccc81343c194d6f4 Mon Sep 17 00:00:00 2001
From: Maxwell <zl2480@columbia.edu>
Date: Sun, 8 May 2022 14:55:59 +0800
Subject: [PATCH 10/13] remove global random variable

---
 sklearn/ensemble/tests/test_iforest.py | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 1d2c45e1878ce..436721fc7a8ee 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -27,21 +27,10 @@
 from scipy.sparse import csc_matrix, csr_matrix
 from unittest.mock import Mock, patch
 
-rng = check_random_state(0)
 
-# load the iris dataset
-# and randomly permute it
+# load the iris and diabetes datasets
 iris = load_iris()
-perm = rng.permutation(iris.target.size)
-iris.data = iris.data[perm]
-iris.target = iris.target[perm]
-
-# also load the diabetes dataset
-# and randomly permute it
 diabetes = load_diabetes()
-perm = rng.permutation(diabetes.target.size)
-diabetes.data = diabetes.data[perm]
-diabetes.target = diabetes.target[perm]
 
 
 def test_iforest(global_random_seed):

From 39c932c2c8dafb20b283a440e8e7d3488b3264aa Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 17 May 2022 10:34:10 +0200
Subject: [PATCH 11/13] Use the global_random_seed fixture also in
 test_iforest_chunks_works*

---
 sklearn/ensemble/tests/test_iforest.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/sklearn/ensemble/tests/test_iforest.py b/sklearn/ensemble/tests/test_iforest.py
index 436721fc7a8ee..7be5f4aa80961 100644
--- a/sklearn/ensemble/tests/test_iforest.py
+++ b/sklearn/ensemble/tests/test_iforest.py
@@ -268,25 +268,29 @@ def test_iforest_warm_start():
 
 
 # mock get_chunk_n_rows to actually test more than one chunk (here one
-# chunk = 3 rows:
+# chunk has 3 rows):
 @patch(
     "sklearn.ensemble._iforest.get_chunk_n_rows",
     side_effect=Mock(**{"return_value": 3}),
 )
 @pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
-def test_iforest_chunks_works1(mocked_get_chunk, contamination, n_predict_calls):
-    test_iforest_works(contamination, global_random_seed=0)
+def test_iforest_chunks_works1(
+    mocked_get_chunk, contamination, n_predict_calls, global_random_seed
+):
+    test_iforest_works(contamination, global_random_seed)
     assert mocked_get_chunk.call_count == n_predict_calls
 
 
-# idem with chunk_size = 5 rows
+# idem with chunk_size = 10 rows
 @patch(
     "sklearn.ensemble._iforest.get_chunk_n_rows",
     side_effect=Mock(**{"return_value": 10}),
 )
 @pytest.mark.parametrize("contamination, n_predict_calls", [(0.25, 3), ("auto", 2)])
-def test_iforest_chunks_works2(mocked_get_chunk, contamination, n_predict_calls):
-    test_iforest_works(contamination, global_random_seed=0)
+def test_iforest_chunks_works2(
+    mocked_get_chunk, contamination, n_predict_calls, global_random_seed
+):
+    test_iforest_works(contamination, global_random_seed)
     assert mocked_get_chunk.call_count == n_predict_calls
 
 
From 0fa49bc49bf05124291b11500ab73e9af367bc6c Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 17 May 2022 10:34:41 +0200
Subject: [PATCH 12/13] [all random seeds]


From bca0de43cac349f73d6bc4240bb087387c4b98e9 Mon Sep 17 00:00:00 2001
From: Olivier Grisel <olivier.grisel@ensta.org>
Date: Tue, 17 May 2022 10:40:07 +0200
Subject: [PATCH 13/13] [all random seeds]

test_iforest_sparse
test_iforest_parallel_regression
test_iforest_performance
test_iforest_chunks_works1
test_iforest_chunks_works2