From 553e3cb27e2a2c12213b8ce2b0bbd9236014b675 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Oct 2023 09:40:29 +0200 Subject: [PATCH 1/9] FIX Make DecisionTree pickle deterministic --- sklearn/tree/_tree.pyx | 4 +++- sklearn/tree/tests/test_tree.py | 13 +++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index b4ce56a4d2a0b..ef3a99a69ac1f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -908,11 +908,13 @@ cdef class Tree: safe_realloc(&self.nodes, capacity) safe_realloc(&self.value, capacity * self.value_stride) - # value memory is initialised to 0 to enable classifier argmax if capacity > self.capacity: + # value memory is initialised to 0 to enable classifier argmax memset((self.value + self.capacity * self.value_stride), 0, (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) + # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct) + memset((self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node)) # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 2543a2f2a39ad..0ffe3e50b14b2 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2601,3 +2601,16 @@ def test_sample_weight_non_uniform(make_data, Tree): tree_samples_removed.fit(X[1::2, :], y[1::2]) assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X)) + + +def test_deterministic_pickle(): + tree1 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) + tree1.fit(X, y) + + tree2 = DecisionTreeClassifier(random_state=0).fit(X, y) + tree2.fit(X, y) + + pickle1 = pickle.dumps(tree1) + pickle2 = pickle.dumps(tree2) + + assert pickle1 == pickle2 From 3fe9772715aa88cd88381b5bd19d70412808e1eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Oct 2023 09:58:56 +0200 Subject: [PATCH 2/9] Add changelog --- doc/whats_new/v1.4.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 3cdf299280ed7..07f20a10faf34 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -424,6 +424,9 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. +- |Fix| Make decision tree pickles deterministic. :pr:`27580` by :user:`Loïc + Estève `. + :mod:`sklearn.utils` .................... From da596d717e39ae882809ea5b5b797edf0d06dd03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Oct 2023 15:28:09 +0200 Subject: [PATCH 3/9] Comment out fix --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ef3a99a69ac1f..96106e6ef01ec 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -914,7 +914,7 @@ cdef class Tree: (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct) - memset((self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node)) + # memset((self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node)) # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: From ea815a079b94120cdf8ca041e5e0f652610680ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Oct 2023 16:52:30 +0200 Subject: [PATCH 4/9] Revert "Comment out fix" This reverts commit da596d717e39ae882809ea5b5b797edf0d06dd03. --- sklearn/tree/_tree.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 96106e6ef01ec..ef3a99a69ac1f 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -914,7 +914,7 @@ cdef class Tree: (capacity - self.capacity) * self.value_stride * sizeof(float64_t)) # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct) - # memset((self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node)) + memset((self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node)) # if capacity smaller than node_count, adjust the counter if capacity < self.node_count: From c8b16aea654b4842dc5ce079b93b5dabfc48bcb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Oct 2023 16:58:33 +0200 Subject: [PATCH 5/9] update changelog --- doc/whats_new/v1.4.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 07f20a10faf34..7b45b7b7dd0c9 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -424,8 +424,8 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. -- |Fix| Make decision tree pickles deterministic. :pr:`27580` by :user:`Loïc - Estève `. +- |Fix| Do not leak data via non-initialized memory in decision tree pickle files and make + the generation of those files deterministic. :pr:`27580` by :user:`Loïc Estève `. :mod:`sklearn.utils` .................... From 6ffaddbe413b271772ac35d7be3a6a63428981d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Oct 2023 16:59:49 +0200 Subject: [PATCH 6/9] Fix copy and paste gone wrong --- sklearn/tree/tests/test_tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 0ffe3e50b14b2..6ec1dd144056b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2604,10 +2604,10 @@ def test_sample_weight_non_uniform(make_data, Tree): def test_deterministic_pickle(): - tree1 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) + tree1 = DecisionTreeClassifier(random_state=0) tree1.fit(X, y) - tree2 = DecisionTreeClassifier(random_state=0).fit(X, y) + tree2 = DecisionTreeClassifier(random_state=0) tree2.fit(X, y) pickle1 = pickle.dumps(tree1) From b5d9b5eb0a3e567880ead394aef75f30de9f1415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Fri, 13 Oct 2023 17:02:02 +0200 Subject: [PATCH 7/9] Fix copy and paste gone wrong --- sklearn/tree/tests/test_tree.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 6ec1dd144056b..a0b6e7915831c 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2604,11 +2604,8 @@ def test_sample_weight_non_uniform(make_data, Tree): def test_deterministic_pickle(): - tree1 = DecisionTreeClassifier(random_state=0) - tree1.fit(X, y) - - tree2 = DecisionTreeClassifier(random_state=0) - tree2.fit(X, y) + tree1 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) + tree2 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) pickle1 = pickle.dumps(tree1) pickle2 = pickle.dumps(tree2) From 05198e00ee66de06735d4e3d8d46fffe91bd34b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 16 Oct 2023 10:24:43 +0200 Subject: [PATCH 8/9] Update sklearn/tree/tests/test_tree.py Co-authored-by: Olivier Grisel --- sklearn/tree/tests/test_tree.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index a0b6e7915831c..f876738ebba2b 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2604,6 +2604,9 @@ def test_sample_weight_non_uniform(make_data, Tree): def test_deterministic_pickle(): + # Non-regression test for: + # https://github.com/scikit-learn/scikit-learn/issues/27268 + # Uninitialised memory would lead to the two pickle strings being different. tree1 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) tree2 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target) From 5b8e1d8e257bd13d58068eea3f24bcb63cf546f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Est=C3=A8ve?= Date: Mon, 16 Oct 2023 13:29:03 +0200 Subject: [PATCH 9/9] Move entry to 1.3.2 --- doc/whats_new/v1.3.rst | 17 +++++++++++++++++ doc/whats_new/v1.4.rst | 3 --- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst index ddb6a2ebe0016..1a445b7436201 100644 --- a/doc/whats_new/v1.3.rst +++ b/doc/whats_new/v1.3.rst @@ -2,6 +2,23 @@ .. currentmodule:: sklearn +.. _changes_1_3_2: + +Version 1.3.2 +============= + +**October 2023** + +Changelog +--------- + +:mod:`sklearn.tree` +................... + +- |Fix| Do not leak data via non-initialized memory in decision tree pickle files and make + the generation of those files deterministic. :pr:`27580` by :user:`Loïc Estève `. + + .. _changes_1_3_1: Version 1.3.1 diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst index 7b45b7b7dd0c9..3cdf299280ed7 100644 --- a/doc/whats_new/v1.4.rst +++ b/doc/whats_new/v1.4.rst @@ -424,9 +424,6 @@ Changelog :pr:`13649` by :user:`Samuel Ronsin `, initiated by :user:`Patrick O'Reilly `. -- |Fix| Do not leak data via non-initialized memory in decision tree pickle files and make - the generation of those files deterministic. :pr:`27580` by :user:`Loïc Estève `. - :mod:`sklearn.utils` ....................