From 55f6bc51942dd8d4fb9d24014cc8bb2c522cc43f Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Wed, 14 Mar 2018 15:27:27 +0100 Subject: [PATCH 1/7] FIX numpy.int overflow in make_classification Sample generator `make_classification()` requires that `n_classes * n_clusters_per_class <= 2 ** n_informative`. If `n_informative` is given as numpy.int with a value of 64 or larger, `2 ** n_informative` overflows and evaluates to 0, so the check wrongly fails with a misleading error message. Casting to Python int() avoids this issue. --- sklearn/datasets/samples_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py index 00f15c96446c1..da59da5ce055b 100644 --- a/sklearn/datasets/samples_generator.py +++ b/sklearn/datasets/samples_generator.py @@ -161,7 +161,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, raise ValueError("Number of informative, redundant and repeated " "features must sum to less than the number of total" " features") - if 2 ** n_informative < n_classes * n_clusters_per_class: + if 2 ** int(n_informative) < n_classes * n_clusters_per_class: raise ValueError("n_classes * n_clusters_per_class must" " be smaller or equal 2 ** n_informative") if weights and len(weights) not in [n_classes, n_classes - 1]: From 74586047983af192ff2ed89cac2be97ffdb49a3b Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 16 Mar 2018 18:15:04 +0100 Subject: [PATCH 2/7] Using log2 --- sklearn/datasets/samples_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py index da59da5ce055b..aca7da909e684 100644 --- a/sklearn/datasets/samples_generator.py +++ b/sklearn/datasets/samples_generator.py @@ -161,7 +161,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, raise ValueError("Number of informative, redundant and repeated " "features must sum to less than the number of total" " features") - if 2 ** int(n_informative) < n_classes * n_clusters_per_class: + if n_informative < np.log2(n_classes * n_clusters_per_class): raise ValueError("n_classes * n_clusters_per_class must" " be smaller or equal 2 ** n_informative") if weights and len(weights) not in [n_classes, n_classes - 1]: From 19f2d8b180256e7bcf7e3c8a37101735783ccfcd Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Fri, 16 Mar 2018 18:17:13 +0100 Subject: [PATCH 3/7] TST make_classification() with numpy.int Parameter n_informative as numpy.int --- sklearn/datasets/tests/test_samples_generator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index c5a0c48b16ed0..7a809d34288d7 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -84,7 +84,8 @@ def test_make_classification_informative_features(): (2, [1/4] * 4, 1), (2, [1/2] * 2, 2), (2, [3/4, 1/4], 2), - (10, [1/3] * 3, 10) + (10, [1/3] * 3, 10), + (np.array(64), [1], 1) ]: n_classes = len(weights) n_clusters = n_classes * n_clusters_per_class From bf1f296996e44fe0f158e9fbfc3c9504af5dd046 Mon Sep 17 00:00:00 2001 From: feldbauer Date: Mon, 19 Mar 2018 14:42:43 +0100 Subject: [PATCH 4/7] Comment on log2 --- sklearn/datasets/samples_generator.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py index 
aca7da909e684..7bab8f720c46e 100644 --- a/sklearn/datasets/samples_generator.py +++ b/sklearn/datasets/samples_generator.py @@ -161,6 +161,7 @@ def make_classification(n_samples=100, n_features=20, n_informative=2, raise ValueError("Number of informative, redundant and repeated " "features must sum to less than the number of total" " features") + # Use log2 to avoid overflow errors if n_informative < np.log2(n_classes * n_clusters_per_class): raise ValueError("n_classes * n_clusters_per_class must" " be smaller or equal 2 ** n_informative") From f1085433b4aa6669f777c7d2f7b6a51f1797fa1b Mon Sep 17 00:00:00 2001 From: feldbauer Date: Mon, 19 Mar 2018 14:43:41 +0100 Subject: [PATCH 5/7] Lower hypercube center precision --- sklearn/datasets/tests/test_samples_generator.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index 7a809d34288d7..eb7c30b22745a 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -85,7 +85,7 @@ def test_make_classification_informative_features(): (2, [1/2] * 2, 2), (2, [3/4, 1/4], 2), (10, [1/3] * 3, 10), - (np.array(64), [1], 1) + (np.int(64), [1], 1) ]: n_classes = len(weights) n_clusters = n_classes * n_clusters_per_class @@ -129,9 +129,10 @@ def test_make_classification_informative_features(): for cluster in range(len(unique_signs)): centroid = X[cluster_index == cluster].mean(axis=0) if hypercube: - assert_array_almost_equal(np.abs(centroid), - [class_sep] * n_informative, - decimal=0, + assert_array_almost_equal(np.abs(centroid) / class_sep, + np.array([class_sep] \ + * n_informative) / class_sep, + decimal=5, err_msg="Clusters are not " "centered on hypercube " "vertices") From 54432a71d1e2e080b88e060989726e715077bce3 Mon Sep 17 00:00:00 2001 From: feldbauer Date: Mon, 19 Mar 2018 15:15:28 +0100 Subject: [PATCH 6/7] fix pyflakes issue --- sklearn/datasets/tests/test_samples_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index eb7c30b22745a..f3050d9426c6b 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -130,7 +130,7 @@ def test_make_classification_informative_features(): centroid = X[cluster_index == cluster].mean(axis=0) if hypercube: assert_array_almost_equal(np.abs(centroid) / class_sep, - np.array([class_sep] \ + np.array([class_sep] * n_informative) / class_sep, decimal=5, err_msg="Clusters are not " From d3396e4da31a1820c5f3cb94c7b25808690f66ef Mon Sep 17 00:00:00 2001 From: Roman Feldbauer Date: Wed, 8 Aug 2018 10:19:22 +0200 Subject: [PATCH 7/7] Simplification and consistency Replaced confusing Python list operations with concise numpy statement Consistent assertions in both hypercube branches --- sklearn/datasets/tests/test_samples_generator.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sklearn/datasets/tests/test_samples_generator.py b/sklearn/datasets/tests/test_samples_generator.py index f3050d9426c6b..1e1f110d9c41b 100644 --- a/sklearn/datasets/tests/test_samples_generator.py +++ b/sklearn/datasets/tests/test_samples_generator.py @@ -130,8 +130,7 @@ def test_make_classification_informative_features(): centroid = X[cluster_index == cluster].mean(axis=0) if hypercube: assert_array_almost_equal(np.abs(centroid) / class_sep, - 
np.array([class_sep] - * n_informative) / class_sep, + np.ones(n_informative), decimal=5, err_msg="Clusters are not " "centered on hypercube " @@ -139,10 +138,10 @@ def test_make_classification_informative_features(): else: assert_raises(AssertionError, assert_array_almost_equal, - np.abs(centroid), - [class_sep] * n_informative, - decimal=0, - err_msg="Clusters should not be cenetered " + np.abs(centroid) / class_sep, + np.ones(n_informative), + decimal=5, + err_msg="Clusters should not be centered " "on hypercube vertices") assert_raises(ValueError, make, n_features=2, n_informative=2, n_classes=5,