From 5f6c6149e36bc36f5ed86c55b27c0b7e6136099e Mon Sep 17 00:00:00 2001 From: sergul Date: Mon, 16 Jul 2018 06:17:25 -0400 Subject: [PATCH 01/11] test for None and ones for sample_weight added --- sklearn/utils/estimator_checks.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d25abbe6377db..bfcf58737f75a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -88,6 +88,7 @@ def _yield_non_meta_checks(name, estimator): yield check_dtype_object yield check_sample_weights_pandas_series yield check_sample_weights_list + yield check_sample_weight_invariance yield check_estimators_fit_returns_self yield partial(check_estimators_fit_returns_self, readonly_memmap=True) yield check_complex_data @@ -553,6 +554,24 @@ def check_sample_weights_list(name, estimator_orig): estimator.fit(X, y, sample_weight=sample_weight) +@ignore_warnings(category=(DeprecationWarning, FutureWarning)) +def check_sample_weight_invariance(name, estimator_orig): + if has_fit_parameter(estimator_orig, "sample_weight"): + estimator1 = clone(estimator_orig) + estimator2 = clone(estimator_orig) + X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], + [2, 1], [2, 2], [2, 3], [2, 4]]) + y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + estimator1.fit(X, y, sample_weight=None) + estimator2.fit(X, y, sample_weight=np.ones(shape=len(y))) + X_pred1 = estimator1.predict(X) + X_pred2 = estimator2.predict(X) + try: + assert_array_equal(X_pred1, X_pred2) + except ValueError: + raise ValueError("For %s sample_weight=None is not equivalent to " + "sample_weight=ones" % name) + @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig): # check that estimators treat dtype object as numeric if possible From e47a059306b7b1d53af8e30172ed2b252a786f15 Mon Sep 17 00:00:00 2001 From: sergul Date: Mon, 16 Jul 2018 06:17:25 -0400 Subject: [PATCH 02/11] test for None and ones for sample_weight added --- sklearn/utils/estimator_checks.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d25abbe6377db..bfcf58737f75a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -88,6 +88,7 @@ def _yield_non_meta_checks(name, estimator): yield check_dtype_object yield check_sample_weights_pandas_series yield check_sample_weights_list + yield check_sample_weight_invariance yield check_estimators_fit_returns_self yield partial(check_estimators_fit_returns_self, readonly_memmap=True) yield check_complex_data @@ -553,6 +554,24 @@ def check_sample_weights_list(name, estimator_orig): estimator.fit(X, y, sample_weight=sample_weight) +@ignore_warnings(category=(DeprecationWarning, FutureWarning)) +def check_sample_weight_invariance(name, estimator_orig): + if has_fit_parameter(estimator_orig, "sample_weight"): + estimator1 = clone(estimator_orig) + estimator2 = clone(estimator_orig) + X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], + [2, 1], [2, 2], [2, 3], [2, 4]]) + y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) + estimator1.fit(X, y, sample_weight=None) + estimator2.fit(X, y, sample_weight=np.ones(shape=len(y))) + X_pred1 = estimator1.predict(X) + X_pred2 = estimator2.predict(X) + try: + assert_array_equal(X_pred1, X_pred2) + except ValueError: + raise ValueError("For %s sample_weight=None is not equivalent to " + "sample_weight=ones" % name) + @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig): # check that estimators treat dtype object as numeric if possible From 688be6af17eda17f54b3193c771d921dcc989412 Mon Sep 17 00:00:00 2001 From: sergul Date: Mon, 16 Jul 2018 10:47:22 -0400 Subject: [PATCH 03/11] skip KMeans based estimators --- sklearn/utils/estimator_checks.py | 46 ++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index bfcf58737f75a..d54532c6644fc 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -556,21 +556,41 @@ def check_sample_weights_list(name, estimator_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_sample_weight_invariance(name, estimator_orig): - if has_fit_parameter(estimator_orig, "sample_weight"): + if (has_fit_parameter(estimator_orig, "sample_weight") and + name not in ["KMeans", "MiniBatchKMeans"]): estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) - X = np.array([[1, 1], [1, 2], [1, 3], [1, 4], - [2, 1], [2, 2], [2, 3], [2, 4]]) - y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) - estimator1.fit(X, y, sample_weight=None) - estimator2.fit(X, y, sample_weight=np.ones(shape=len(y))) - X_pred1 = estimator1.predict(X) - X_pred2 = estimator2.predict(X) - try: - assert_array_equal(X_pred1, X_pred2) - except ValueError: - raise ValueError("For %s sample_weight=None is not equivalent to " - "sample_weight=ones" % name) + + X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], + [2, 1], [2, 1], [2, 1], [2, 1], + [3, 3], [3, 3], [3, 3], [3, 3], + [4, 1], [4, 1], [4, 1], [4, 1]]) + y = np.array([1, 1, 1, 1, 2, 2, 2, 2, + 1, 1, 1, 1, 2, 2, 2, 2]) + + if has_fit_parameter(estimator_orig, "random_state"): + estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y)), random_state=0) + estimator2.fit(X, y=y, sample_weight=None, random_state=0) + else: + estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) + estimator2.fit(X, y=y, sample_weight=None) + + if hasattr(estimator_orig, "predict"): + X_pred1 = estimator1.predict(X) + X_pred2 = estimator2.predict(X) + try: + assert_allclose(X_pred1, X_pred2, rtol=0.5) + except ValueError: + raise ValueError("For %s sample_weight=None is not equivalent to " + "sample_weight=ones" % name) + if hasattr(estimator_orig, "transform"): + X_pred1 = estimator1.transform(X) + X_pred2 = estimator2.transform(X) + try: + assert_allclose(X_pred1, X_pred2, rtol=0.5) + except ValueError: + raise ValueError("For %s sample_weight=None is not equivalent to " + "sample_weight=ones" % name) @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig): From 3533647e365ffbae6b9c8cfde67b4f5dcabd7a94 Mon Sep 17 00:00:00 2001 From: sergul Date: Mon, 16 Jul 2018 10:59:22 -0400 Subject: [PATCH 04/11] cleaning --- sklearn/utils/estimator_checks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 5234e92323da8..744406f9a6829 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -556,7 +556,6 @@ def check_sample_weights_list(name, estimator_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_sample_weight_invariance(name, estimator_orig): -<<<<<<< HEAD if (has_fit_parameter(estimator_orig, "sample_weight") and name not in ["KMeans", "MiniBatchKMeans"]): estimator1 = clone(estimator_orig) @@ -592,7 +591,7 @@ def check_sample_weight_invariance(name, estimator_orig): except ValueError: raise ValueError("For %s sample_weight=None is not equivalent to " "sample_weight=ones" % name) - + @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) def check_dtype_object(name, estimator_orig): From 0738abc8a4f682f998a6893d4eb0cd4fc6b48706 Mon Sep 17 00:00:00 2001 From: sergul Date: Mon, 16 Jul 2018 11:38:32 -0400 Subject: [PATCH 05/11] dtype and X_trans --- sklearn/utils/estimator_checks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 744406f9a6829..873fe91bace60 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -564,9 +564,9 @@ def check_sample_weight_invariance(name, estimator_orig): X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], [2, 1], [2, 1], [2, 1], [2, 1], [3, 3], [3, 3], [3, 3], [3, 3], - [4, 1], [4, 1], [4, 1], [4, 1]]) + [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.dtype('float')) y = np.array([1, 1, 1, 1, 2, 2, 2, 2, - 1, 1, 1, 1, 2, 2, 2, 2]) + 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('float')) if has_fit_parameter(estimator_orig, "random_state"): estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y)), random_state=0) @@ -584,10 +584,10 @@ def check_sample_weight_invariance(name, estimator_orig): raise ValueError("For %s sample_weight=None is not equivalent to " "sample_weight=ones" % name) if hasattr(estimator_orig, "transform"): - X_pred1 = estimator1.transform(X) - X_pred2 = estimator2.transform(X) + X_trans1 = estimator1.transform(X) + X_trans2 = estimator2.transform(X) try: - assert_allclose(X_pred1, X_pred2, rtol=0.5) + assert_allclose(X_trans1, X_trans2, rtol=0.5) except ValueError: raise ValueError("For %s sample_weight=None is not equivalent to " "sample_weight=ones" % name) From 1d93961215937b45dfbdc07378578d068c643494 Mon Sep 17 00:00:00 2001 From: sergul Date: Tue, 17 Jul 2018 02:12:52 -0400 Subject: [PATCH 06/11] after second reviews --- sklearn/utils/estimator_checks.py | 41 ++++++++++++------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 873fe91bace60..d3ea1b5b866f4 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -556,41 +556,32 @@ def check_sample_weights_list(name, estimator_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) def check_sample_weight_invariance(name, estimator_orig): + # check that the estimators yield same results for + # unit weights and no weights if (has_fit_parameter(estimator_orig, "sample_weight") and name not in ["KMeans", "MiniBatchKMeans"]): estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) + set_random_state(estimator1, random_state=42) + set_random_state(estimator2, random_state=42) X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], [2, 1], [2, 1], [2, 1], [2, 1], [3, 3], [3, 3], [3, 3], [3, 3], [4, 1], [4, 1], [4, 1], [4, 1]], dtype=np.dtype('float')) y = np.array([1, 1, 1, 1, 2, 2, 2, 2, - 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('float')) - - if has_fit_parameter(estimator_orig, "random_state"): - estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y)), random_state=0) - estimator2.fit(X, y=y, sample_weight=None, random_state=0) - else: - estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) - estimator2.fit(X, y=y, sample_weight=None) - - if hasattr(estimator_orig, "predict"): - X_pred1 = estimator1.predict(X) - X_pred2 = estimator2.predict(X) - try: - assert_allclose(X_pred1, X_pred2, rtol=0.5) - except ValueError: - raise ValueError("For %s sample_weight=None is not equivalent to " - "sample_weight=ones" % name) - if hasattr(estimator_orig, "transform"): - X_trans1 = estimator1.transform(X) - X_trans2 = estimator2.transform(X) - try: - assert_allclose(X_trans1, X_trans2, rtol=0.5) - except ValueError: - raise ValueError("For %s sample_weight=None is not equivalent to " - "sample_weight=ones" % name) + 1, 1, 1, 1, 2, 2, 2, 2], dtype=np.dtype('int')) + + estimator1.fit(X, y=y, sample_weight=np.ones(shape=len(y))) + estimator2.fit(X, y=y, sample_weight=None) + + for method in ["predict", "transform"]: + if hasattr(estimator_orig, method): + X_pred1 = getattr(estimator1, method)(X) + X_pred2 = getattr(estimator2, method)(X) + assert_allclose(X_pred1, X_pred2, rtol=0.5, + err_msg="For %s sample_weight=None is not equivalent to " + "sample_weight=ones" % name) @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) From 6b7f20a1d4793d29b9486aa87d4c0980733d008e Mon Sep 17 00:00:00 2001 From: sergul Date: Tue, 17 Jul 2018 03:38:22 -0400 Subject: [PATCH 07/11] pyflake errors --- sklearn/utils/estimator_checks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index d3ea1b5b866f4..c003aff3996af 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -559,7 +559,7 @@ def check_sample_weight_invariance(name, estimator_orig): # check that the estimators yield same results for # unit weights and no weights if (has_fit_parameter(estimator_orig, "sample_weight") and - name not in ["KMeans", "MiniBatchKMeans"]): + name not in ["KMeans", "MiniBatchKMeans"]): estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) set_random_state(estimator1, random_state=42) @@ -580,8 +580,8 @@ def check_sample_weight_invariance(name, estimator_orig): X_pred1 = getattr(estimator1, method)(X) X_pred2 = getattr(estimator2, method)(X) assert_allclose(X_pred1, X_pred2, rtol=0.5, - err_msg="For %s sample_weight=None is not equivalent to " - "sample_weight=ones" % name) + err_msg="For %s sample_weight=None is not equivalent" + " to sample_weight=ones" % name) @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) From ad50ab98fa3d0bfab7f68478407dd4a1f09d4fa7 Mon Sep 17 00:00:00 2001 From: sergul Date: Tue, 17 Jul 2018 07:03:03 -0400 Subject: [PATCH 08/11] fixed pairwise error --- sklearn/utils/estimator_checks.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 481b7c511bee0..3af29350067b8 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -88,7 +88,7 @@ def _yield_non_meta_checks(name, estimator): yield check_dtype_object yield check_sample_weights_pandas_series yield check_sample_weights_list - yield check_sample_weight_invariance + yield check_sample_weights_invariance yield check_estimators_fit_returns_self yield partial(check_estimators_fit_returns_self, readonly_memmap=True) yield check_complex_data @@ -556,10 +556,12 @@ def check_sample_weights_list(name, estimator_orig): @ignore_warnings(category=(DeprecationWarning, FutureWarning)) -def check_sample_weight_invariance(name, estimator_orig): +def check_sample_weights_invariance(name, estimator_orig): # check that the estimators yield same results for # unit weights and no weights if (has_fit_parameter(estimator_orig, "sample_weight") and + not (hasattr(estimator_orig, "_pairwise") + and estimator_orig._pairwise) and name not in ["KMeans", "MiniBatchKMeans"]): estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) From 033a84a123d197b36806eda161216f0d2fbd8f03 Mon Sep 17 00:00:00 2001 From: Gael Varoquaux Date: Tue, 17 Jul 2018 15:03:16 +0200 Subject: [PATCH 09/11] PEP8: line too long --- sklearn/utils/estimator_checks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 3af29350067b8..6246d45974e45 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -583,8 +583,8 @@ def check_sample_weights_invariance(name, estimator_orig): X_pred1 = getattr(estimator1, method)(X) X_pred2 = getattr(estimator2, method)(X) assert_allclose(X_pred1, X_pred2, rtol=0.5, - err_msg="For %s sample_weight=None is not equivalent" - " to sample_weight=ones" % name) + err_msg="For %s sample_weight=None is not equivalent" + " to sample_weight=ones" % name) @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) From ef0371c4aecda03cadb3a402c1975aa71d4dc6ac Mon Sep 17 00:00:00 2001 From: sergul Date: Tue, 17 Jul 2018 09:59:07 -0400 Subject: [PATCH 10/11] add comments and pep8 issues --- sklearn/utils/estimator_checks.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 6246d45974e45..00abf31187d0a 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -563,6 +563,8 @@ def check_sample_weights_invariance(name, estimator_orig): not (hasattr(estimator_orig, "_pairwise") and estimator_orig._pairwise) and name not in ["KMeans", "MiniBatchKMeans"]): + # We skip pairwise because the data is not pairwise + # KMeans and MiniBatchKMeans were unstable; hence skipped. estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) set_random_state(estimator1, random_state=42) @@ -583,8 +585,9 @@ def check_sample_weights_invariance(name, estimator_orig): X_pred1 = getattr(estimator1, method)(X) X_pred2 = getattr(estimator2, method)(X) assert_allclose(X_pred1, X_pred2, rtol=0.5, - err_msg="For %s sample_weight=None is not equivalent" - " to sample_weight=ones" % name) + err_msg="For %s sample_weight=None is not" + " equivalent to sample_weight=ones" + % name) @ignore_warnings(category=(DeprecationWarning, FutureWarning, UserWarning)) From 1ca44abb85e0915b28a85bd6ab57c7110d006e30 Mon Sep 17 00:00:00 2001 From: sergul Date: Tue, 17 Jul 2018 10:20:31 -0400 Subject: [PATCH 11/11] KMeans methods are not skipped anymore --- sklearn/utils/estimator_checks.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py index 00abf31187d0a..1db6031e8d702 100644 --- a/sklearn/utils/estimator_checks.py +++ b/sklearn/utils/estimator_checks.py @@ -561,14 +561,13 @@ def check_sample_weights_invariance(name, estimator_orig): # unit weights and no weights if (has_fit_parameter(estimator_orig, "sample_weight") and not (hasattr(estimator_orig, "_pairwise") - and estimator_orig._pairwise) and - name not in ["KMeans", "MiniBatchKMeans"]): + and estimator_orig._pairwise)): # We skip pairwise because the data is not pairwise - # KMeans and MiniBatchKMeans were unstable; hence skipped. + estimator1 = clone(estimator_orig) estimator2 = clone(estimator_orig) - set_random_state(estimator1, random_state=42) - set_random_state(estimator2, random_state=42) + set_random_state(estimator1, random_state=0) + set_random_state(estimator2, random_state=0) X = np.array([[1, 3], [1, 3], [1, 3], [1, 3], [2, 1], [2, 1], [2, 1], [2, 1],