From 60e25fe359a3dcb47991b307b9f140a3392f6357 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 10 Jan 2019 16:14:32 +0000 Subject: [PATCH 1/7] Update docstring of preprocessing.StandardScaler so it's clear that the estimator of the standard deviation is the biased one --- sklearn/preprocessing/data.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index b78f1c11fab96..006c402b974bb 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -478,7 +478,8 @@ class StandardScaler(BaseEstimator, TransformerMixin): where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if - `with_std=False`. + `with_std=False`. Note that `s` is a biased estimator of the standard + deviation, equivalent to numpy.var(x, ddof=0). Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Mean and From b406ded946e5cd812f5d97fd054ce2e1ecc9d611 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 10 Jan 2019 16:21:08 +0000 Subject: [PATCH 2/7] Add np.sqrt to calculation of standard deviation --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 006c402b974bb..3b76a997e3406 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -479,7 +479,7 @@ class StandardScaler(BaseEstimator, TransformerMixin): where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if `with_std=False`. Note that `s` is a biased estimator of the standard - deviation, equivalent to numpy.var(x, ddof=0). + deviation, equivalent to np.sqrt(numpy.var(x, ddof=0)). 
Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Mean and From c91101e9fa38b648b44caf276deb15076b7f3ddb Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Thu, 10 Jan 2019 16:23:50 +0000 Subject: [PATCH 3/7] Change np to numpy --- sklearn/preprocessing/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 3b76a997e3406..f3310b7b7ec8d 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -479,7 +479,7 @@ class StandardScaler(BaseEstimator, TransformerMixin): where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if `with_std=False`. Note that `s` is a biased estimator of the standard - deviation, equivalent to np.sqrt(numpy.var(x, ddof=0)). + deviation, equivalent to numpy.sqrt(numpy.var(x, ddof=0)). Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Mean and From 06f807df4f2168fa4f77d25f9ee630c135c5fc87 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 11 Jan 2019 10:18:17 +0000 Subject: [PATCH 4/7] Add message explaining it's unlikely that the choice of estimator would affect model performance. --- sklearn/preprocessing/data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index f3310b7b7ec8d..529a2c76ebbff 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -479,7 +479,9 @@ class StandardScaler(BaseEstimator, TransformerMixin): where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if `with_std=False`. 
Note that `s` is a biased estimator of the standard - deviation, equivalent to numpy.sqrt(numpy.var(x, ddof=0)). + deviation, equivalent to numpy.sqrt(numpy.var(x, ddof=0)), and that it is + unlikely that using this estimator as opposed to its unbiased counterpart + will affect model performance. Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Mean and From a16334d6a7012259a0486dd9428a2d8b81b47384 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 11 Jan 2019 12:47:18 +0000 Subject: [PATCH 5/7] Move note about ddof in StandardScaler to 'Notes' section, and use np.std instead of np.var --- sklearn/preprocessing/data.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 529a2c76ebbff..76684547feee2 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -478,10 +478,7 @@ class StandardScaler(BaseEstimator, TransformerMixin): where `u` is the mean of the training samples or zero if `with_mean=False`, and `s` is the standard deviation of the training samples or one if - `with_std=False`. Note that `s` is a biased estimator of the standard - deviation, equivalent to numpy.sqrt(numpy.var(x, ddof=0)), and that it is - unlikely that using this estimator as opposed to its unbiased counterpart - will affect model performance. + `with_std=False`. Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Mean and @@ -577,6 +574,10 @@ class StandardScaler(BaseEstimator, TransformerMixin): ----- NaNs are treated as missing values: disregarded in fit, and maintained in transform. + + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. Note, however, that the choice of `ddof` is + unlikely to affect model performance. 
For a comparison of the different scalers, transformers, and normalizers, see :ref:`examples/preprocessing/plot_all_scaling.py From dc9c0ee04dc08af58b55be1e68035caaa04dba4f Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Fri, 11 Jan 2019 13:44:16 +0000 Subject: [PATCH 6/7] Remove 'however' --- sklearn/preprocessing/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 76684547feee2..6d8b1e4959505 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -576,8 +576,8 @@ class StandardScaler(BaseEstimator, TransformerMixin): transform. We use a biased estimator for the standard deviation, equivalent to - `numpy.std(x, ddof=0)`. Note, however, that the choice of `ddof` is - unlikely to affect model performance. + `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to + affect model performance. For a comparison of the different scalers, transformers, and normalizers, see :ref:`examples/preprocessing/plot_all_scaling.py From b62c70d8ceaea6871fe9910c88df071144b00414 Mon Sep 17 00:00:00 2001 From: Marco Gorelli Date: Mon, 14 Jan 2019 10:17:06 +0000 Subject: [PATCH 7/7] Replicate note in the function --- sklearn/preprocessing/data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sklearn/preprocessing/data.py b/sklearn/preprocessing/data.py index 6d8b1e4959505..9cf2dec48f139 100644 --- a/sklearn/preprocessing/data.py +++ b/sklearn/preprocessing/data.py @@ -123,6 +123,10 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True): NaNs are treated as missing values: disregarded to compute the statistics, and maintained during the data transformation. + We use a biased estimator for the standard deviation, equivalent to + `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to + affect model performance. 
+ For a comparison of the different scalers, transformers, and normalizers, see :ref:`examples/preprocessing/plot_all_scaling.py `.