DOC improve wording of time-related feature engineering example #20759

Merged
merged 2 commits into from
Aug 17, 2021
Changes from all commits
110 changes: 57 additions & 53 deletions examples/applications/plot_cyclical_feature_engineering.py
@@ -64,8 +64,9 @@
#
# When reporting performance measures on the test set in the discussion, we
# instead choose to focus on the mean absolute error that is more
# intuitive than the (root) mean squared error. Note however that the best
# models for one metric are also the best for the other in this study.
# intuitive than the (root) mean squared error. Note, however, that the
# best models for one metric are also the best for the other in this
# study.
y = df["count"] / 1000
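
The `evaluate` helper referenced in later hunk headers is collapsed in this diff; a minimal sketch of what such a helper could look like, assuming it cross-validates with the two metrics discussed above, is:

from sklearn.model_selection import cross_validate


def evaluate(model, X, y, cv):
    # Cross-validate with both error metrics discussed above; scikit-learn
    # negates error metrics so that larger is better, hence the sign flips.
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        f"Mean Absolute Error:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )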

# %%
@@ -171,11 +172,11 @@
# let the model know that it should treat those as categorical variables by
# using a dedicated tree splitting rule. Since we use an ordinal encoder, we
# pass the list of categorical values explicitly to use a logical order when
# encoding the categories as integer instead of the lexicographical order. This
# also has the added benefit of preventing any issue with unknown categories
# when using cross-validation.
# encoding the categories as integers instead of the lexicographical order.
# This also has the added benefit of preventing any issue with unknown
# categories when using cross-validation.
#
# The numerical variable need no preprocessing and, for the sake of simplicity,
# The numerical variables need no preprocessing and, for the sake of simplicity,
# we only try the default hyper-parameters for this model:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
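
A sketch of how the resulting gradient boosting pipeline could be assembled (the column names, category orderings and the use of `categorical_features` are assumptions about the bike sharing dataframe used in this example; hyper-parameters are left at their defaults):

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor

# Assumed categorical columns and their logical category orderings; adapt
# them if the actual dataframe differs.
categorical_columns = ["weather", "season", "holiday", "workingday"]
categories = [
    ["clear", "misty", "rain"],
    ["spring", "summer", "fall", "winter"],
    ["False", "True"],
    ["False", "True"],
]

gbrt_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", OrdinalEncoder(categories=categories), categorical_columns),
        ],
        remainder="passthrough",
    ),
    # the first four transformed columns are the ordinal-encoded categories,
    # so the trees can use a dedicated categorical splitting rule for them
    HistGradientBoostingRegressor(categorical_features=range(4)),
)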
@@ -243,7 +244,7 @@ def evaluate(model, X, y, cv):
# of a problem for tree-based models as they can learn a non-monotonic
# relationship between ordinal input features and the target.
#
# This is not the case for linear regression model as we will see in the
# This is not the case for linear regression models as we will see in the
# following.
#
# Naive linear regression
@@ -279,25 +280,26 @@ def evaluate(model, X, y, cv):
#
# The performance is not good: the average error is around 14% of the maximum
# demand. This is more than three times higher than the average error of the
# gradient boosting model. We can suspect that the naive original encoding of
# the periodic time-related features might prevent the linear regression model
# to properly leverage the time information: linear regression does not model
# non-monotonic relationships between the input features and the target.
# Non-linear terms have to be engineered in the input.
# gradient boosting model. We can suspect that the naive original encoding
# (merely min-max scaled) of the periodic time-related features might prevent
# the linear regression model from properly leveraging the time information: linear
# regression does not automatically model non-monotonic relationships between
# the input features and the target. Non-linear terms have to be engineered in
# the input.
#
# For example, the raw numerical encoding of the `"hour"` feature prevents the
# linear model from recognizing that an increase of hour in the morning from 6
# to 8 should have a strong positive impact on the number of bike rentals while
# a increase of similar magnitude in the evening from 18 to 20 should have a
# an increase of similar magnitude in the evening from 18 to 20 should have a
# strong negative impact on the predicted number of bike rentals.
#
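
For reference, the naive linear pipeline discussed in this section could be sketched along these lines (reusing the `categorical_columns` list assumed in the earlier sketch; `alphas` is an illustrative regularization grid):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

alphas = np.logspace(-6, 6, 25)
naive_linear_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            # non-time categorical columns get a plain one-hot encoding
            ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ],
        # the ordinal time features ("hour", "weekday", "month") and the other
        # numerical columns are merely min-max scaled, i.e. the naive encoding
        # criticized above
        remainder=MinMaxScaler(),
    ),
    RidgeCV(alphas=alphas),
)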
# Time-steps as categories
# ------------------------
#
# Since the time features are encoded in a discrete manner using integers (24
# unique values in the "hours" feature), we could decide to treat those as
# categorical variables and ignore any assumption implied by the ordering of
# the hour values using a one-hot encoding.
# categorical variables using a one-hot encoding and thereby ignore any
# assumption implied by the ordering of the hour values.
#
# Using one-hot encoding for the time features gives the linear model a lot
# more flexibility as we introduce one additional feature per discrete time
@@ -317,8 +319,8 @@ def evaluate(model, X, y, cv):

# %%
# The average error rate of this model is 10% which is much better than using
# the original ordinal encoding of the time feature, confirming our intuition
# that the linear regression model benefit from the added flexibility to not
# the original (ordinal) encoding of the time feature, confirming our intuition
# that the linear regression model benefits from the added flexibility to not
# treat time progression in a monotonic manner.
#
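
A sketch of the corresponding one-hot encoded variant, reusing the imports and names from the previous sketch (the list of time columns is an assumption):

one_hot_linear_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
            # treat each discrete time step as its own category
            ("one_hot_time", OneHotEncoder(handle_unknown="ignore"), ["month", "weekday", "hour"]),
        ],
        remainder=MinMaxScaler(),
    ),
    RidgeCV(alphas=alphas),
)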
# However, this introduces a very large number of new features. If the time of
@@ -330,7 +332,7 @@ def evaluate(model, X, y, cv):
# benefitting from the non-monotonic expressivity advantages of one-hot
# encoding.
#
# Finally, we also observe than one-hot encoding completely ignores the
# Finally, we also observe that one-hot encoding completely ignores the
# ordering of the hour levels while this could be an interesting inductive bias
# to preserve to some level. In the following we try to explore smooth,
# non-monotonic encoding that locally preserves the relative ordering of time
@@ -340,7 +342,7 @@ def evaluate(model, X, y, cv):
# ----------------------
#
# As a first attempt, we can try to encode each of those periodic features
# using a sine and cosine transform with the matching period.
# using a sine and cosine transformation with the matching period.
#
# Each ordinal time feature is transformed into 2 features that together encode
# equivalent information in a non-monotonic way, and more importantly without
@@ -375,9 +377,9 @@ def cos_transformer(period):
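
The `sin_transformer` and `cos_transformer` helpers referenced in this hunk are collapsed; they could be written roughly as follows with `FunctionTransformer`:

import numpy as np
from sklearn.preprocessing import FunctionTransformer


def sin_transformer(period):
    # map an ordinal value onto the sine of its phase within the period
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


def cos_transformer(period):
    # the matching cosine, so that each (sin, cos) pair encodes a unique phase
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))
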
#
# Let's use a 2D scatter plot with the hours encoded as colors to better see
# how this representation maps the 24 hours of the day to a 2D space, akin to
# some sort of 24 hour version of an analog clock. Note that the "25th" hour is
# mapped back to the 1st hour because of the periodic nature of the sine/cosine
# representation.
# some sort of a 24 hour version of an analog clock. Note that the "25th" hour
# is mapped back to the 1st hour because of the periodic nature of the
# sine/cosine representation.
fig, ax = plt.subplots(figsize=(7, 5))
sp = ax.scatter(hour_df["hour_sin"], hour_df["hour_cos"], c=hour_df["hour"])
ax.set(
@@ -420,7 +422,8 @@ def cos_transformer(period):
#
# We can try an alternative encoding of the periodic time-related features
# using spline transformations with a large enough number of splines, and as a
# result a larger number of expanded features:
# result a larger number of expanded features compared to the sine/cosine
# transformation:
from sklearn.preprocessing import SplineTransformer
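
The definition of `periodic_spline_transformer` is also collapsed below; a sketch of how such a helper could be written:

import numpy as np


def periodic_spline_transformer(period, n_splines=None, degree=3):
    if n_splines is None:
        n_splines = period
    # periodic extrapolation makes hour=23 connect smoothly back to hour=0;
    # with periodic splines the last knot wraps around to the first one
    n_knots = n_splines + 1
    return SplineTransformer(
        degree=degree,
        n_knots=n_knots,
        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
        extrapolation="periodic",
        include_bias=True,
    )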


@@ -485,8 +488,8 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
# ~10% of the maximum demand, which is similar to what we observed with the
# one-hot encoded features.
#
# Qualitative analysis of the impact of features on linear models predictions
# ---------------------------------------------------------------------------
# Qualitative analysis of the impact of features on linear model predictions
# --------------------------------------------------------------------------
#
# Here, we want to visualize the impact of the feature engineering choices on
# the time related shape of the predictions.
@@ -539,13 +542,13 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
# %%
# We can draw the following conclusions from the above plot:
#
# - the **raw ordinal time-related features** are problematic because they do
# - The **raw ordinal time-related features** are problematic because they do
# not capture the natural periodicity: we observe a big jump in the
# predictions at the end of each day when the hour features goes from 23 back
# to 0. We can expect similar artifacts at the end of each week or each year.
#
# - as expected, the **trigonometric features** (sine and cosine) do not have
# these discontinuities at midnight but the linear regression model fails to
# - As expected, the **trigonometric features** (sine and cosine) do not have
# these discontinuities at midnight, but the linear regression model fails to
# leverage those features to properly model intra-day variations.
# Using trigonometric features for higher harmonics or additional
# trigonometric features for the natural period with different phases could
@@ -557,7 +560,7 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
# `extrapolation="periodic"` option enforces a smooth representation between
# `hour=23` and `hour=0`.
#
# - the **one-hot encoded features** behave similarly to the periodic
# - The **one-hot encoded features** behave similarly to the periodic
# spline-based features but are more spiky: for instance they can better
# model the morning peak during the week days since this peak lasts less
# than an hour. However, we will see in the following that what can be an
@@ -592,21 +595,21 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
# under-estimate the commuting-related events during the working days.
#
# These systematic prediction errors reveal a form of under-fitting and can be
# explained by the lack of non-additive modeling of the interactions between
# features (in this case "workingday" and features derived from "hours"). This
# issue will be addressed in the following section.
# explained by the lack of interaction terms between features, e.g.
# "workingday" and features derived from "hours". This issue will be addressed
# in the following section.

# %%
# Modeling pairwise interactions with splines and polynomial features
# -------------------------------------------------------------------
#
# Linear models alone cannot model interaction effects between input features.
# It does not help that some features are marginally non-linear as is the case
# with features constructed by `SplineTransformer` (or one-hot encoding or
# binning).
# Linear models do not automatically capture interaction effects between input
# features. It does not help that some features are marginally non-linear as is
# the case with features constructed by `SplineTransformer` (or one-hot
# encoding or binning).
#
# However, it is possible to use the `PolynomialFeatures` class on coarse
# grained splined encoded hours to model the "workingday"/"hours" interaction
# grained spline encoded hours to model the "workingday"/"hours" interaction
# explicitly without introducing too many new variables:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import FeatureUnion
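
A sketch of how this interaction block could be built (the spline width, the `workingday` marker transform and the variable names are illustrative assumptions, reusing the `periodic_spline_transformer` helper sketched earlier):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer

# coarse periodic spline encoding of the hour, combined multiplicatively with
# a 0/1 "workingday" marker through interaction-only polynomial features
hour_workday_interaction = make_pipeline(
    ColumnTransformer(
        [
            ("cyclic_hour", periodic_spline_transformer(24, n_splines=8), ["hour"]),
            # assuming "workingday" is stored as the strings "False"/"True"
            ("workingday", FunctionTransformer(lambda x: x == "True"), ["workingday"]),
        ]
    ),
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
)

A `FeatureUnion` can then concatenate these interaction features with the marginal spline and one-hot features before the final ridge step.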
@@ -644,16 +647,16 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
#
# The previous analysis highlighted the need to model the interactions between
# `"workingday"` and `"hours"`. Another example of such a non-linear
# interactions that we would like to model could be the impact of the rain that
# interaction that we would like to model could be the impact of the rain that
# might not be the same during the working days and the week-ends and holidays
# for instance.
#
# To model all such interactions, we could either use a polynomial expansion on
# all marginal features at once, after their spline-based expansion. However
# all marginal features at once, after their spline-based expansion. However,
# this would create a quadratic number of features which can cause overfitting
# and computational tractability issues.
#
# Alternatively we can use the Nyström method to compute an approximate
# Alternatively, we can use the Nyström method to compute an approximate
# polynomial kernel expansion. Let us try the latter:
from sklearn.kernel_approximation import Nystroem
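
A sketch of how the Nyström approximation could be inserted in the pipeline (the `cyclic_spline_transformer` step stands for the spline-based ColumnTransformer assembled earlier in the example but not shown in this diff; the kernel hyper-parameters are illustrative):

from sklearn.linear_model import RidgeCV

cyclic_spline_poly_pipeline = make_pipeline(
    cyclic_spline_transformer,  # spline-encoded time + one-hot categorical features
    # low-rank approximation of a degree-2 polynomial kernel
    Nystroem(kernel="poly", degree=2, coef0=1, n_components=300, random_state=0),
    RidgeCV(alphas=alphas),  # alphas as defined in the earlier sketch
)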

@@ -693,11 +696,11 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):


# %%
# While one-hot features were competitive with spline-based features when using
# linear models, this is no longer the case when using a low-rank approximation
# of a non-linear kernel: this can be explained by the fact that spline
# features are smoother and allow the kernel approximation to find a more
# expressive decision function.
# While one-hot encoded features were competitive with spline-based features
# when using linear models, this is no longer the case when using a low-rank
# approximation of a non-linear kernel: this can be explained by the fact that
# spline features are smoother and allow the kernel approximation to find a
# more expressive decision function.
#
# Let us now have a qualitative look at the predictions of the kernel models
# and of the gradient boosted trees that should be able to better model
@@ -747,13 +750,13 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
# since, by default, decision trees are allowed to grow beyond a depth of 2
# levels.
#
# Here we can observe that the combinations of spline features and non-linear
# Here, we can observe that the combinations of spline features and non-linear
# kernels work quite well and can almost rival the accuracy of the gradient
# boosting regression trees.
#
# On the contrary, one-hot time features do not perform that well with the low
# rank kernel model. In particular they significantly over-estimate the low
# demand hours more than the competing models.
# On the contrary, one-hot encoded time features do not perform that well with
# the low rank kernel model. In particular, they significantly over-estimate
# the low demand hours more than the competing models.
#
# We also observe that none of the models can successfully predict some of the
# peak rentals at the rush hours during the working days. It is possible that
@@ -791,7 +794,7 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
# %%
# This visualization confirms the conclusions we drew from the previous plot.
#
# All models under-estimate the high demand events (working days rush hours),
# All models under-estimate the high demand events (working day rush hours),
# but gradient boosting a bit less so. The low demand events are well predicted
# on average by gradient boosting while the one-hot polynomial regression
# pipeline seems to systematically over-estimate demand in that regime. Overall
@@ -804,9 +807,10 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
# We note that we could have obtained slightly better results for kernel models
# by using more components (higher rank kernel approximation) at the cost of
# longer fit and prediction durations. For large values of `n_components`, the
# performance of the one-hot features would even match the spline features.
# performance of the one-hot encoded features would even match the spline
# features.
#
# The `Nystroem` + `RidgeCV` classifier could also have been replaced by
# The `Nystroem` + `RidgeCV` regressor could also have been replaced by
# :class:`~sklearn.neural_network.MLPRegressor` with one or two hidden layers
# and we would have obtained quite similar results.
#
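
Such an MLP-based alternative could be sketched as follows (layer sizes and other hyper-parameters are untuned guesses; `cyclic_spline_transformer` is the same feature-engineering step assumed above):

from sklearn.neural_network import MLPRegressor

mlp_pipeline = make_pipeline(
    cyclic_spline_transformer,  # reuse the spline-based feature engineering
    MLPRegressor(
        hidden_layer_sizes=(64, 32),
        alpha=1e-4,
        early_stopping=True,
        max_iter=500,
        random_state=0,
    ),
)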
@@ -818,7 +822,7 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
# flexibility.
#
# Finally, in this notebook we used `RidgeCV` because it is very efficient from
# a computational point of view. However it models the target variable as a
# a computational point of view. However, it models the target variable as a
# Gaussian random variable with constant variance. For positive regression
# problems, it is likely that using a Poisson or Gamma distribution would make
# more sense. This could be achieved by using