From 99c3af4ca9e3c1f4d775d797fa6b5f83387c5513 Mon Sep 17 00:00:00 2001
From: Christian Lorentzen
Date: Mon, 16 Aug 2021 21:10:13 +0200
Subject: [PATCH 1/2] EXA improve wording of Time-related feature engineering

---
 .../plot_cyclical_feature_engineering.py | 108 +++++++++---------
 1 file changed, 56 insertions(+), 52 deletions(-)

diff --git a/examples/applications/plot_cyclical_feature_engineering.py b/examples/applications/plot_cyclical_feature_engineering.py
index 7907d7bda5f23..b41e3e3012e3b 100644
--- a/examples/applications/plot_cyclical_feature_engineering.py
+++ b/examples/applications/plot_cyclical_feature_engineering.py
@@ -64,8 +64,9 @@
 #
-# When reporting performance measure on the test set in the discussion, we
+# When reporting performance measures on the test set in the discussion, we
 # instead choose to focus on the mean absolute error that is more
-# intuitive than the (root) mean squared error. Note however that the best
-# models for one metric are also the best for the other in this study.
+# intuitive than the (root) mean squared error. Note, however, that the
+# best models for one metric are also the best for the other in this
+# study.
 y = df["count"] / 1000

 # %%
@@ -171,11 +172,11 @@
 # let the model know that it should treat those as categorical variables by
 # using a dedicated tree splitting rule. Since we use an ordinal encoder, we
 # pass the list of categorical values explicitly to use a logical order when
-# encoding the categories as integer instead of the lexicographical order. This
-# also has the added benefit of preventing any issue with unknown categories
-# when using cross-validation.
+# encoding the categories as integers instead of the lexicographical order.
+# This also has the added benefit of preventing any issue with unknown
+# categories when using cross-validation.
 #
-# The numerical variable need no preprocessing and, for the sake of simplicity,
+# The numerical variables need no preprocessing and, for the sake of simplicity,
 # we only try the default hyper-parameters for this model:
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import OrdinalEncoder
@@ -243,7 +244,7 @@ def evaluate(model, X, y, cv):
 # of a problem for tree-based models as they can learn a non-monotonic
 # relationship between ordinal input features and the target.
 #
-# This is not the case for linear regression model as we will see in the
+# This is not the case for linear regression models as we will see in the
 # following.
 #
 # Naive linear regression
@@ -279,16 +280,17 @@
 #
 # The performance is not good: the average error is around 14% of the maximum
 # demand. This is more than three times higher than the average error of the
-# gradient boosting model. We can suspect that the naive original encoding of
-# the periodic time-related features might prevent the linear regression model
-# to properly leverage the time information: linear regression does not model
-# non-monotonic relationships between the input features and the target.
-# Non-linear terms have to be engineered in the input.
+# gradient boosting model. We can suspect that the naive original encoding
+# (merely min-max scaled) of the periodic time-related features might prevent
+# the linear regression model from properly leveraging the time information:
+# linear regression does not automatically model non-monotonic relationships
+# between the input features and the target. Non-linear terms have to be
+# engineered in the input.
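
# %%
# To make the discussed baseline concrete, here is a minimal sketch of a
# min-max scaled linear pipeline of the kind criticized above. It is purely
# illustrative: the `alphas` grid is an assumption, and the real example
# selects and encodes columns with a `ColumnTransformer` instead of scaling
# everything.
import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

alphas = np.logspace(-6, 6, 25)  # assumed regularization grid
naive_linear_pipeline = make_pipeline(
    MinMaxScaler(),  # "hour" stays a single, monotonically encoded feature
    RidgeCV(alphas=alphas),
)
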
 #
 # For example, the raw numerical encoding of the `"hour"` feature prevents the
 # linear model from recognizing that an increase of hour in the morning from 6
 # to 8 should have a strong positive impact on the number of bike rentals while
-# a increase of similar magnitude in the evening from 18 to 20 should have a
+# an increase of similar magnitude in the evening from 18 to 20 should have a
 # strong negative impact on the predicted number of bike rentals.
 #
 # Time-steps as categories
@@ -296,8 +298,8 @@
 #
 # Since the time features are encoded in a discrete manner using integers (24
 # unique values in the "hours" feature), we could decide to treat those as
-# categorical variables and ignore any assumption implied by the ordering of
-# the hour values using a one-hot encoding.
+# categorical variables using a one-hot encoding and thereby ignore any
+# assumption implied by the ordering of the hour values.
 #
 # Using one-hot encoding for the time features gives the linear model a lot
 # more flexibility as we introduce one additional feature per discrete time
@@ -317,8 +319,8 @@
 # %%
 # The average error rate of this model is 10% which is much better than using
-# the original ordinal encoding of the time feature, confirming our intuition
-# that the linear regression model benefit from the added flexibility to not
+# the original (ordinal) encoding of the time feature, confirming our intuition
+# that the linear regression model benefits from the added flexibility to not
 # treat time progression in a monotonic manner.
 #
 # However, this introduces a very large number of new features. If the time of
@@ -330,7 +332,7 @@
 # benefitting from the non-monotonic expressivity advantages of one-hot
 # encoding.
 #
-# Finally, we also observe than one-hot encoding completely ignores the
+# Finally, we also observe that one-hot encoding completely ignores the
 # ordering of the hour levels while this could be an interesting inductive bias
 # to preserve to some level. In the following we try to explore smooth,
 # non-monotonic encoding that locally preserves the relative ordering of time
 # features.
@@ -340,7 +342,7 @@
 # ----------------------
 #
 # As a first attempt, we can try to encode each of those periodic features
-# using a sine and cosine transform with the matching period.
+# using a sine and cosine transformation with the matching period.
 #
 # Each ordinal time feature is transformed into 2 features that together encode
 # equivalent information in a non-monotonic way, and more importantly without
@@ -375,9 +377,9 @@ def cos_transformer(period):
 #
 # Let's use a 2D scatter plot with the hours encoded as colors to better see
 # how this representation maps the 24 hours of the day to a 2D space, akin to
-# some sort of 24 hour version of an analog clock. Note that the "25th" hour is
-# mapped back to the 1st hour because of the periodic nature of the sine/cosine
-# representation.
+# some sort of a 24-hour version of an analog clock. Note that the "25th" hour
+# is mapped back to the 1st hour because of the periodic nature of the
+# sine/cosine representation.
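
# %%
# For reference, the trigonometric encoding discussed above boils down to two
# small helpers like the ones below. This sketch mirrors the `cos_transformer`
# definition referenced in the hunk header above; treat the exact wrapping as
# illustrative rather than authoritative.
import numpy as np
from sklearn.preprocessing import FunctionTransformer


def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))


def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))


# Hour 23 and hour 0 are mapped to nearby points on the circle:
hours = np.array([[23.0], [0.0]])
print(sin_transformer(24).fit_transform(hours))
print(cos_transformer(24).fit_transform(hours))
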
 fig, ax = plt.subplots(figsize=(7, 5))
 sp = ax.scatter(hour_df["hour_sin"], hour_df["hour_cos"], c=hour_df["hour"])
 ax.set(
@@ -420,7 +422,8 @@ def cos_transformer(period):
 #
 # We can try an alternative encoding of the periodic time-related features
 # using spline transformations with a large enough number of splines, and as a
-# result a larger number of expanded features:
+# result a larger number of expanded features compared to the sine/cosine
+# transformation:
 from sklearn.preprocessing import SplineTransformer
@@ -485,7 +488,7 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # ~10% of the maximum demand, which is similar to what we observed with the
 # one-hot encoded features.
 #
-# Qualitative analysis of the impact of features on linear models predictions
+# Qualitative analysis of the impact of features on linear model predictions
 # ---------------------------------------------------------------------------
 #
 # Here, we want to visualize the impact of the feature engineering choices on
@@ -539,13 +542,13 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # %%
 # We can draw the following conclusions from the above plot:
 #
-# - the **raw ordinal time-related features** are problematic because they do
+# - The **raw ordinal time-related features** are problematic because they do
 # not capture the natural periodicity: we observe a big jump in the
-# predictions at the end of each day when the hour features goes from 23 back
+# predictions at the end of each day when the hour feature goes from 23 back
 # to 0. We can expect similar artifacts at the end of each week or each year.
 #
-# - as expected, the **trigonometric features** (sine and cosine) do not have
-# these discontinuities at midnight but the linear regression model fails to
+# - As expected, the **trigonometric features** (sine and cosine) do not have
+# these discontinuities at midnight, but the linear regression model fails to
 # leverage those features to properly model intra-day variations.
 # Using trigonometric features for higher harmonics or additional
 # trigonometric features for the natural period with different phases could
 # have the potential to fix those problems.
 #
-# - the **periodic spline-based features** fix those two problems at once: they
+# - The **periodic spline-based features** fix those two problems at once: they
 # give more expressivity to the model by making it possible to focus on specific
 # hours thanks to the use of 12 splines. Furthermore the
@@ -557,7 +560,7 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # `extrapolation="periodic"` option enforces a smooth representation between
 # `hour=23` and `hour=0`.
 #
-# - the **one-hot encoded features** behave similarly to the periodic
+# - The **one-hot encoded features** behave similarly to the periodic
 # spline-based features but are more spiky: for instance they can better
 # model the morning peak during the week days since this peak lasts shorter
 # than an hour. However, we will see in the following that what can be an
@@ -592,21 +595,21 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # under-estimate the commuting-related events during the working days.
 #
 # These systematic prediction errors reveal a form of under-fitting and can be
-# explained by the lack of non-additive modeling of the interactions between
-# features (in this case "workingday" and features derived from "hours"). This
-# issue will be addressed in the following section.
+# explained by the lack of interaction terms between features, e.g.
+# "workingday" and features derived from "hours". This issue will be addressed
+# in the following section.
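
# %%
# Before moving on to interaction modeling, here is a self-contained sketch of
# the periodic spline encoding used throughout the analysis above. It is
# assumed to be close to, but not necessarily identical with, the
# `periodic_spline_transformer` helper referenced in the hunk headers.
import numpy as np
from sklearn.preprocessing import SplineTransformer


def periodic_spline_transformer(period, n_splines=None, degree=3):
    if n_splines is None:
        n_splines = period
    n_knots = n_splines + 1  # periodic and include_bias is True
    return SplineTransformer(
        degree=degree,
        n_knots=n_knots,
        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
        extrapolation="periodic",
        include_bias=True,
    )


# 12 splines for the 24 hours of the day: hour=0 and hour=24 encode identically.
hour_splines = periodic_spline_transformer(24, n_splines=12)
print(hour_splines.fit_transform(np.array([[0.0], [23.0], [24.0]])).shape)
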

 # %%
 # Modeling pairwise interactions with splines and polynomial features
 # -------------------------------------------------------------------
 #
-# Linear models alone cannot model interaction effects between input features.
-# It does not help that some features are marginally non-linear as is the case
-# with features constructed by `SplineTransformer` (or one-hot encoding or
-# binning).
+# Linear models do not automatically capture interaction effects between input
+# features. It does not help that some features are marginally non-linear as is
+# the case with features constructed by `SplineTransformer` (or one-hot
+# encoding or binning).
 #
 # However, it is possible to use the `PolynomialFeatures` class on coarse
-# grained splined encoded hours to model the "workingday"/"hours" interaction
+# grained spline-encoded hours to model the "workingday"/"hours" interaction
 # explicitly without introducing too many new variables:
 from sklearn.preprocessing import PolynomialFeatures
 from sklearn.pipeline import FeatureUnion
@@ -644,16 +647,16 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 #
 # The previous analysis highlighted the need to model the interactions between
-# `"workingday"` and `"hours"`. Another example of a such a non-linear
-# interactions that we would like to model could be the impact of the rain that
+# `"workingday"` and `"hours"`. Another example of such a non-linear
+# interaction that we would like to model could be the impact of the rain that
 # might not be the same during the working days and the week-ends and holidays
 # for instance.
 #
 # To model all such interactions, we could either use a polynomial expansion on
-# all marginal features at once, after their spline-based expansion. However
+# all marginal features at once, after their spline-based expansion. However,
 # this would create a quadratic number of features which can cause overfitting
 # and computational tractability issues.
 #
-# Alternatively we can use the Nyström method to compute an approximate
+# Alternatively, we can use the Nyström method to compute an approximate
 # polynomial kernel expansion. Let us try the latter:
 from sklearn.kernel_approximation import Nystroem
@@ -693,11 +696,11 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # %%
-# While one-hot features were competitive with spline-based features when using
-# linear models, this is no longer the case when using a low-rank approximation
-# of a non-linear kernel: this can be explained by the fact that spline
-# features are smoother and allow the kernel approximation to find a more
-# expressive decision function.
+# While one-hot encoded features were competitive with spline-based features
+# when using linear models, this is no longer the case when using a low-rank
+# approximation of a non-linear kernel: this can be explained by the fact that
+# spline features are smoother and allow the kernel approximation to find a
+# more expressive decision function.
 #
 # Let us now have a qualitative look at the predictions of the kernel models
 # and of the gradient boosted trees that should be able to better model
 # non-linear interactions between features:
@@ -747,13 +750,13 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # since, by default, decision trees are allowed to grow beyond a depth of 2
 # levels.
 #
-# Here we can observe that the combinations of spline features and non-linear
+# Here, we can observe that the combination of spline features and non-linear
 # kernels works quite well and can almost rival the accuracy of the gradient
 # boosting regression trees.
 #
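
# %%
# A self-contained sketch of the spline + Nyström combination praised above,
# on a synthetic periodic signal. The toy data, the `n_components` value and
# the `alphas` grid are illustrative assumptions, not values from this example.
import numpy as np
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer

rng = np.random.RandomState(0)
X_toy = rng.uniform(0, 24, size=(500, 1))  # hour-of-day-like feature
y_toy = np.sin(X_toy[:, 0] / 24 * 2 * np.pi) + rng.normal(scale=0.1, size=500)

kernel_pipeline = make_pipeline(
    SplineTransformer(n_knots=13, extrapolation="periodic"),
    Nystroem(kernel="poly", degree=2, n_components=50, random_state=0),
    RidgeCV(alphas=np.logspace(-6, 6, 25)),
).fit(X_toy, y_toy)
print(kernel_pipeline.score(X_toy, y_toy))
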
-# On the contrary, one-hot time features do not perform that well with the low
-# rank kernel model. In particular they significantly over-estimate the low
-# demand hours more than the competing models.
+# In contrast, one-hot encoded time features do not perform that well with the
+# low-rank kernel model. In particular, they significantly over-estimate the
+# low demand hours more than the competing models.
 #
 # We also observe that none of the models can successfully predict some of the
 # peak rentals at the rush hours during the working days. It is possible that
@@ -791,7 +794,7 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # %%
 # This visualization confirms the conclusions we draw on the previous plot.
 #
-# All models under-estimate the high demand events (working days rush hours),
+# All models under-estimate the high demand events (working day rush hours),
 # but gradient boosting a bit less so. The low demand events are well predicted
 # on average by gradient boosting while the one-hot polynomial regression
 # pipeline seems to systematically over-estimate demand in that regime. Overall
@@ -804,9 +807,10 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # We note that we could have obtained slightly better results for kernel models
 # by using more components (higher rank kernel approximation) at the cost of
 # longer fit and prediction durations. For large values of `n_components`, the
-# performance of the one-hot features would even match the spline features.
+# performance of the one-hot encoded features would even match the spline
+# features.
 #
-# The `Nystroem` + `RidgeCV` classifier could also have been replaced by
+# The `Nystroem` + `RidgeCV` regressor could also have been replaced by
 # :class:`~sklearn.neural_network.MLPRegressor` with one or two hidden layers
 # and we would have obtained quite similar results.
 #
@@ -818,7 +822,7 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # flexibility.
 #
 # Finally, in this notebook we used `RidgeCV` because it is very efficient from
-# a computational point of view. However it models the target variable as a
+# a computational point of view. However, it models the target variable as a
 # Gaussian random variable with constant variance. For positive regression
 # problems, it is likely that using a Poisson or Gamma distribution would make
 # more sense. This could be achieved by using

From d7f5ac41e38d3df57b2b1b0bc523eb43d520b619 Mon Sep 17 00:00:00 2001
From: Olivier Grisel
Date: Tue, 17 Aug 2021 14:52:03 +0200
Subject: [PATCH 2/2] Update examples/applications/plot_cyclical_feature_engineering.py

---
 examples/applications/plot_cyclical_feature_engineering.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/applications/plot_cyclical_feature_engineering.py b/examples/applications/plot_cyclical_feature_engineering.py
index b41e3e3012e3b..b28df2ba88a53 100644
--- a/examples/applications/plot_cyclical_feature_engineering.py
+++ b/examples/applications/plot_cyclical_feature_engineering.py
@@ -489,7 +489,7 @@ def periodic_spline_transformer(period, n_splines=None, degree=3):
 # one-hot encoded features.
 #
 # Qualitative analysis of the impact of features on linear model predictions
-# ---------------------------------------------------------------------------
+# --------------------------------------------------------------------------
 #
 # Here, we want to visualize the impact of the feature engineering choices on
 # the time related shape of the predictions.
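
# %%
# For completeness, a hedged sketch of the `MLPRegressor` alternative that the
# first patch mentions as a drop-in replacement for `Nystroem` + `RidgeCV`.
# The hidden layer sizes, toy data and iteration budget are illustrative
# assumptions, not tuned values from the example.
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import SplineTransformer

rng = np.random.RandomState(0)
X_toy = rng.uniform(0, 24, size=(500, 1))
y_toy = np.sin(X_toy[:, 0] / 24 * 2 * np.pi) + rng.normal(scale=0.1, size=500)

mlp_pipeline = make_pipeline(
    SplineTransformer(n_knots=13, extrapolation="periodic"),
    MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=0),
).fit(X_toy, y_toy)
print(mlp_pipeline.score(X_toy, y_toy))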