diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py
index d2a68d351ce8a..854d443b229d0 100644
--- a/examples/impute/plot_iterative_imputer_variants_comparison.py
+++ b/examples/impute/plot_iterative_imputer_variants_comparison.py
@@ -13,7 +13,7 @@
 imputation with :class:`~impute.IterativeImputer`:
 
 * :class:`~linear_model.BayesianRidge`: regularized linear regression
-* :class:`~ensemble.RandomForestRegressor`: Forests of randomized trees regression
+* :class:`~ensemble.RandomForestRegressor`: forests of randomized trees regression
 * :func:`~pipeline.make_pipeline` (:class:`~kernel_approximation.Nystroem`,
   :class:`~linear_model.Ridge`): a pipeline with the expansion of a degree 2
   polynomial kernel and regularized linear regression
@@ -62,11 +62,10 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.neighbors import KNeighborsRegressor
 from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import RobustScaler
 
 N_SPLITS = 5
 
-rng = np.random.RandomState(0)
-
 X_full, y_full = fetch_california_housing(return_X_y=True)
 # ~2k samples is enough for the purpose of the example.
 # Remove the following two lines for a slower run with different error bars.
@@ -74,16 +73,28 @@
 y_full = y_full[::10]
 n_samples, n_features = X_full.shape
 
+
+def compute_score_for(X, y, imputer=None):
+    # We scale data before imputation and training a target estimator,
+    # because our target estimator and some of the imputers assume
+    # that the features have similar scales.
+    if imputer is None:
+        estimator = make_pipeline(RobustScaler(), BayesianRidge())
+    else:
+        estimator = make_pipeline(RobustScaler(), imputer, BayesianRidge())
+    return cross_val_score(
+        estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS
+    )
+
+
 # Estimate the score on the entire dataset, with no missing values
-br_estimator = BayesianRidge()
 score_full_data = pd.DataFrame(
-    cross_val_score(
-        br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS
-    ),
+    compute_score_for(X_full, y_full),
     columns=["Full Data"],
 )
 
 # Add a single missing value to each row
+rng = np.random.RandomState(0)
 X_missing = X_full.copy()
 y_missing = y_full
 missing_samples = np.arange(n_samples)
@@ -93,48 +104,52 @@
 # Estimate the score after imputation (mean and median strategies)
 score_simple_imputer = pd.DataFrame()
 for strategy in ("mean", "median"):
-    estimator = make_pipeline(
-        SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator
-    )
-    score_simple_imputer[strategy] = cross_val_score(
-        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
+    score_simple_imputer[strategy] = compute_score_for(
+        X_missing, y_missing, SimpleImputer(strategy=strategy)
     )
 
 # Estimate the score after iterative imputation of the missing values
 # with different estimators
-estimators = [
-    BayesianRidge(),
-    RandomForestRegressor(
-        # We tuned the hyperparameters of the RandomForestRegressor to get a good
-        # enough predictive performance for a restricted execution time.
-        n_estimators=4,
-        max_depth=10,
-        bootstrap=True,
-        max_samples=0.5,
-        n_jobs=2,
-        random_state=0,
+named_estimators = [
+    ("Bayesian Ridge", BayesianRidge()),
+    (
+        "Random Forest",
+        RandomForestRegressor(
+            # We tuned the hyperparameters of the RandomForestRegressor to get a good
+            # enough predictive performance for a restricted execution time.
+            n_estimators=5,
+            max_depth=10,
+            bootstrap=True,
+            max_samples=0.5,
+            n_jobs=2,
+            random_state=0,
+        ),
     ),
-    make_pipeline(
-        Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3)
+    (
+        "Nystroem + Ridge",
+        make_pipeline(
+            Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e4)
+        ),
+    ),
+    (
+        "k-NN",
+        KNeighborsRegressor(n_neighbors=10),
     ),
-    KNeighborsRegressor(n_neighbors=15),
 ]
 score_iterative_imputer = pd.DataFrame()
-# iterative imputer is sensible to the tolerance and
+# Iterative imputer is sensitive to the tolerance and
 # dependent on the estimator used internally.
-# we tuned the tolerance to keep this example run with limited computational
+# We tuned the tolerance to keep this example running with limited computational
 # resources while not changing the results too much compared to keeping the
 # stricter default value for the tolerance parameter.
 tolerances = (1e-3, 1e-1, 1e-1, 1e-2)
-for impute_estimator, tol in zip(estimators, tolerances):
-    estimator = make_pipeline(
+for (name, impute_estimator), tol in zip(named_estimators, tolerances):
+    score_iterative_imputer[name] = compute_score_for(
+        X_missing,
+        y_missing,
         IterativeImputer(
-            random_state=0, estimator=impute_estimator, max_iter=25, tol=tol
+            random_state=0, estimator=impute_estimator, max_iter=40, tol=tol
         ),
-        br_estimator,
-    )
-    score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score(
-        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
     )
 
 scores = pd.concat(
diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py
index 851bfd419453b..c7474eb338357 100644
--- a/examples/impute/plot_missing_values.py
+++ b/examples/impute/plot_missing_values.py
@@ -9,14 +9,15 @@
 In this example we will investigate different imputation techniques:
 
 - imputation by the constant value 0
-- imputation by the mean value of each feature combined with a missing-ness
-  indicator auxiliary variable
+- imputation by the mean value of each feature
 - k nearest neighbor imputation
 - iterative imputation
 
+In all the cases, for each feature, we add a new feature indicating the missingness.
+
 We will use two datasets: Diabetes dataset which consists of 10 feature
 variables collected from diabetes patients with an aim to predict disease
-progression and California Housing dataset for which the target is the median
+progression and California housing dataset for which the target is the median
 house value for California districts.
 
 As neither of these datasets have missing values, we will remove some
@@ -36,9 +37,9 @@
 # ##############################################
 #
 # First we download the two datasets. Diabetes dataset is shipped with
-# scikit-learn. It has 442 entries, each with 10 features. California Housing
+# scikit-learn. It has 442 entries, each with 10 features. California housing
 # dataset is much larger with 20640 entries and 8 features. It needs to be
-# downloaded. We will only use the first 400 entries for the sake of speeding
+# downloaded. We will only use the first 300 entries for the sake of speeding
 # up the calculations but feel free to use the whole dataset.
 #
 
@@ -46,17 +47,16 @@
 from sklearn.datasets import fetch_california_housing, load_diabetes
 
-rng = np.random.RandomState(42)
-
 X_diabetes, y_diabetes = load_diabetes(return_X_y=True)
 X_california, y_california = fetch_california_housing(return_X_y=True)
-X_california = X_california[:300]
-y_california = y_california[:300]
+
 X_diabetes = X_diabetes[:300]
 y_diabetes = y_diabetes[:300]
+X_california = X_california[:300]
+y_california = y_california[:300]
 
 
-def add_missing_values(X_full, y_full):
+def add_missing_values(X_full, y_full, rng):
     n_samples, n_features = X_full.shape
 
     # Add missing values in 75% of the lines
@@ -75,20 +75,22 @@ def add_missing_values(X_full, y_full):
     return X_missing, y_missing
 
 
-X_miss_california, y_miss_california = add_missing_values(X_california, y_california)
-
-X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes)
+rng = np.random.RandomState(42)
+X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes, rng)
+X_miss_california, y_miss_california = add_missing_values(
+    X_california, y_california, rng
+)
 
 # %%
 # Impute the missing data and score
 # #################################
 # Now we will write a function which will score the results on the differently
-# imputed data. Let's look at each imputer separately:
+# imputed data, including the case of no imputation for full data.
+# We will use :class:`~sklearn.ensemble.RandomForestRegressor` for the target
+# regression.
 #
 
-rng = np.random.RandomState(0)
-
 from sklearn.ensemble import RandomForestRegressor
 
 # To use the experimental IterativeImputer, we need to explicitly ask for it:
@@ -96,33 +98,29 @@ def add_missing_values(X_full, y_full):
 from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
 from sklearn.model_selection import cross_val_score
 from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import RobustScaler
 
 N_SPLITS = 4
 
-regressor = RandomForestRegressor(random_state=0)
-
-# %%
-# Missing information
-# -------------------
-# In addition to imputing the missing values, the imputers have an
-# `add_indicator` parameter that marks the values that were missing, which
-# might carry some information.
-#
-
 
-def get_scores_for_imputer(imputer, X_missing, y_missing):
-    estimator = make_pipeline(imputer, regressor)
-    impute_scores = cross_val_score(
-        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
+def get_score(X, y, imputer=None):
+    regressor = RandomForestRegressor(random_state=0)
+    if imputer is not None:
+        estimator = make_pipeline(imputer, regressor)
+    else:
+        estimator = regressor
+    scores = cross_val_score(
+        estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS
     )
-    return impute_scores
+    return scores.mean(), scores.std()
 
 
 x_labels = []
-mses_california = np.zeros(5)
-stds_california = np.zeros(5)
 mses_diabetes = np.zeros(5)
 stds_diabetes = np.zeros(5)
+mses_california = np.zeros(5)
+stds_california = np.zeros(5)
 
 # %%
 # Estimate the score
@@ -131,16 +129,9 @@ def get_scores_for_imputer(imputer, X_missing, y_missing):
 #
 
 
-def get_full_score(X_full, y_full):
-    full_scores = cross_val_score(
-        regressor, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS
-    )
-    return full_scores.mean(), full_scores.std()
-
-
-mses_california[0], stds_california[0] = get_full_score(X_california, y_california)
-mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes)
-x_labels.append("Full data")
+mses_diabetes[0], stds_diabetes[0] = get_score(X_diabetes, y_diabetes)
+mses_california[0], stds_california[0] = get_score(X_california, y_california)
+x_labels.append("Full Data")
 
 
 # %%
@@ -151,22 +142,28 @@ def get_full_score(X_full, y_full):
 # replaced by 0:
 #
 
+imputer = SimpleImputer(strategy="constant", fill_value=0, add_indicator=True)
+mses_diabetes[1], stds_diabetes[1] = get_score(
+    X_miss_diabetes, y_miss_diabetes, imputer
+)
+mses_california[1], stds_california[1] = get_score(
+    X_miss_california, y_miss_california, imputer
+)
+x_labels.append("Zero Imputation")
 
-def get_impute_zero_score(X_missing, y_missing):
-    imputer = SimpleImputer(
-        missing_values=np.nan, add_indicator=True, strategy="constant", fill_value=0
-    )
-    zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
-    return zero_impute_scores.mean(), zero_impute_scores.std()
-
+# %%
+# Impute missing values with mean
+# -------------------------------
+#
 
-mses_california[1], stds_california[1] = get_impute_zero_score(
-    X_miss_california, y_miss_california
+imputer = SimpleImputer(strategy="mean", add_indicator=True)
+mses_diabetes[2], stds_diabetes[2] = get_score(
+    X_miss_diabetes, y_miss_diabetes, imputer
 )
-mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score(
-    X_miss_diabetes, y_miss_diabetes
+mses_california[2], stds_california[2] = get_score(
+    X_miss_california, y_miss_california, imputer
 )
-x_labels.append("Zero imputation")
+x_labels.append("Mean Imputation")
 
 
 # %%
@@ -174,74 +171,41 @@ def get_impute_zero_score(X_missing, y_missing):
 # ------------------------------------
 #
 # :class:`~sklearn.impute.KNNImputer` imputes missing values using the weighted
-# or unweighted mean of the desired number of nearest neighbors.
-
-
-def get_impute_knn_score(X_missing, y_missing):
-    imputer = KNNImputer(missing_values=np.nan, add_indicator=True)
-    knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
-    return knn_impute_scores.mean(), knn_impute_scores.std()
-
+# or unweighted mean of the desired number of nearest neighbors. If your features
+# have vastly different scales (as in the California housing dataset),
+# consider re-scaling them to potentially improve performance.
+#
 
-mses_california[2], stds_california[2] = get_impute_knn_score(
-    X_miss_california, y_miss_california
+imputer = KNNImputer(add_indicator=True)
+mses_diabetes[3], stds_diabetes[3] = get_score(
+    X_miss_diabetes, y_miss_diabetes, imputer
 )
-mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score(
-    X_miss_diabetes, y_miss_diabetes
+mses_california[3], stds_california[3] = get_score(
+    X_miss_california, y_miss_california, make_pipeline(RobustScaler(), imputer)
 )
 x_labels.append("KNN Imputation")
 
 
-# %%
-# Impute missing values with mean
-# -------------------------------
-#
-
-
-def get_impute_mean(X_missing, y_missing):
-    imputer = SimpleImputer(missing_values=np.nan, strategy="mean", add_indicator=True)
-    mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
-    return mean_impute_scores.mean(), mean_impute_scores.std()
-
-
-mses_california[3], stds_california[3] = get_impute_mean(
-    X_miss_california, y_miss_california
-)
-mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, y_miss_diabetes)
-x_labels.append("Mean Imputation")
-
-
 # %%
 # Iterative imputation of the missing values
 # ------------------------------------------
 #
 # Another option is the :class:`~sklearn.impute.IterativeImputer`. This uses
-# round-robin linear regression, modeling each feature with missing values as a
-# function of other features, in turn.
-# The version implemented assumes Gaussian (output) variables. If your features
-# are obviously non-normal, consider transforming them to look more normal
-# to potentially improve performance.
+# round-robin regression, modeling each feature with missing values as a
+# function of other features, in turn. We use the class's default choice
+# of the regressor model (:class:`~sklearn.linear_model.BayesianRidge`)
+# to predict missing feature values. The performance of the predictor
+# may be negatively affected by vastly different scales of the features,
+# so we re-scale the features in the California housing dataset.
 #
 
+imputer = IterativeImputer(add_indicator=True)
 
-def get_impute_iterative(X_missing, y_missing):
-    imputer = IterativeImputer(
-        missing_values=np.nan,
-        add_indicator=True,
-        random_state=0,
-        n_nearest_features=3,
-        max_iter=1,
-        sample_posterior=True,
-    )
-    iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing)
-    return iterative_impute_scores.mean(), iterative_impute_scores.std()
-
-
-mses_california[4], stds_california[4] = get_impute_iterative(
-    X_miss_california, y_miss_california
+mses_diabetes[4], stds_diabetes[4] = get_score(
+    X_miss_diabetes, y_miss_diabetes, imputer
)
-mses_diabetes[4], stds_diabetes[4] = get_impute_iterative(
-    X_miss_diabetes, y_miss_diabetes
+mses_california[4], stds_california[4] = get_score(
+    X_miss_california, y_miss_california, make_pipeline(RobustScaler(), imputer)
 )
 x_labels.append("Iterative Imputation")
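Not part of the patch: a minimal, standalone sketch of the evaluation pattern that both refactored examples converge on, namely scaling with RobustScaler, imputing with a missingness indicator, and fitting the downstream regressor inside a single pipeline scored by cross_val_score. The dataset, imputer settings and CV choices below are illustrative assumptions, not values taken from the patch.

import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

rng = np.random.RandomState(0)
X, y = load_diabetes(return_X_y=True)

# Knock out one randomly chosen feature value per row, in the same spirit as the
# first example's missing-value corruption.
X_missing = X.copy()
X_missing[np.arange(X.shape[0]), rng.randint(0, X.shape[1], X.shape[0])] = np.nan

# Scale, impute (with a missingness indicator), then fit the target regressor,
# all inside one pipeline so that preprocessing is refit on each training fold.
estimator = make_pipeline(
    RobustScaler(),
    IterativeImputer(add_indicator=True, random_state=0),
    BayesianRidge(),
)
scores = cross_val_score(
    estimator, X_missing, y, scoring="neg_mean_squared_error", cv=5
)
print(f"MSE: {-scores.mean():.4f} +/- {scores.std():.4f}")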