From 9cea45ab4bc0d9fbd9ac741d6cbba5146a2e54eb Mon Sep 17 00:00:00 2001 From: ayrat Date: Mon, 14 Apr 2025 14:29:52 +0200 Subject: [PATCH 1/7] scaling data before using k-neighbours regression (fixes: https://github.com/scikit-learn/scikit-learn/issues/31200) --- ...t_iterative_imputer_variants_comparison.py | 27 ++++++++++--------- examples/impute/plot_missing_values.py | 5 ++-- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index f06875a5f7fcd..7fa7d2abf62c5 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -61,6 +61,7 @@ from sklearn.linear_model import BayesianRidge, Ridge from sklearn.model_selection import cross_val_score from sklearn.neighbors import KNeighborsRegressor +from sklearn.preprocessing import StandardScaler from sklearn.pipeline import make_pipeline N_SPLITS = 5 @@ -75,10 +76,10 @@ n_samples, n_features = X_full.shape # Estimate the score on the entire dataset, with no missing values -br_estimator = BayesianRidge() +main_estimator = BayesianRidge() score_full_data = pd.DataFrame( cross_val_score( - br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS + main_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS ), columns=["Full Data"], ) @@ -94,7 +95,7 @@ score_simple_imputer = pd.DataFrame() for strategy in ("mean", "median"): estimator = make_pipeline( - SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator + SimpleImputer(missing_values=np.nan, strategy=strategy), main_estimator ) score_simple_imputer[strategy] = cross_val_score( estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS @@ -102,9 +103,9 @@ # Estimate the score after iterative imputation of the missing values # with different estimators -estimators = [ - BayesianRidge(), - RandomForestRegressor( +named_estimators = [ + ("BayesianRidge", BayesianRidge()), + ("RandomForestRegressor", RandomForestRegressor( # We tuned the hyperparameters of the RandomForestRegressor to get a good # enough predictive performance for a restricted execution time. n_estimators=4, @@ -113,11 +114,11 @@ max_samples=0.5, n_jobs=2, random_state=0, - ), - make_pipeline( + )), + ("Nystroem&Ridge", make_pipeline( Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) - ), - KNeighborsRegressor(n_neighbors=15), + )), + ("Scaler&KNeighborsRegressor", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))) ] score_iterative_imputer = pd.DataFrame() # iterative imputer is sensible to the tolerance and @@ -126,14 +127,14 @@ # resources while not changing the results too much compared to keeping the # stricter default value for the tolerance parameter. 
tolerances = (1e-3, 1e-1, 1e-1, 1e-2) -for impute_estimator, tol in zip(estimators, tolerances): +for (name, impute_estimator), tol in zip(named_estimators, tolerances): estimator = make_pipeline( IterativeImputer( random_state=0, estimator=impute_estimator, max_iter=25, tol=tol ), - br_estimator, + main_estimator, ) - score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score( + score_iterative_imputer[name] = cross_val_score( estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS ) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 9d61ffc4964ee..8d718bda99eab 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -96,6 +96,7 @@ def add_missing_values(X_full, y_full): from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer from sklearn.model_selection import cross_val_score from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler N_SPLITS = 4 regressor = RandomForestRegressor(random_state=0) @@ -178,7 +179,7 @@ def get_impute_zero_score(X_missing, y_missing): def get_impute_knn_score(X_missing, y_missing): - imputer = KNNImputer(missing_values=np.nan, add_indicator=True) + imputer = make_pipeline(StandardScaler(), KNNImputer(missing_values=np.nan, add_indicator=True)) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return knn_impute_scores.mean(), knn_impute_scores.std() @@ -189,7 +190,7 @@ def get_impute_knn_score(X_missing, y_missing): mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score( X_miss_diabetes, y_miss_diabetes ) -x_labels.append("KNN Imputation") +x_labels.append("KNN Imputation (scaled)") # %% From ee096270b8f943133fa9bdbd47fbe04b8a54495a Mon Sep 17 00:00:00 2001 From: Ayrat Date: Thu, 24 Apr 2025 17:04:00 +0200 Subject: [PATCH 2/7] Apply suggestions from code review (next: resolving lint issues) Co-authored-by: Olivier Grisel --- .../impute/plot_iterative_imputer_variants_comparison.py | 8 ++++---- examples/impute/plot_missing_values.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 7fa7d2abf62c5..0e00fe0b73b15 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -104,8 +104,8 @@ # Estimate the score after iterative imputation of the missing values # with different estimators named_estimators = [ - ("BayesianRidge", BayesianRidge()), - ("RandomForestRegressor", RandomForestRegressor( + ("Bayesian Ridge", BayesianRidge()), + ("Random Forest", RandomForestRegressor( # We tuned the hyperparameters of the RandomForestRegressor to get a good # enough predictive performance for a restricted execution time. 
n_estimators=4, @@ -115,10 +115,10 @@ n_jobs=2, random_state=0, )), - ("Nystroem&Ridge", make_pipeline( + ("Nystroem + Ridge", make_pipeline( Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) )), - ("Scaler&KNeighborsRegressor", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))) + ("Scaler + k-NN ", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))) ] score_iterative_imputer = pd.DataFrame() # iterative imputer is sensible to the tolerance and diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 8d718bda99eab..bc0eeaa1b2c72 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -190,7 +190,7 @@ def get_impute_knn_score(X_missing, y_missing): mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score( X_miss_diabetes, y_miss_diabetes ) -x_labels.append("KNN Imputation (scaled)") +x_labels.append("KNN Imputation (scaled features)") # %% From e4fe1d5a4c7890b632fb14a9707c1b273a8debc9 Mon Sep 17 00:00:00 2001 From: ayrat Date: Mon, 28 Apr 2025 16:08:12 +0200 Subject: [PATCH 3/7] fixed lint issues; now ready for merge --- ...t_iterative_imputer_variants_comparison.py | 39 ++++++++++++------- examples/impute/plot_missing_values.py | 4 +- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 0e00fe0b73b15..4a3f9b175e2df 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -61,8 +61,8 @@ from sklearn.linear_model import BayesianRidge, Ridge from sklearn.model_selection import cross_val_score from sklearn.neighbors import KNeighborsRegressor -from sklearn.preprocessing import StandardScaler from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler N_SPLITS = 5 @@ -105,20 +105,29 @@ # with different estimators named_estimators = [ ("Bayesian Ridge", BayesianRidge()), - ("Random Forest", RandomForestRegressor( - # We tuned the hyperparameters of the RandomForestRegressor to get a good - # enough predictive performance for a restricted execution time. - n_estimators=4, - max_depth=10, - bootstrap=True, - max_samples=0.5, - n_jobs=2, - random_state=0, - )), - ("Nystroem + Ridge", make_pipeline( - Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) - )), - ("Scaler + k-NN ", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))) + ( + "Random Forest", + RandomForestRegressor( + # We tuned the hyperparameters of the RandomForestRegressor to get a good + # enough predictive performance for a restricted execution time. 
+ n_estimators=4, + max_depth=10, + bootstrap=True, + max_samples=0.5, + n_jobs=2, + random_state=0, + ), + ), + ( + "Nystroem + Ridge", + make_pipeline( + Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) + ), + ), + ( + "Scaler + k-NN ", + make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15)), + ), ] score_iterative_imputer = pd.DataFrame() # iterative imputer is sensible to the tolerance and diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index bc0eeaa1b2c72..327eb148c84c8 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -179,7 +179,9 @@ def get_impute_zero_score(X_missing, y_missing): def get_impute_knn_score(X_missing, y_missing): - imputer = make_pipeline(StandardScaler(), KNNImputer(missing_values=np.nan, add_indicator=True)) + imputer = make_pipeline( + StandardScaler(), KNNImputer(missing_values=np.nan, add_indicator=True) + ) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return knn_impute_scores.mean(), knn_impute_scores.std() From d780e9e7d3cfb44b4f1856dd688c4e69635368f2 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 20 May 2025 10:14:56 +0200 Subject: [PATCH 4/7] Apply suggestions Co-authored-by: Virgil Chan --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index b073788452fd8..cbe0ddf1e2562 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -192,7 +192,7 @@ def get_impute_knn_score(X_missing, y_missing): mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score( X_miss_diabetes, y_miss_diabetes ) -x_labels.append("KNN Imputation (scaled features)") +x_labels.append("KNN Imputation\n(scaled features)") # %% From 933a1f2d001ecf7baf3aabbe8e5ba4e66d117309 Mon Sep 17 00:00:00 2001 From: ayrat Date: Mon, 26 May 2025 12:29:06 +0200 Subject: [PATCH 5/7] using recommended practices for scaling data; other minor modifications --- ...t_iterative_imputer_variants_comparison.py | 57 +++--- examples/impute/plot_missing_values.py | 170 +++++++----------- 2 files changed, 98 insertions(+), 129 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 2b72bfea758f4..69eb233520b68 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -13,7 +13,7 @@ imputation with :class:`~impute.IterativeImputer`: * :class:`~linear_model.BayesianRidge`: regularized linear regression -* :class:`~ensemble.RandomForestRegressor`: Forests of randomized trees regression +* :class:`~ensemble.RandomForestRegressor`: forests of randomized trees regression * :func:`~pipeline.make_pipeline` (:class:`~kernel_approximation.Nystroem`, :class:`~linear_model.Ridge`): a pipeline with the expansion of a degree 2 polynomial kernel and regularized linear regression @@ -62,7 +62,7 @@ from sklearn.model_selection import cross_val_score from sklearn.neighbors import KNeighborsRegressor from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import RobustScaler N_SPLITS = 5 @@ -75,12 +75,25 @@ y_full = y_full[::10] n_samples, n_features = X_full.shape +target_estimator = 
BayesianRidge() + + +def compute_score_for(X, y, imputer): + # We scale data before imputation and training a target estimator, + # because our target estimator and some of the imputers assume + # that the features have similar scales. + if imputer is None: + estimator = make_pipeline(RobustScaler(), target_estimator) + else: + estimator = make_pipeline(RobustScaler(), imputer, target_estimator) + return cross_val_score( + estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS + ) + + # Estimate the score on the entire dataset, with no missing values -main_estimator = BayesianRidge() score_full_data = pd.DataFrame( - cross_val_score( - main_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS - ), + compute_score_for(X_full, y_full, None), columns=["Full Data"], ) @@ -93,13 +106,9 @@ # Estimate the score after imputation (mean and median strategies) score_simple_imputer = pd.DataFrame() -for strategy in ("mean", "median"): - estimator = make_pipeline( - SimpleImputer(missing_values=np.nan, strategy=strategy), main_estimator - ) - score_simple_imputer[strategy] = cross_val_score( - estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS - ) +score_simple_imputer["mean"] = compute_score_for( + X_missing, y_missing, SimpleImputer(strategy="mean") +) # Estimate the score after iterative imputation of the missing values # with different estimators @@ -110,7 +119,7 @@ RandomForestRegressor( # We tuned the hyperparameters of the RandomForestRegressor to get a good # enough predictive performance for a restricted execution time. - n_estimators=4, + n_estimators=5, max_depth=10, bootstrap=True, max_samples=0.5, @@ -121,30 +130,28 @@ ( "Nystroem + Ridge", make_pipeline( - Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) + Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e4) ), ), ( - "Scaler + k-NN ", - make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15)), + "k-NN", + KNeighborsRegressor(n_neighbors=10), ), ] score_iterative_imputer = pd.DataFrame() -# iterative imputer is sensible to the tolerance and +# Iterative imputer is sensible to the tolerance and # dependent on the estimator used internally. -# we tuned the tolerance to keep this example run with limited computational +# We tuned the tolerance to keep this example run with limited computational # resources while not changing the results too much compared to keeping the # stricter default value for the tolerance parameter. 
tolerances = (1e-3, 1e-1, 1e-1, 1e-2) for (name, impute_estimator), tol in zip(named_estimators, tolerances): - estimator = make_pipeline( + score_iterative_imputer[name] = compute_score_for( + X_missing, + y_missing, IterativeImputer( - random_state=0, estimator=impute_estimator, max_iter=25, tol=tol + random_state=0, estimator=impute_estimator, max_iter=40, tol=tol ), - main_estimator, - ) - score_iterative_imputer[name] = cross_val_score( - estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS ) scores = pd.concat( diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index cbe0ddf1e2562..e046280b70d82 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -9,14 +9,15 @@ In this example we will investigate different imputation techniques: - imputation by the constant value 0 -- imputation by the mean value of each feature combined with a missing-ness - indicator auxiliary variable +- imputation by the mean value of each feature - k nearest neighbor imputation - iterative imputation +In all the cases, for each feature, we add a new feature indicating the missingness. + We will use two datasets: Diabetes dataset which consists of 10 feature variables collected from diabetes patients with an aim to predict disease -progression and California Housing dataset for which the target is the median +progression and California housing dataset for which the target is the median house value for California districts. As neither of these datasets have missing values, we will remove some @@ -36,9 +37,9 @@ # ############################################## # # First we download the two datasets. Diabetes dataset is shipped with -# scikit-learn. It has 442 entries, each with 10 features. California Housing +# scikit-learn. It has 442 entries, each with 10 features. California housing # dataset is much larger with 20640 entries and 8 features. It needs to be -# downloaded. We will only use the first 400 entries for the sake of speeding +# downloaded. We will only use the first 300 entries for the sake of speeding # up the calculations but feel free to use the whole dataset. # @@ -50,10 +51,11 @@ X_diabetes, y_diabetes = load_diabetes(return_X_y=True) X_california, y_california = fetch_california_housing(return_X_y=True) -X_california = X_california[:300] -y_california = y_california[:300] + X_diabetes = X_diabetes[:300] y_diabetes = y_diabetes[:300] +X_california = X_california[:300] +y_california = y_california[:300] def add_missing_values(X_full, y_full): @@ -75,16 +77,17 @@ def add_missing_values(X_full, y_full): return X_missing, y_missing -X_miss_california, y_miss_california = add_missing_values(X_california, y_california) - X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes) +X_miss_california, y_miss_california = add_missing_values(X_california, y_california) # %% # Impute the missing data and score # ################################# # Now we will write a function which will score the results on the differently -# imputed data. Let's look at each imputer separately: +# imputed data, including the case of no imputation for full data. +# We will use :class:`~sklearn.ensemble.RandomForestRegressor` for the target +# regression. 
# rng = np.random.RandomState(0) @@ -96,34 +99,29 @@ def add_missing_values(X_full, y_full): from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer from sklearn.model_selection import cross_val_score from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import RobustScaler N_SPLITS = 4 regressor = RandomForestRegressor(random_state=0) -# %% -# Missing information -# ------------------- -# In addition to imputing the missing values, the imputers have an -# `add_indicator` parameter that marks the values that were missing, which -# might carry some information. -# - -def get_scores_for_imputer(imputer, X_missing, y_missing): - estimator = make_pipeline(imputer, regressor) - impute_scores = cross_val_score( - estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS +def get_score(X, y, imputer): + if imputer is not None: + estimator = make_pipeline(imputer, regressor) + else: + estimator = regressor + scores = cross_val_score( + estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS ) - return impute_scores + return scores.mean(), scores.std() x_labels = [] -mses_california = np.zeros(5) -stds_california = np.zeros(5) mses_diabetes = np.zeros(5) stds_diabetes = np.zeros(5) +mses_california = np.zeros(5) +stds_california = np.zeros(5) # %% # Estimate the score @@ -132,16 +130,9 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): # -def get_full_score(X_full, y_full): - full_scores = cross_val_score( - regressor, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS - ) - return full_scores.mean(), full_scores.std() - - -mses_california[0], stds_california[0] = get_full_score(X_california, y_california) -mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) -x_labels.append("Full data") +mses_diabetes[0], stds_diabetes[0] = get_score(X_diabetes, y_diabetes, None) +mses_california[0], stds_california[0] = get_score(X_california, y_california, None) +x_labels.append("Full Data") # %% @@ -152,66 +143,48 @@ def get_full_score(X_full, y_full): # replaced by 0: # - -def get_impute_zero_score(X_missing, y_missing): - imputer = SimpleImputer( - missing_values=np.nan, add_indicator=True, strategy="constant", fill_value=0 - ) - zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - return zero_impute_scores.mean(), zero_impute_scores.std() - - -mses_california[1], stds_california[1] = get_impute_zero_score( - X_miss_california, y_miss_california +imputer = SimpleImputer(strategy="constant", fill_value=0, add_indicator=True) +mses_diabetes[1], stds_diabetes[1] = get_score( + X_miss_diabetes, y_miss_diabetes, imputer ) -mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score( - X_miss_diabetes, y_miss_diabetes +mses_california[1], stds_california[1] = get_score( + X_miss_california, y_miss_california, imputer ) -x_labels.append("Zero imputation") - +x_labels.append("Zero Imputation") # %% -# kNN-imputation of the missing values -# ------------------------------------ +# Impute missing values with mean +# ------------------------------- # -# :class:`~sklearn.impute.KNNImputer` imputes missing values using the weighted -# or unweighted mean of the desired number of nearest neighbors. 
- -def get_impute_knn_score(X_missing, y_missing): - imputer = make_pipeline( - StandardScaler(), KNNImputer(missing_values=np.nan, add_indicator=True) - ) - knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - return knn_impute_scores.mean(), knn_impute_scores.std() - - -mses_california[2], stds_california[2] = get_impute_knn_score( - X_miss_california, y_miss_california +imputer = SimpleImputer(strategy="mean", add_indicator=True) +mses_diabetes[2], stds_diabetes[2] = get_score( + X_miss_diabetes, y_miss_diabetes, imputer ) -mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score( - X_miss_diabetes, y_miss_diabetes +mses_california[2], stds_california[2] = get_score( + X_miss_california, y_miss_california, imputer ) -x_labels.append("KNN Imputation\n(scaled features)") +x_labels.append("Mean Imputation") # %% -# Impute missing values with mean -# ------------------------------- +# kNN-imputation of the missing values +# ------------------------------------ +# +# :class:`~sklearn.impute.KNNImputer` imputes missing values using the weighted +# or unweighted mean of the desired number of nearest neighbors. If your features +# have vastly different scales (as in the California housing dataset), +# consider re-scaling them to potentially improve performance. # - -def get_impute_mean(X_missing, y_missing): - imputer = SimpleImputer(missing_values=np.nan, strategy="mean", add_indicator=True) - mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - return mean_impute_scores.mean(), mean_impute_scores.std() - - -mses_california[3], stds_california[3] = get_impute_mean( - X_miss_california, y_miss_california +imputer = KNNImputer(add_indicator=True) +mses_diabetes[3], stds_diabetes[3] = get_score( + X_miss_diabetes, y_miss_diabetes, imputer ) -mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, y_miss_diabetes) -x_labels.append("Mean Imputation") +mses_california[3], stds_california[3] = get_score( + X_miss_california, y_miss_california, make_pipeline(RobustScaler(), imputer) +) +x_labels.append("KNN Imputation") # %% @@ -219,32 +192,21 @@ def get_impute_mean(X_missing, y_missing): # ------------------------------------------ # # Another option is the :class:`~sklearn.impute.IterativeImputer`. This uses -# round-robin linear regression, modeling each feature with missing values as a -# function of other features, in turn. -# The version implemented assumes Gaussian (output) variables. If your features -# are obviously non-normal, consider transforming them to look more normal -# to potentially improve performance. +# round-robin regression, modeling each feature with missing values as a +# function of other features, in turn. We use the class's default choice +# of the regressor model (:class:`~sklearn.linear_model.BayesianRidge`) +# to predict missing feature values. The performance of the predictor +# may be negatively affected by vastly different scales of the features, +# so we re-scale the features in the California housing dataset. 
# +imputer = IterativeImputer(add_indicator=True) -def get_impute_iterative(X_missing, y_missing): - imputer = IterativeImputer( - missing_values=np.nan, - add_indicator=True, - random_state=0, - n_nearest_features=3, - max_iter=1, - sample_posterior=True, - ) - iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - return iterative_impute_scores.mean(), iterative_impute_scores.std() - - -mses_california[4], stds_california[4] = get_impute_iterative( - X_miss_california, y_miss_california +mses_diabetes[4], stds_diabetes[4] = get_score( + X_miss_diabetes, y_miss_diabetes, imputer ) -mses_diabetes[4], stds_diabetes[4] = get_impute_iterative( - X_miss_diabetes, y_miss_diabetes +mses_california[4], stds_california[4] = get_score( + X_miss_california, y_miss_california, make_pipeline(RobustScaler(), imputer) ) x_labels.append("Iterative Imputation") From be1e95c23642e0ebe82124cd25b6a69918c82da4 Mon Sep 17 00:00:00 2001 From: ayrat Date: Thu, 5 Jun 2025 11:49:48 +0200 Subject: [PATCH 6/7] changes suggested by @betatim; removed unnecessary global variables (target_estimator) and re-initialization of rng --- ...t_iterative_imputer_variants_comparison.py | 20 ++++++++---------- examples/impute/plot_missing_values.py | 21 +++++++++---------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 69eb233520b68..9fafd3d120a6c 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -66,8 +66,6 @@ N_SPLITS = 5 -rng = np.random.RandomState(0) - X_full, y_full = fetch_california_housing(return_X_y=True) # ~2k samples is enough for the purpose of the example. # Remove the following two lines for a slower run with different error bars. @@ -75,17 +73,15 @@ y_full = y_full[::10] n_samples, n_features = X_full.shape -target_estimator = BayesianRidge() - -def compute_score_for(X, y, imputer): +def compute_score_for(X, y, imputer=None): # We scale data before imputation and training a target estimator, # because our target estimator and some of the imputers assume # that the features have similar scales. 
if imputer is None: - estimator = make_pipeline(RobustScaler(), target_estimator) + estimator = make_pipeline(RobustScaler(), BayesianRidge()) else: - estimator = make_pipeline(RobustScaler(), imputer, target_estimator) + estimator = make_pipeline(RobustScaler(), imputer, BayesianRidge()) return cross_val_score( estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS ) @@ -93,11 +89,12 @@ def compute_score_for(X, y, imputer): # Estimate the score on the entire dataset, with no missing values score_full_data = pd.DataFrame( - compute_score_for(X_full, y_full, None), + compute_score_for(X_full, y_full), columns=["Full Data"], ) # Add a single missing value to each row +rng = np.random.RandomState(0) X_missing = X_full.copy() y_missing = y_full missing_samples = np.arange(n_samples) @@ -106,9 +103,10 @@ def compute_score_for(X, y, imputer): # Estimate the score after imputation (mean and median strategies) score_simple_imputer = pd.DataFrame() -score_simple_imputer["mean"] = compute_score_for( - X_missing, y_missing, SimpleImputer(strategy="mean") -) +for strategy in ("mean", "median"): + score_simple_imputer[strategy] = compute_score_for( + X_missing, y_missing, SimpleImputer(strategy=strategy) + ) # Estimate the score after iterative imputation of the missing values # with different estimators diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index e046280b70d82..c7474eb338357 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -47,8 +47,6 @@ from sklearn.datasets import fetch_california_housing, load_diabetes -rng = np.random.RandomState(42) - X_diabetes, y_diabetes = load_diabetes(return_X_y=True) X_california, y_california = fetch_california_housing(return_X_y=True) @@ -58,7 +56,7 @@ y_california = y_california[:300] -def add_missing_values(X_full, y_full): +def add_missing_values(X_full, y_full, rng): n_samples, n_features = X_full.shape # Add missing values in 75% of the lines @@ -77,8 +75,11 @@ def add_missing_values(X_full, y_full): return X_missing, y_missing -X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes) -X_miss_california, y_miss_california = add_missing_values(X_california, y_california) +rng = np.random.RandomState(42) +X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes, rng) +X_miss_california, y_miss_california = add_missing_values( + X_california, y_california, rng +) # %% @@ -90,8 +91,6 @@ def add_missing_values(X_full, y_full): # regression. 
# -rng = np.random.RandomState(0) - from sklearn.ensemble import RandomForestRegressor # To use the experimental IterativeImputer, we need to explicitly ask for it: @@ -102,10 +101,10 @@ def add_missing_values(X_full, y_full): from sklearn.preprocessing import RobustScaler N_SPLITS = 4 -regressor = RandomForestRegressor(random_state=0) -def get_score(X, y, imputer): +def get_score(X, y, imputer=None): + regressor = RandomForestRegressor(random_state=0) if imputer is not None: estimator = make_pipeline(imputer, regressor) else: @@ -130,8 +129,8 @@ def get_score(X, y, imputer): # -mses_diabetes[0], stds_diabetes[0] = get_score(X_diabetes, y_diabetes, None) -mses_california[0], stds_california[0] = get_score(X_california, y_california, None) +mses_diabetes[0], stds_diabetes[0] = get_score(X_diabetes, y_diabetes) +mses_california[0], stds_california[0] = get_score(X_california, y_california) x_labels.append("Full Data") From 15acf02aa4487fcc5f875fcc16729172c9714f17 Mon Sep 17 00:00:00 2001 From: Ayrat Date: Thu, 5 Jun 2025 15:05:08 +0200 Subject: [PATCH 7/7] Update examples/impute/plot_iterative_imputer_variants_comparison.py Co-authored-by: Tim Head --- examples/impute/plot_iterative_imputer_variants_comparison.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 9fafd3d120a6c..854d443b229d0 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -137,7 +137,7 @@ def compute_score_for(X, y, imputer=None): ), ] score_iterative_imputer = pd.DataFrame() -# Iterative imputer is sensible to the tolerance and +# Iterative imputer is sensitive to the tolerance and # dependent on the estimator used internally. # We tuned the tolerance to keep this example run with limited computational # resources while not changing the results too much compared to keeping the
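---

The sketches below accompany the series; they are illustrative only and are not part of any patch.

The central change of the series is wrapping distance-based estimation in a scaling pipeline. A minimal standalone sketch of why that matters for KNeighborsRegressor, on synthetic data with deliberately mismatched feature scales (the data here is invented for illustration):

import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
# The second feature has a range three orders of magnitude wider than the
# first, similar to the raw features of the California housing dataset.
X = rng.normal(size=(300, 2)) * np.array([1.0, 1000.0])
y = X[:, 0] + 0.1 * rng.normal(size=300)  # only the small feature is informative

for name, estimator in [
    ("bare k-NN", KNeighborsRegressor(n_neighbors=15)),
    ("scaled k-NN", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))),
]:
    scores = cross_val_score(estimator, X, y, scoring="neg_mean_squared_error", cv=5)
    print(name, scores.mean())
# Without scaling, the wide-range feature dominates the Euclidean distances,
# so the neighbor search mostly ignores the informative feature.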
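Every imputer in plot_missing_values.py is constructed with add_indicator=True. A small sketch of what that flag does, using only the public SimpleImputer API:

import numpy as np

from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan],
              [2.0, 4.0],
              [np.nan, 6.0]])
imputer = SimpleImputer(strategy="mean", add_indicator=True)
Xt = imputer.fit_transform(X)
print(Xt.shape)  # (3, 4): two imputed columns plus two binary
                 # missingness-indicator columns
print(Xt)
# [[1.   5.   0.   1. ]
#  [2.   4.   0.   0. ]
#  [1.5  6.   1.   0. ]]
# The indicator columns let the downstream regressor learn from the
# missingness pattern itself, not just from the imputed values.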
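PATCH 5/7 and 6/7 converge on a single scoring helper in which RobustScaler always runs first; scikit-learn scalers pass NaN values through untouched, which is why the scaler can precede the imputer in the pipeline. A usage sketch, assuming the compute_score_for helper and the X_full/X_missing arrays from the final version of plot_iterative_imputer_variants_comparison.py are in scope:

# To use the experimental IterativeImputer, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer

# Baseline on complete data: RobustScaler -> BayesianRidge.
score_full = compute_score_for(X_full, y_full)

# Mean imputation: RobustScaler -> SimpleImputer -> BayesianRidge.
score_mean = compute_score_for(X_missing, y_missing, SimpleImputer(strategy="mean"))

# Iterative imputation with its default BayesianRidge round-robin model:
score_iterative = compute_score_for(
    X_missing, y_missing, IterativeImputer(random_state=0, max_iter=40, tol=1e-3)
)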
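PATCH 6/7 threads the RandomState through add_missing_values rather than relying on a module-level rng. The diffs only show the function's signature and its contract (exactly one missing feature in 75% of the rows), so the body below is a hypothetical reconstruction of that behavior:

import numpy as np

def add_missing_values(X_full, y_full, rng):
    # Hypothetical body; the diffs show only the signature, the
    # "75% of the lines" comment, and the return statement.
    n_samples, n_features = X_full.shape

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(n_samples * missing_rate)

    missing_samples = np.zeros(n_samples, dtype=bool)
    missing_samples[:n_missing_samples] = True
    rng.shuffle(missing_samples)
    # One randomly chosen feature is knocked out in each affected row.
    missing_features = rng.randint(0, n_features, n_missing_samples)

    X_missing = X_full.copy()
    X_missing[missing_samples, missing_features] = np.nan
    y_missing = y_full.copy()

    return X_missing, y_missing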