From 9cea45ab4bc0d9fbd9ac741d6cbba5146a2e54eb Mon Sep 17 00:00:00 2001 From: ayrat Date: Mon, 14 Apr 2025 14:29:52 +0200 Subject: [PATCH 1/7] scaling data before using k-neighbours regression (fixes: https://github.com/scikit-learn/scikit-learn/issues/31200) --- ...t_iterative_imputer_variants_comparison.py | 27 ++++++++++--------- examples/impute/plot_missing_values.py | 5 ++-- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index f06875a5f7fcd..7fa7d2abf62c5 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -61,6 +61,7 @@ from sklearn.linear_model import BayesianRidge, Ridge from sklearn.model_selection import cross_val_score from sklearn.neighbors import KNeighborsRegressor +from sklearn.preprocessing import StandardScaler from sklearn.pipeline import make_pipeline N_SPLITS = 5 @@ -75,10 +76,10 @@ n_samples, n_features = X_full.shape # Estimate the score on the entire dataset, with no missing values -br_estimator = BayesianRidge() +main_estimator = BayesianRidge() score_full_data = pd.DataFrame( cross_val_score( - br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS + main_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS ), columns=["Full Data"], ) @@ -94,7 +95,7 @@ score_simple_imputer = pd.DataFrame() for strategy in ("mean", "median"): estimator = make_pipeline( - SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator + SimpleImputer(missing_values=np.nan, strategy=strategy), main_estimator ) score_simple_imputer[strategy] = cross_val_score( estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS @@ -102,9 +103,9 @@ # Estimate the score after iterative imputation of the missing values # with different estimators -estimators = [ - BayesianRidge(), - RandomForestRegressor( +named_estimators = [ + ("BayesianRidge", BayesianRidge()), + ("RandomForestRegressor", RandomForestRegressor( # We tuned the hyperparameters of the RandomForestRegressor to get a good # enough predictive performance for a restricted execution time. n_estimators=4, @@ -113,11 +114,11 @@ max_samples=0.5, n_jobs=2, random_state=0, - ), - make_pipeline( + )), + ("Nystroem&Ridge", make_pipeline( Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) - ), - KNeighborsRegressor(n_neighbors=15), + )), + ("Scaler&KNeighborsRegressor", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))) ] score_iterative_imputer = pd.DataFrame() # iterative imputer is sensible to the tolerance and @@ -126,14 +127,14 @@ # resources while not changing the results too much compared to keeping the # stricter default value for the tolerance parameter. 
tolerances = (1e-3, 1e-1, 1e-1, 1e-2) -for impute_estimator, tol in zip(estimators, tolerances): +for (name, impute_estimator), tol in zip(named_estimators, tolerances): estimator = make_pipeline( IterativeImputer( random_state=0, estimator=impute_estimator, max_iter=25, tol=tol ), - br_estimator, + main_estimator, ) - score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score( + score_iterative_imputer[name] = cross_val_score( estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS ) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 9d61ffc4964ee..8d718bda99eab 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -96,6 +96,7 @@ def add_missing_values(X_full, y_full): from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer from sklearn.model_selection import cross_val_score from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler N_SPLITS = 4 regressor = RandomForestRegressor(random_state=0) @@ -178,7 +179,7 @@ def get_impute_zero_score(X_missing, y_missing): def get_impute_knn_score(X_missing, y_missing): - imputer = KNNImputer(missing_values=np.nan, add_indicator=True) + imputer = make_pipeline(StandardScaler(), KNNImputer(missing_values=np.nan, add_indicator=True)) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return knn_impute_scores.mean(), knn_impute_scores.std() @@ -189,7 +190,7 @@ def get_impute_knn_score(X_missing, y_missing): mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score( X_miss_diabetes, y_miss_diabetes ) -x_labels.append("KNN Imputation") +x_labels.append("KNN Imputation (scaled)") # %% From ee096270b8f943133fa9bdbd47fbe04b8a54495a Mon Sep 17 00:00:00 2001 From: Ayrat Date: Thu, 24 Apr 2025 17:04:00 +0200 Subject: [PATCH 2/7] Apply suggestions from code review (next: resolving lint issues) Co-authored-by: Olivier Grisel --- .../impute/plot_iterative_imputer_variants_comparison.py | 8 ++++---- examples/impute/plot_missing_values.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 7fa7d2abf62c5..0e00fe0b73b15 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -104,8 +104,8 @@ # Estimate the score after iterative imputation of the missing values # with different estimators named_estimators = [ - ("BayesianRidge", BayesianRidge()), - ("RandomForestRegressor", RandomForestRegressor( + ("Bayesian Ridge", BayesianRidge()), + ("Random Forest", RandomForestRegressor( # We tuned the hyperparameters of the RandomForestRegressor to get a good # enough predictive performance for a restricted execution time. 
n_estimators=4, @@ -115,10 +115,10 @@ n_jobs=2, random_state=0, )), - ("Nystroem&Ridge", make_pipeline( + ("Nystroem + Ridge", make_pipeline( Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) )), - ("Scaler&KNeighborsRegressor", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))) + ("Scaler + k-NN ", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))) ] score_iterative_imputer = pd.DataFrame() # iterative imputer is sensible to the tolerance and diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index 8d718bda99eab..bc0eeaa1b2c72 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -190,7 +190,7 @@ def get_impute_knn_score(X_missing, y_missing): mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score( X_miss_diabetes, y_miss_diabetes ) -x_labels.append("KNN Imputation (scaled)") +x_labels.append("KNN Imputation (scaled features)") # %% From e4fe1d5a4c7890b632fb14a9707c1b273a8debc9 Mon Sep 17 00:00:00 2001 From: ayrat Date: Mon, 28 Apr 2025 16:08:12 +0200 Subject: [PATCH 3/7] fixed lint issues; now ready for merge --- ...t_iterative_imputer_variants_comparison.py | 39 ++++++++++++------- examples/impute/plot_missing_values.py | 4 +- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 0e00fe0b73b15..4a3f9b175e2df 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -61,8 +61,8 @@ from sklearn.linear_model import BayesianRidge, Ridge from sklearn.model_selection import cross_val_score from sklearn.neighbors import KNeighborsRegressor -from sklearn.preprocessing import StandardScaler from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler N_SPLITS = 5 @@ -105,20 +105,29 @@ # with different estimators named_estimators = [ ("Bayesian Ridge", BayesianRidge()), - ("Random Forest", RandomForestRegressor( - # We tuned the hyperparameters of the RandomForestRegressor to get a good - # enough predictive performance for a restricted execution time. - n_estimators=4, - max_depth=10, - bootstrap=True, - max_samples=0.5, - n_jobs=2, - random_state=0, - )), - ("Nystroem + Ridge", make_pipeline( - Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) - )), - ("Scaler + k-NN ", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))) + ( + "Random Forest", + RandomForestRegressor( + # We tuned the hyperparameters of the RandomForestRegressor to get a good + # enough predictive performance for a restricted execution time. 
+ n_estimators=4, + max_depth=10, + bootstrap=True, + max_samples=0.5, + n_jobs=2, + random_state=0, + ), + ), + ( + "Nystroem + Ridge", + make_pipeline( + Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) + ), + ), + ( + "Scaler + k-NN ", + make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15)), + ), ] score_iterative_imputer = pd.DataFrame() # iterative imputer is sensible to the tolerance and diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index bc0eeaa1b2c72..327eb148c84c8 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -179,7 +179,9 @@ def get_impute_zero_score(X_missing, y_missing): def get_impute_knn_score(X_missing, y_missing): - imputer = make_pipeline(StandardScaler(), KNNImputer(missing_values=np.nan, add_indicator=True)) + imputer = make_pipeline( + StandardScaler(), KNNImputer(missing_values=np.nan, add_indicator=True) + ) knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) return knn_impute_scores.mean(), knn_impute_scores.std() From d780e9e7d3cfb44b4f1856dd688c4e69635368f2 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Tue, 20 May 2025 10:14:56 +0200 Subject: [PATCH 4/7] Apply suggestions Co-authored-by: Virgil Chan --- examples/impute/plot_missing_values.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index b073788452fd8..cbe0ddf1e2562 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -192,7 +192,7 @@ def get_impute_knn_score(X_missing, y_missing): mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score( X_miss_diabetes, y_miss_diabetes ) -x_labels.append("KNN Imputation (scaled features)") +x_labels.append("KNN Imputation\n(scaled features)") # %% From 933a1f2d001ecf7baf3aabbe8e5ba4e66d117309 Mon Sep 17 00:00:00 2001 From: ayrat Date: Mon, 26 May 2025 12:29:06 +0200 Subject: [PATCH 5/7] using recommended practices for scaling data; other minor modifications --- ...t_iterative_imputer_variants_comparison.py | 57 +++--- examples/impute/plot_missing_values.py | 170 +++++++----------- 2 files changed, 98 insertions(+), 129 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 2b72bfea758f4..69eb233520b68 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -13,7 +13,7 @@ imputation with :class:`~impute.IterativeImputer`: * :class:`~linear_model.BayesianRidge`: regularized linear regression -* :class:`~ensemble.RandomForestRegressor`: Forests of randomized trees regression +* :class:`~ensemble.RandomForestRegressor`: forests of randomized trees regression * :func:`~pipeline.make_pipeline` (:class:`~kernel_approximation.Nystroem`, :class:`~linear_model.Ridge`): a pipeline with the expansion of a degree 2 polynomial kernel and regularized linear regression @@ -62,7 +62,7 @@ from sklearn.model_selection import cross_val_score from sklearn.neighbors import KNeighborsRegressor from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import RobustScaler N_SPLITS = 5 @@ -75,12 +75,25 @@ y_full = y_full[::10] n_samples, n_features = X_full.shape +target_estimator = 
BayesianRidge() + + +def compute_score_for(X, y, imputer): + # We scale data before imputation and training a target estimator, + # because our target estimator and some of the imputers assume + # that the features have similar scales. + if imputer is None: + estimator = make_pipeline(RobustScaler(), target_estimator) + else: + estimator = make_pipeline(RobustScaler(), imputer, target_estimator) + return cross_val_score( + estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS + ) + + # Estimate the score on the entire dataset, with no missing values -main_estimator = BayesianRidge() score_full_data = pd.DataFrame( - cross_val_score( - main_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS - ), + compute_score_for(X_full, y_full, None), columns=["Full Data"], ) @@ -93,13 +106,9 @@ # Estimate the score after imputation (mean and median strategies) score_simple_imputer = pd.DataFrame() -for strategy in ("mean", "median"): - estimator = make_pipeline( - SimpleImputer(missing_values=np.nan, strategy=strategy), main_estimator - ) - score_simple_imputer[strategy] = cross_val_score( - estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS - ) +score_simple_imputer["mean"] = compute_score_for( + X_missing, y_missing, SimpleImputer(strategy="mean") +) # Estimate the score after iterative imputation of the missing values # with different estimators @@ -110,7 +119,7 @@ RandomForestRegressor( # We tuned the hyperparameters of the RandomForestRegressor to get a good # enough predictive performance for a restricted execution time. - n_estimators=4, + n_estimators=5, max_depth=10, bootstrap=True, max_samples=0.5, @@ -121,30 +130,28 @@ ( "Nystroem + Ridge", make_pipeline( - Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3) + Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e4) ), ), ( - "Scaler + k-NN ", - make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15)), + "k-NN", + KNeighborsRegressor(n_neighbors=10), ), ] score_iterative_imputer = pd.DataFrame() -# iterative imputer is sensible to the tolerance and +# Iterative imputer is sensible to the tolerance and # dependent on the estimator used internally. -# we tuned the tolerance to keep this example run with limited computational +# We tuned the tolerance to keep this example run with limited computational # resources while not changing the results too much compared to keeping the # stricter default value for the tolerance parameter. 
tolerances = (1e-3, 1e-1, 1e-1, 1e-2) for (name, impute_estimator), tol in zip(named_estimators, tolerances): - estimator = make_pipeline( + score_iterative_imputer[name] = compute_score_for( + X_missing, + y_missing, IterativeImputer( - random_state=0, estimator=impute_estimator, max_iter=25, tol=tol + random_state=0, estimator=impute_estimator, max_iter=40, tol=tol ), - main_estimator, - ) - score_iterative_imputer[name] = cross_val_score( - estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS ) scores = pd.concat( diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index cbe0ddf1e2562..e046280b70d82 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -9,14 +9,15 @@ In this example we will investigate different imputation techniques: - imputation by the constant value 0 -- imputation by the mean value of each feature combined with a missing-ness - indicator auxiliary variable +- imputation by the mean value of each feature - k nearest neighbor imputation - iterative imputation +In all the cases, for each feature, we add a new feature indicating the missingness. + We will use two datasets: Diabetes dataset which consists of 10 feature variables collected from diabetes patients with an aim to predict disease -progression and California Housing dataset for which the target is the median +progression and California housing dataset for which the target is the median house value for California districts. As neither of these datasets have missing values, we will remove some @@ -36,9 +37,9 @@ # ############################################## # # First we download the two datasets. Diabetes dataset is shipped with -# scikit-learn. It has 442 entries, each with 10 features. California Housing +# scikit-learn. It has 442 entries, each with 10 features. California housing # dataset is much larger with 20640 entries and 8 features. It needs to be -# downloaded. We will only use the first 400 entries for the sake of speeding +# downloaded. We will only use the first 300 entries for the sake of speeding # up the calculations but feel free to use the whole dataset. # @@ -50,10 +51,11 @@ X_diabetes, y_diabetes = load_diabetes(return_X_y=True) X_california, y_california = fetch_california_housing(return_X_y=True) -X_california = X_california[:300] -y_california = y_california[:300] + X_diabetes = X_diabetes[:300] y_diabetes = y_diabetes[:300] +X_california = X_california[:300] +y_california = y_california[:300] def add_missing_values(X_full, y_full): @@ -75,16 +77,17 @@ def add_missing_values(X_full, y_full): return X_missing, y_missing -X_miss_california, y_miss_california = add_missing_values(X_california, y_california) - X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes) +X_miss_california, y_miss_california = add_missing_values(X_california, y_california) # %% # Impute the missing data and score # ################################# # Now we will write a function which will score the results on the differently -# imputed data. Let's look at each imputer separately: +# imputed data, including the case of no imputation for full data. +# We will use :class:`~sklearn.ensemble.RandomForestRegressor` for the target +# regression. 
# rng = np.random.RandomState(0) @@ -96,34 +99,29 @@ def add_missing_values(X_full, y_full): from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer from sklearn.model_selection import cross_val_score from sklearn.pipeline import make_pipeline -from sklearn.preprocessing import StandardScaler +from sklearn.preprocessing import RobustScaler N_SPLITS = 4 regressor = RandomForestRegressor(random_state=0) -# %% -# Missing information -# ------------------- -# In addition to imputing the missing values, the imputers have an -# `add_indicator` parameter that marks the values that were missing, which -# might carry some information. -# - -def get_scores_for_imputer(imputer, X_missing, y_missing): - estimator = make_pipeline(imputer, regressor) - impute_scores = cross_val_score( - estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS +def get_score(X, y, imputer): + if imputer is not None: + estimator = make_pipeline(imputer, regressor) + else: + estimator = regressor + scores = cross_val_score( + estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS ) - return impute_scores + return scores.mean(), scores.std() x_labels = [] -mses_california = np.zeros(5) -stds_california = np.zeros(5) mses_diabetes = np.zeros(5) stds_diabetes = np.zeros(5) +mses_california = np.zeros(5) +stds_california = np.zeros(5) # %% # Estimate the score @@ -132,16 +130,9 @@ def get_scores_for_imputer(imputer, X_missing, y_missing): # -def get_full_score(X_full, y_full): - full_scores = cross_val_score( - regressor, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS - ) - return full_scores.mean(), full_scores.std() - - -mses_california[0], stds_california[0] = get_full_score(X_california, y_california) -mses_diabetes[0], stds_diabetes[0] = get_full_score(X_diabetes, y_diabetes) -x_labels.append("Full data") +mses_diabetes[0], stds_diabetes[0] = get_score(X_diabetes, y_diabetes, None) +mses_california[0], stds_california[0] = get_score(X_california, y_california, None) +x_labels.append("Full Data") # %% @@ -152,66 +143,48 @@ def get_full_score(X_full, y_full): # replaced by 0: # - -def get_impute_zero_score(X_missing, y_missing): - imputer = SimpleImputer( - missing_values=np.nan, add_indicator=True, strategy="constant", fill_value=0 - ) - zero_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - return zero_impute_scores.mean(), zero_impute_scores.std() - - -mses_california[1], stds_california[1] = get_impute_zero_score( - X_miss_california, y_miss_california +imputer = SimpleImputer(strategy="constant", fill_value=0, add_indicator=True) +mses_diabetes[1], stds_diabetes[1] = get_score( + X_miss_diabetes, y_miss_diabetes, imputer ) -mses_diabetes[1], stds_diabetes[1] = get_impute_zero_score( - X_miss_diabetes, y_miss_diabetes +mses_california[1], stds_california[1] = get_score( + X_miss_california, y_miss_california, imputer ) -x_labels.append("Zero imputation") - +x_labels.append("Zero Imputation") # %% -# kNN-imputation of the missing values -# ------------------------------------ +# Impute missing values with mean +# ------------------------------- # -# :class:`~sklearn.impute.KNNImputer` imputes missing values using the weighted -# or unweighted mean of the desired number of nearest neighbors. 
- -def get_impute_knn_score(X_missing, y_missing): - imputer = make_pipeline( - StandardScaler(), KNNImputer(missing_values=np.nan, add_indicator=True) - ) - knn_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - return knn_impute_scores.mean(), knn_impute_scores.std() - - -mses_california[2], stds_california[2] = get_impute_knn_score( - X_miss_california, y_miss_california +imputer = SimpleImputer(strategy="mean", add_indicator=True) +mses_diabetes[2], stds_diabetes[2] = get_score( + X_miss_diabetes, y_miss_diabetes, imputer ) -mses_diabetes[2], stds_diabetes[2] = get_impute_knn_score( - X_miss_diabetes, y_miss_diabetes +mses_california[2], stds_california[2] = get_score( + X_miss_california, y_miss_california, imputer ) -x_labels.append("KNN Imputation\n(scaled features)") +x_labels.append("Mean Imputation") # %% -# Impute missing values with mean -# ------------------------------- +# kNN-imputation of the missing values +# ------------------------------------ +# +# :class:`~sklearn.impute.KNNImputer` imputes missing values using the weighted +# or unweighted mean of the desired number of nearest neighbors. If your features +# have vastly different scales (as in the California housing dataset), +# consider re-scaling them to potentially improve performance. # - -def get_impute_mean(X_missing, y_missing): - imputer = SimpleImputer(missing_values=np.nan, strategy="mean", add_indicator=True) - mean_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - return mean_impute_scores.mean(), mean_impute_scores.std() - - -mses_california[3], stds_california[3] = get_impute_mean( - X_miss_california, y_miss_california +imputer = KNNImputer(add_indicator=True) +mses_diabetes[3], stds_diabetes[3] = get_score( + X_miss_diabetes, y_miss_diabetes, imputer ) -mses_diabetes[3], stds_diabetes[3] = get_impute_mean(X_miss_diabetes, y_miss_diabetes) -x_labels.append("Mean Imputation") +mses_california[3], stds_california[3] = get_score( + X_miss_california, y_miss_california, make_pipeline(RobustScaler(), imputer) +) +x_labels.append("KNN Imputation") # %% @@ -219,32 +192,21 @@ def get_impute_mean(X_missing, y_missing): # ------------------------------------------ # # Another option is the :class:`~sklearn.impute.IterativeImputer`. This uses -# round-robin linear regression, modeling each feature with missing values as a -# function of other features, in turn. -# The version implemented assumes Gaussian (output) variables. If your features -# are obviously non-normal, consider transforming them to look more normal -# to potentially improve performance. +# round-robin regression, modeling each feature with missing values as a +# function of other features, in turn. We use the class's default choice +# of the regressor model (:class:`~sklearn.linear_model.BayesianRidge`) +# to predict missing feature values. The performance of the predictor +# may be negatively affected by vastly different scales of the features, +# so we re-scale the features in the California housing dataset. 
# +imputer = IterativeImputer(add_indicator=True) -def get_impute_iterative(X_missing, y_missing): - imputer = IterativeImputer( - missing_values=np.nan, - add_indicator=True, - random_state=0, - n_nearest_features=3, - max_iter=1, - sample_posterior=True, - ) - iterative_impute_scores = get_scores_for_imputer(imputer, X_missing, y_missing) - return iterative_impute_scores.mean(), iterative_impute_scores.std() - - -mses_california[4], stds_california[4] = get_impute_iterative( - X_miss_california, y_miss_california +mses_diabetes[4], stds_diabetes[4] = get_score( + X_miss_diabetes, y_miss_diabetes, imputer ) -mses_diabetes[4], stds_diabetes[4] = get_impute_iterative( - X_miss_diabetes, y_miss_diabetes +mses_california[4], stds_california[4] = get_score( + X_miss_california, y_miss_california, make_pipeline(RobustScaler(), imputer) ) x_labels.append("Iterative Imputation") From be1e95c23642e0ebe82124cd25b6a69918c82da4 Mon Sep 17 00:00:00 2001 From: ayrat Date: Thu, 5 Jun 2025 11:49:48 +0200 Subject: [PATCH 6/7] changes suggested by @betatim; removed unnecessary global variables (target_estimator) and re-initialization of rng --- ...t_iterative_imputer_variants_comparison.py | 20 ++++++++---------- examples/impute/plot_missing_values.py | 21 +++++++++---------- 2 files changed, 19 insertions(+), 22 deletions(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 69eb233520b68..9fafd3d120a6c 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -66,8 +66,6 @@ N_SPLITS = 5 -rng = np.random.RandomState(0) - X_full, y_full = fetch_california_housing(return_X_y=True) # ~2k samples is enough for the purpose of the example. # Remove the following two lines for a slower run with different error bars. @@ -75,17 +73,15 @@ y_full = y_full[::10] n_samples, n_features = X_full.shape -target_estimator = BayesianRidge() - -def compute_score_for(X, y, imputer): +def compute_score_for(X, y, imputer=None): # We scale data before imputation and training a target estimator, # because our target estimator and some of the imputers assume # that the features have similar scales. 
if imputer is None: - estimator = make_pipeline(RobustScaler(), target_estimator) + estimator = make_pipeline(RobustScaler(), BayesianRidge()) else: - estimator = make_pipeline(RobustScaler(), imputer, target_estimator) + estimator = make_pipeline(RobustScaler(), imputer, BayesianRidge()) return cross_val_score( estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS ) @@ -93,11 +89,12 @@ def compute_score_for(X, y, imputer): # Estimate the score on the entire dataset, with no missing values score_full_data = pd.DataFrame( - compute_score_for(X_full, y_full, None), + compute_score_for(X_full, y_full), columns=["Full Data"], ) # Add a single missing value to each row +rng = np.random.RandomState(0) X_missing = X_full.copy() y_missing = y_full missing_samples = np.arange(n_samples) @@ -106,9 +103,10 @@ def compute_score_for(X, y, imputer): # Estimate the score after imputation (mean and median strategies) score_simple_imputer = pd.DataFrame() -score_simple_imputer["mean"] = compute_score_for( - X_missing, y_missing, SimpleImputer(strategy="mean") -) +for strategy in ("mean", "median"): + score_simple_imputer[strategy] = compute_score_for( + X_missing, y_missing, SimpleImputer(strategy=strategy) + ) # Estimate the score after iterative imputation of the missing values # with different estimators diff --git a/examples/impute/plot_missing_values.py b/examples/impute/plot_missing_values.py index e046280b70d82..c7474eb338357 100644 --- a/examples/impute/plot_missing_values.py +++ b/examples/impute/plot_missing_values.py @@ -47,8 +47,6 @@ from sklearn.datasets import fetch_california_housing, load_diabetes -rng = np.random.RandomState(42) - X_diabetes, y_diabetes = load_diabetes(return_X_y=True) X_california, y_california = fetch_california_housing(return_X_y=True) @@ -58,7 +56,7 @@ y_california = y_california[:300] -def add_missing_values(X_full, y_full): +def add_missing_values(X_full, y_full, rng): n_samples, n_features = X_full.shape # Add missing values in 75% of the lines @@ -77,8 +75,11 @@ def add_missing_values(X_full, y_full): return X_missing, y_missing -X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes) -X_miss_california, y_miss_california = add_missing_values(X_california, y_california) +rng = np.random.RandomState(42) +X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes, rng) +X_miss_california, y_miss_california = add_missing_values( + X_california, y_california, rng +) # %% @@ -90,8 +91,6 @@ def add_missing_values(X_full, y_full): # regression. 
# -rng = np.random.RandomState(0) - from sklearn.ensemble import RandomForestRegressor # To use the experimental IterativeImputer, we need to explicitly ask for it: @@ -102,10 +101,10 @@ def add_missing_values(X_full, y_full): from sklearn.preprocessing import RobustScaler N_SPLITS = 4 -regressor = RandomForestRegressor(random_state=0) -def get_score(X, y, imputer): +def get_score(X, y, imputer=None): + regressor = RandomForestRegressor(random_state=0) if imputer is not None: estimator = make_pipeline(imputer, regressor) else: @@ -130,8 +129,8 @@ def get_score(X, y, imputer): # -mses_diabetes[0], stds_diabetes[0] = get_score(X_diabetes, y_diabetes, None) -mses_california[0], stds_california[0] = get_score(X_california, y_california, None) +mses_diabetes[0], stds_diabetes[0] = get_score(X_diabetes, y_diabetes) +mses_california[0], stds_california[0] = get_score(X_california, y_california) x_labels.append("Full Data") From 15acf02aa4487fcc5f875fcc16729172c9714f17 Mon Sep 17 00:00:00 2001 From: Ayrat Date: Thu, 5 Jun 2025 15:05:08 +0200 Subject: [PATCH 7/7] Update examples/impute/plot_iterative_imputer_variants_comparison.py Co-authored-by: Tim Head --- examples/impute/plot_iterative_imputer_variants_comparison.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/impute/plot_iterative_imputer_variants_comparison.py b/examples/impute/plot_iterative_imputer_variants_comparison.py index 9fafd3d120a6c..854d443b229d0 100644 --- a/examples/impute/plot_iterative_imputer_variants_comparison.py +++ b/examples/impute/plot_iterative_imputer_variants_comparison.py @@ -137,7 +137,7 @@ def compute_score_for(X, y, imputer=None): ), ] score_iterative_imputer = pd.DataFrame() -# Iterative imputer is sensible to the tolerance and +# Iterative imputer is sensitive to the tolerance and # dependent on the estimator used internally. # We tuned the tolerance to keep this example run with limited computational # resources while not changing the results too much compared to keeping the
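---

The sketches below accompany the series; they are illustrative only and are not part of any patch.

The central change of the series is wrapping distance-based estimation in a scaling pipeline. A minimal standalone sketch of why that matters for KNeighborsRegressor, on synthetic data with deliberately mismatched feature scales (the data here is invented for illustration):

import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
# The second feature has a range three orders of magnitude wider than the
# first, similar to the raw features of the California housing dataset.
X = rng.normal(size=(300, 2)) * np.array([1.0, 1000.0])
y = X[:, 0] + 0.1 * rng.normal(size=300)  # only the small feature is informative

for name, estimator in [
    ("bare k-NN", KNeighborsRegressor(n_neighbors=15)),
    ("scaled k-NN", make_pipeline(StandardScaler(), KNeighborsRegressor(n_neighbors=15))),
]:
    scores = cross_val_score(estimator, X, y, scoring="neg_mean_squared_error", cv=5)
    print(name, scores.mean())
# Without scaling, the wide-range feature dominates the Euclidean distances,
# so the neighbor search mostly ignores the informative feature.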
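Every imputer in plot_missing_values.py is constructed with add_indicator=True. A small sketch of what that flag does, using only the public SimpleImputer API:

import numpy as np

from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan],
              [2.0, 4.0],
              [np.nan, 6.0]])
imputer = SimpleImputer(strategy="mean", add_indicator=True)
Xt = imputer.fit_transform(X)
print(Xt.shape)  # (3, 4): two imputed columns plus two binary
                 # missingness-indicator columns
print(Xt)
# [[1.   5.   0.   1. ]
#  [2.   4.   0.   0. ]
#  [1.5  6.   1.   0. ]]
# The indicator columns let the downstream regressor learn from the
# missingness pattern itself, not just from the imputed values.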
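PATCH 5/7 and 6/7 converge on a single scoring helper in which RobustScaler always runs first; scikit-learn scalers pass NaN values through untouched, which is why the scaler can precede the imputer in the pipeline. A usage sketch, assuming the compute_score_for helper and the X_full/X_missing arrays from the final version of plot_iterative_imputer_variants_comparison.py are in scope:

# To use the experimental IterativeImputer, we need to explicitly ask for it:
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, SimpleImputer

# Baseline on complete data: RobustScaler -> BayesianRidge.
score_full = compute_score_for(X_full, y_full)

# Mean imputation: RobustScaler -> SimpleImputer -> BayesianRidge.
score_mean = compute_score_for(X_missing, y_missing, SimpleImputer(strategy="mean"))

# Iterative imputation with its default BayesianRidge round-robin model:
score_iterative = compute_score_for(
    X_missing, y_missing, IterativeImputer(random_state=0, max_iter=40, tol=1e-3)
)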
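PATCH 6/7 threads the RandomState through add_missing_values rather than relying on a module-level rng. The diffs only show the function's signature and its contract (exactly one missing feature in 75% of the rows), so the body below is a hypothetical reconstruction of that behavior:

import numpy as np

def add_missing_values(X_full, y_full, rng):
    # Hypothetical body; the diffs show only the signature, the
    # "75% of the lines" comment, and the return statement.
    n_samples, n_features = X_full.shape

    # Add missing values in 75% of the lines
    missing_rate = 0.75
    n_missing_samples = int(n_samples * missing_rate)

    missing_samples = np.zeros(n_samples, dtype=bool)
    missing_samples[:n_missing_samples] = True
    rng.shuffle(missing_samples)
    # One randomly chosen feature is knocked out in each affected row.
    missing_features = rng.randint(0, n_features, n_missing_samples)

    X_missing = X_full.copy()
    X_missing[missing_samples, missing_features] = np.nan
    y_missing = y_full.copy()

    return X_missing, y_missing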