scikit-learn · adrinjalali · Dec 4, 2023 · Jun 23, 2023 · Jun 23, 2023 · Jun 23, 2023
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
@@ -272,6 +272,11 @@ Changelog
 :mod:`sklearn.compose`
 ......................
 
+- |MajorFeature| Adds `polars <https://www.pola.rs>`__ input support to
+  :class:`compose.ColumnTransformer` through the `DataFrame Interchange Protocol
+  <https://data-apis.org/dataframe-protocol/latest/purpose_and_scope.html>`__.
+  The minimum supported version for polars is `0.19.12`. :pr:`26683` by `Thomas Fan`_.
+
 - |API| |FIX| :class:`~compose.ColumnTransformer` now replaces `"passthrough"`
   with a corresponding :class:`~preprocessing.FunctionTransformer` in the
   fitted ``transformers_`` attribute. :pr:`27204` by `Adrin Jalali`_.

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -37,6 +37,8 @@
 from ..utils.parallel import Parallel, delayed
 from ..utils.validation import (
     _check_feature_names_in,
+    _get_feature_names,
+    _is_pandas_df,
     _num_samples,
     check_array,
     check_is_fitted,
@@ -500,9 +502,8 @@ def _validate_remainder(self, X):
         Validates ``remainder`` and defines ``_remainder`` targeting
         the remaining columns.
         """
-        self._n_features = X.shape[1]
         cols = set(chain(*self._transformer_to_input_indices.values()))
-        remaining = sorted(set(range(self._n_features)) - cols)
+        remaining = sorted(set(range(self.n_features_in_)) - cols)
         self._remainder = ("remainder", self.remainder, remaining)
         self._transformer_to_input_indices["remainder"] = remaining
 
@@ -676,10 +677,10 @@ def _validate_output(self, result):
             )
         ]
         for Xs, name in zip(result, names):
-            if not getattr(Xs, "ndim", 0) == 2:
+            if not getattr(Xs, "ndim", 0) == 2 and not hasattr(Xs, "__dataframe__"):
                 raise ValueError(
-                    "The output of the '{0}' transformer should be 2D (scipy "
-                    "matrix, array, or pandas DataFrame).".format(name)
+                    "The output of the '{0}' transformer should be 2D (numpy array, "
+                    "scipy sparse array, dataframe).".format(name)
                 )
 
     def _record_output_indices(self, Xs):
@@ -867,6 +868,8 @@ def fit_transform(self, X, y=None, **params):
         # set n_features_in_ attribute
         self._check_n_features(X, reset=True)
         self._validate_transformers()
+        n_samples = _num_samples(X)
+
         self._validate_column_callables(X)
         self._validate_remainder(X)
 
@@ -886,7 +889,7 @@ def fit_transform(self, X, y=None, **params):
         if not result:
             self._update_fitted_transformers([])
             # All transformers are None
-            return np.zeros((X.shape[0], 0))
+            return np.zeros((n_samples, 0))
 
         Xs, transformers = zip(*result)
 
@@ -942,9 +945,12 @@ def transform(self, X, **params):
         # enables the user to pass X at transform time with extra columns which
         # were not present in fit time, and the order of the columns doesn't
         # matter.
-        fit_dataframe_and_transform_dataframe = hasattr(
-            self, "feature_names_in_"
-        ) and hasattr(X, "columns")
+        fit_dataframe_and_transform_dataframe = hasattr(self, "feature_names_in_") and (
+            _is_pandas_df(X) or hasattr(X, "__dataframe__")
+        )
+
+        n_samples = _num_samples(X)
+        column_names = _get_feature_names(X)
 
         if fit_dataframe_and_transform_dataframe:
             named_transformers = self.named_transformers_
@@ -959,7 +965,7 @@ def transform(self, X, **params):
             all_indices = set(chain(*non_dropped_indices))
             all_names = set(self.feature_names_in_[ind] for ind in all_indices)
 
-            diff = all_names - set(X.columns)
+            diff = all_names - set(column_names)
             if diff:
                 raise ValueError(f"columns are missing: {diff}")
         else:
@@ -983,7 +989,7 @@ def transform(self, X, **params):
 
         if not Xs:
             # All transformers are None
-            return np.zeros((X.shape[0], 0))
+            return np.zeros((n_samples, 0))
 
         return self._hstack(list(Xs))
 
@@ -1138,8 +1144,8 @@ def get_metadata_routing(self):
 
 
 def _check_X(X):
-    """Use check_array only on lists and other non-array-likes / sparse"""
-    if hasattr(X, "__array__") or sparse.issparse(X):
+    """Use check_array only when necessary, e.g. on lists and other non-array-likes."""
+    if hasattr(X, "__array__") or hasattr(X, "__dataframe__") or sparse.issparse(X):
         return X
     return check_array(X, force_all_finite="allow-nan", dtype=object)
 

diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -29,6 +29,7 @@
     check_recorded_metadata,
 )
 from sklearn.utils._testing import (
+    _convert_container,
     assert_allclose_dense_sparse,
     assert_almost_equal,
     assert_array_equal,
@@ -45,7 +46,7 @@ def transform(self, X, y=None):
         if hasattr(X, "to_frame"):
             return X.to_frame()
         # 1D array -> 2D array
-        if X.ndim == 1:
+        if getattr(X, "ndim", 2) == 1:
             return np.atleast_2d(X).T
         return X
 
@@ -169,27 +170,29 @@ def test_column_transformer_tuple_transformers_parameter():
     )
 
 
-def test_column_transformer_dataframe():
-    pd = pytest.importorskip("pandas")
+@pytest.mark.parametrize("constructor_name", ["dataframe", "polars"])
+def test_column_transformer_dataframe(constructor_name):
+    if constructor_name == "dataframe":
+        dataframe_lib = pytest.importorskip("pandas")
+    else:
+        dataframe_lib = pytest.importorskip(constructor_name)
 
     X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
-    X_df = pd.DataFrame(X_array, columns=["first", "second"])
+    X_df = _convert_container(
+        X_array, constructor_name, columns_name=["first", "second"]
+    )
 
     X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
     X_res_both = X_array
 
     cases = [
         # String keys: label based
-        # scalar
-        ("first", X_res_first),
         # list
         (["first"], X_res_first),
         (["first", "second"], X_res_both),
         # slice
         (slice("first", "second"), X_res_both),
         # int keys: positional
-        # scalar
-        (0, X_res_first),
         # list
         ([0], X_res_first),
         ([0, 1], X_res_both),
@@ -199,9 +202,21 @@ def test_column_transformer_dataframe():
         (slice(0, 2), X_res_both),
         # boolean mask
         (np.array([True, False]), X_res_first),
-        (pd.Series([True, False], index=["first", "second"]), X_res_first),
         ([True, False], X_res_first),
     ]
+    if constructor_name == "dataframe":
+        # Scalars are only supported for pandas dataframes.
+        cases.extend(
+            [
+                # scalar
+                (0, X_res_first),
+                ("first", X_res_first),
+                (
+                    dataframe_lib.Series([True, False], index=["first", "second"]),
+                    X_res_first,
+                ),
+            ]
+        )
 
     for selection, res in cases:
         ct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop")
@@ -274,37 +289,49 @@ def fit(self, X, y=None):
 
         def transform(self, X, y=None):
             assert isinstance(X, self.expected_type_transform)
-            if isinstance(X, pd.Series):
+            if isinstance(X, dataframe_lib.Series):
                 X = X.to_frame()
             return X
 
-    ct = ColumnTransformer(
-        [("trans", TransAssert(expected_type_transform=pd.Series), "first")],
-        remainder="drop",
-    )
-    ct.fit_transform(X_df)
     ct = ColumnTransformer(
         [
             (
                 "trans",
-                TransAssert(expected_type_transform=pd.DataFrame),
+                TransAssert(expected_type_transform=dataframe_lib.DataFrame),
                 ["first", "second"],
             )
         ]
     )
     ct.fit_transform(X_df)
 
-    # integer column spec + integer column names -> still use positional
-    X_df2 = X_df.copy()
-    X_df2.columns = [1, 0]
-    ct = ColumnTransformer([("trans", Trans(), 0)], remainder="drop")
-    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
-    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)
+    if constructor_name == "dataframe":
+        # DataFrame protocol does not have 1d columns, so we only test on Pandas
+        # dataframes.
+        ct = ColumnTransformer(
+            [
+                (
+                    "trans",
+                    TransAssert(expected_type_transform=dataframe_lib.Series),
+                    "first",
+                )
+            ],
+            remainder="drop",
+        )
+        ct.fit_transform(X_df)
 
-    assert len(ct.transformers_) == 2
-    assert ct.transformers_[-1][0] == "remainder"
-    assert ct.transformers_[-1][1] == "drop"
-    assert_array_equal(ct.transformers_[-1][2], [1])
+        # Only test on pandas because the dataframe protocol requires string column
+        # names.
+        # integer column spec + integer column names -> still use positional
+        X_df2 = X_df.copy()
+        X_df2.columns = [1, 0]
+        ct = ColumnTransformer([("trans", Trans(), 0)], remainder="drop")
+        assert_array_equal(ct.fit_transform(X_df2), X_res_first)
+        assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)
+
+        assert len(ct.transformers_) == 2
+        assert ct.transformers_[-1][0] == "remainder"
+        assert ct.transformers_[-1][1] == "drop"
+        assert_array_equal(ct.transformers_[-1][2], [1])
 
 
 @pytest.mark.parametrize("pandas", [True, False], ids=["pandas", "numpy"])
@@ -2248,6 +2275,32 @@ def test_remainder_set_output():
     assert isinstance(out, np.ndarray)
 
 
+def test_dataframe_different_dataframe_libraries():
+    """Check fitting and transforming on pandas and polars dataframes."""
+    pd = pytest.importorskip("pandas")
+    pl = pytest.importorskip("polars")
+    X_train_np = np.array([[0, 1], [2, 4], [4, 5]])
+    X_test_np = np.array([[1, 2], [1, 3], [2, 3]])
+
+    # Fit on pandas and transform on polars
+    X_train_pd = pd.DataFrame(X_train_np, columns=["a", "b"])
+    X_test_pl = pl.DataFrame(X_test_np, schema=["a", "b"])
+
+    ct = make_column_transformer((Trans(), [0, 1]))
+    ct.fit(X_train_pd)
+
+    out_pl_in = ct.transform(X_test_pl)
+    assert_array_equal(out_pl_in, X_test_np)
+
+    # Fit on polars and transform on pandas
+    X_train_pl = pl.DataFrame(X_train_np, schema=["a", "b"])
+    X_test_pd = pd.DataFrame(X_test_np, columns=["a", "b"])
+    ct.fit(X_train_pl)
+
+    out_pd_in = ct.transform(X_test_pd)
+    assert_array_equal(out_pd_in, X_test_np)
+
+
 # Metadata Routing Tests
 # ======================
 

diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
@@ -14,6 +14,7 @@
 
 import sklearn
 from sklearn._config import config_context
+from sklearn._min_dependencies import dependent_packages
 from sklearn.base import BaseEstimator
 from sklearn.datasets import make_blobs
 from sklearn.ensemble import RandomForestRegressor
@@ -70,6 +71,7 @@
     _get_feature_names,
     _is_fitted,
     _is_pandas_df,
+    _is_polars_df,
     _num_features,
     _num_samples,
     assert_all_finite,
@@ -1753,6 +1755,38 @@ def test_is_pandas_df_pandas_not_installed(hide_available_pandas):
     assert not _is_pandas_df(1)
 
 
+@pytest.mark.parametrize(
+    "constructor_name, minversion",
+    [
+        ("pyarrow", dependent_packages["pyarrow"][0]),
+        ("dataframe", dependent_packages["pandas"][0]),
+        ("polars", dependent_packages["polars"][0]),
+    ],
+)
+def test_is_polars_df_other_libraries(constructor_name, minversion):
+    df = _convert_container(
+        [[1, 4, 2], [3, 3, 6]],
+        constructor_name,
+        minversion=minversion,
+    )
+    if constructor_name in ("pyarrow", "dataframe"):
+        assert not _is_polars_df(df)
+    else:
+        assert _is_polars_df(df)
+
+
+def test_is_polars_df_pandas_not_installed():
+    """Check _is_polars_df for object that looks like a polars dataframe"""
+
+    class NotAPolarsDataFrame:
+        def __init__(self):
+            self.columns = [1, 2, 3]
+            self.schema = "my_schema"
+
+    not_a_polars_df = NotAPolarsDataFrame()
+    assert not _is_polars_df(not_a_polars_df)
+
+
 def test_get_feature_names_numpy():
     """Get feature names return None for numpy arrays."""
     X = np.array([[1, 2, 3], [4, 5, 6]])
@@ -1957,6 +1991,14 @@ def test_check_array_multiple_extensions(
     assert_array_equal(X_regular_checked, X_extension_checked)
 
 
+def test_num_samples_dataframe_protocol():
+    """Use the DataFrame interchange protocol to get n_samples from polars."""
+    pl = pytest.importorskip("polars")
+
+    df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    assert _num_samples(df) == 3
+
+
 @pytest.mark.parametrize(
     "sparse_container",
     CSR_CONTAINERS + CSC_CONTAINERS + COO_CONTAINERS + DIA_CONTAINERS,

diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
@@ -348,6 +348,9 @@ def _num_samples(x):
         # Don't get num_samples from an ensembles length!
         raise TypeError(message)
 
+    if _use_interchange_protocol(x):
+        return x.__dataframe__().num_rows()
+
     if not hasattr(x, "__len__") and not hasattr(x, "shape"):
         if hasattr(x, "__array__"):
             x = np.asarray(x)