scikit-learn · thomasjpfan · Jun 22, 2023 · Jun 22, 2023 · adrinjalali · Jun 22, 2023
diff --git a/doc/whats_new/v1.4.rst b/doc/whats_new/v1.4.rst
@@ -43,6 +43,13 @@ Changes impacting all modules
   to work with our estimators and functions.
   :pr:`26464` by `Thomas Fan`_.
 
+:mod:`sklearn.compose`
+......................
+
+- |Feature| Adds polars input support to :class:`compose.ColumnTransformer` through the
+  `DataFrame API specification <https://data-apis.org/dataframe-api/draft/index.html>`__.
+  :pr:`26669` by `Thomas Fan`_.
+
 Code and Documentation Contributors
 -----------------------------------
 

diff --git a/sklearn/compose/_column_transformer.py b/sklearn/compose/_column_transformer.py
@@ -620,7 +620,7 @@ def _validate_output(self, result):
             name for name, _, _, _ in self._iter(fitted=True, replace_strings=True)
         ]
         for Xs, name in zip(result, names):
-            if not getattr(Xs, "ndim", 0) == 2:
+            if not getattr(Xs, "ndim", 0) == 2 and not hasattr(Xs, "__dataframe__"):
                 raise ValueError(
                     "The output of the '{0}' transformer should be 2D (scipy "
                     "matrix, array, or pandas DataFrame).".format(name)

diff --git a/sklearn/compose/tests/test_column_transformer.py b/sklearn/compose/tests/test_column_transformer.py
@@ -24,6 +24,7 @@
     StandardScaler,
 )
 from sklearn.utils._testing import (
+    _convert_container,
     assert_allclose_dense_sparse,
     assert_almost_equal,
     assert_array_equal,
@@ -39,7 +40,7 @@ def transform(self, X, y=None):
         if hasattr(X, "to_frame"):
             return X.to_frame()
         # 1D array -> 2D array
-        if X.ndim == 1:
+        if hasattr(X, "ndim") and X.ndim == 1:
             return np.atleast_2d(X).T
         return X
 
@@ -160,11 +161,17 @@ def test_column_transformer_tuple_transformers_parameter():
     )
 
 
-def test_column_transformer_dataframe():
-    pd = pytest.importorskip("pandas")
+@pytest.mark.parametrize("constructor_name", ["dataframe", "polars"])
+def test_column_transformer_dataframe(constructor_name):
+    if constructor_name == "dataframe":
+        dataframe_lib = pytest.importorskip("pandas")
+    else:
+        dataframe_lib = pytest.importorskip(constructor_name)
 
     X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
-    X_df = pd.DataFrame(X_array, columns=["first", "second"])
+    X_df = _convert_container(
+        X_array, constructor_name, columns_name=["first", "second"]
+    )
 
     X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
     X_res_both = X_array
@@ -190,10 +197,17 @@ def test_column_transformer_dataframe():
         (slice(0, 2), X_res_both),
         # boolean mask
         (np.array([True, False]), X_res_first),
-        (pd.Series([True, False], index=["first", "second"]), X_res_first),
         ([True, False], X_res_first),
     ]
 
+    if constructor_name == "dataframe":
+        cases.append(
+            (
+                dataframe_lib.Series([True, False], index=["first", "second"]),
+                X_res_first,
+            )
+        )
+
     for selection, res in cases:
         ct = ColumnTransformer([("trans", Trans(), selection)], remainder="drop")
         assert_array_equal(ct.fit_transform(X_df), res)
@@ -261,8 +275,9 @@ def fit(self, X, y=None):
             return self
 
         def transform(self, X, y=None):
-            assert isinstance(X, (pd.DataFrame, pd.Series))
-            if isinstance(X, pd.Series):
+            assert isinstance(X, (dataframe_lib.DataFrame, dataframe_lib.Series))
+
+            if X.__class__.__name__ == "Series":
                 X = X.to_frame()
             return X
 
@@ -271,17 +286,18 @@ def transform(self, X, y=None):
     ct = ColumnTransformer([("trans", TransAssert(), ["first", "second"])])
     ct.fit_transform(X_df)
 
-    # integer column spec + integer column names -> still use positional
-    X_df2 = X_df.copy()
-    X_df2.columns = [1, 0]
-    ct = ColumnTransformer([("trans", Trans(), 0)], remainder="drop")
-    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
-    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)
-
-    assert len(ct.transformers_) == 2
-    assert ct.transformers_[-1][0] == "remainder"
-    assert ct.transformers_[-1][1] == "drop"
-    assert_array_equal(ct.transformers_[-1][2], [1])
+    if constructor_name == "dataframe":
+        # integer column spec + integer column names -> still use positional
+        X_df2 = X_df.copy()
+        X_df2.columns = [1, 0]
+        ct = ColumnTransformer([("trans", Trans(), 0)], remainder="drop")
+        assert_array_equal(ct.fit_transform(X_df2), X_res_first)
+        assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)
+
+        assert len(ct.transformers_) == 2
+        assert ct.transformers_[-1][0] == "remainder"
+        assert ct.transformers_[-1][1] == "drop"
+        assert_array_equal(ct.transformers_[-1][2], [1])
 
 
 @pytest.mark.parametrize("pandas", [True, False], ids=["pandas", "numpy"])

diff --git a/sklearn/externals/_dataframe_api/LICENSE b/sklearn/externals/_dataframe_api/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023, Marco Gorelli
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/sklearn/externals/_dataframe_api/README.md b/sklearn/externals/_dataframe_api/README.md
@@ -0,0 +1 @@
+The files in this folder are vendored from https://github.com/MarcoGorelli/impl-dataframe-api
diff --git a/sklearn/externals/_dataframe_api/__init__.py b/sklearn/externals/_dataframe_api/__init__.py
@@ -0,0 +1,25 @@
+from ...utils.validation import _is_pandas_df
+from ...utils.validation import _is_polars_df
+
+__all__ = ["get_dataframe_standard", "has_supported_dataframe_standards"]
+
+
+def get_dataframe_standard(df):  # look up the dataframe-standard object for df
+    if hasattr(df, "__dataframe_standard__"):  # df implements the protocol itself
+        return df.__dataframe_standard__()
+    elif _is_pandas_df(df):  # otherwise use the vendored pandas implementation
+        from .pandas_standard import dataframe_standard as pandas_dataframe_standard
+
+        return pandas_dataframe_standard(df)
+    elif _is_polars_df(df):  # same fallback, vendored polars implementation
+        from .polars_standard import dataframe_standard as polars_dataframe_standard
+
+        return polars_dataframe_standard(df)
+    else:  # none of the three cases matched, so the input is rejected
+        raise ValueError("Only pandas and polars DataFrames are supported.")
+
+
+def has_supported_dataframe_standards(df):  # same 3 checks as get_dataframe_standard
+    return (
+        hasattr(df, "__dataframe_standard__") or _is_pandas_df(df) or _is_polars_df(df)
+    )
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		The files in this folder are vendored from https://github.com/MarcoGorelli/impl-dataframe-api