ENH improve ARFF parser using pandas #21938

Merged: 305 commits (May 12, 2022)

Commits
d94a53b
iter
glemaitre Mar 7, 2022
7de00ca
address comments by thomas
glemaitre Mar 8, 2022
40e9550
address thomas comment
glemaitre Mar 10, 2022
aae0d16
fix
glemaitre Mar 10, 2022
221437d
DOC: use notebook-style for wikipedia_principal_eigenvector.py (#22704)
AmarCodes-22 Mar 7, 2022
54f6e07
CLN Removes unused fused type (#22727)
thomasjpfan Mar 8, 2022
819dcb3
ENH Adds feature_names_out for most of kernel_approximation (#22694)
thomasjpfan Mar 8, 2022
842dc16
DOC Fixes VotingClassifier.transform docstring (#22698)
thomasjpfan Mar 8, 2022
f055862
ENH Adds get_feature_names_out for AdditiveChi2Sampler (#22137)
thomasjpfan Mar 8, 2022
43be0df
ENH Improve error message for top_k_accuracy_score (#22284)
thomasjpfan Mar 8, 2022
aa2190a
CI Fixes job check-manifest dependency (#22732)
thomasjpfan Mar 8, 2022
a12f746
CI don't run check_manifest on forks (#22729)
jeremiedbb Mar 8, 2022
a139b20
ENH Adds feature_names_out to stacking estimators (#22695)
thomasjpfan Mar 8, 2022
d17594c
ENH Add get_feature_names_out for RandomTreesEmbedding module (#21762)
MaxwellLZH Mar 8, 2022
03572e3
ENH Add feature_names_out to voting estimators (#22697)
thomasjpfan Mar 8, 2022
3def6a1
DOC Update notebook-style example plot_affinity_propagation (#22559)
AmarCodes-22 Mar 9, 2022
c6b2e98
MNT Refactor KMeans and MiniBatchKMeans to inherit from a common base…
jeremiedbb Mar 9, 2022
4094c09
FIX Remove validation from __init__ and set_params for ColumnTransfor…
iofall Mar 9, 2022
085e6c4
DOC: use notebook-style for plot_mean_shift.py (#22713)
gaborberei Mar 9, 2022
3e0acd8
DOC Fix typo in _kmeans.py (#22736)
w4bo Mar 9, 2022
836caae
DOC Ensures that function passes numpydoc validation: f1_score (#22358)
NumberPiOso Mar 9, 2022
8271a22
FIX Fixes KBinsDiscretizer for encode=ordinal (#22735)
thomasjpfan Mar 9, 2022
e2c3c0a
Add Blog to top menu bar (#22737)
francoisgoupil Mar 9, 2022
65245bf
DOC Ensures that sklearn.metrics._ranking.average_precision_score pas…
danifernandes-hub Mar 10, 2022
770f12c
DOC Increase execution speed of plot_cluster_comparison.py (#21624)
Iglesys347 Mar 10, 2022
205a610
FIX Correct fac multiplier in LDA (#22696)
Micky774 Mar 10, 2022
8f6464a
DOC Ensures ledoit_wolf_shrinkage passes numpydoc (#22748)
AmarCodes-22 Mar 10, 2022
90ed6dc
DOC make plot_permutation_importance example run on VS code (#22752)
francoisgoupil Mar 10, 2022
40851ce
MAINT Refactor the common logic for GEMM in wrapper (#22719)
jjerphan Mar 11, 2022
c519cc3
CLN clean _preprocess_data in linear_model (#22762)
lorentzenchr Mar 11, 2022
c802a53
TST check sparse dense equality for Lasso and ElasticNet (#22767)
lorentzenchr Mar 11, 2022
cf1b2a0
DOC Fix link to the minimal reproducible example (#22772)
jeremiedbb Mar 11, 2022
54432ce
Simplify conda installation. (#22771)
cmarmo Mar 11, 2022
4c01ad9
Fixed sklearn.model_selection._split.check_cv docstring in #21350 (#2…
chagaz Mar 12, 2022
256ccdf
DOC updated to notebook style for grid_search_text_feature_extraction…
brendo-k Mar 12, 2022
4e79672
fix docstring r-regression (#22785)
cdrig Mar 12, 2022
5dee1a6
DOC Ensures that paired_euclidean_distances passes numpydoc validatio…
verakye Mar 12, 2022
35061c9
DOC Fix docstring in sklearn.metrics._ranking.label_ranking_loss (#22…
fatimazahraegasmi Mar 12, 2022
fbb8832
DOC Fix docstring in quantile_transform function #21350 (#22780)
sakinaOuisrani Mar 12, 2022
f7f31c1
add documentation to haversine_distances (#22791)
marenwestermann Mar 12, 2022
d504561
fix(doc): fix sklearn.linear_model._ridge.ridge_regression (#22788)
MarieSacksick Mar 12, 2022
57c7063
DOC adding numpydoc to the development dependencies (#22792)
MarieSacksick Mar 12, 2022
049db46
DOC make_regression sample_generator (#22784)
mathieu-sabatier Mar 12, 2022
22300db
DOC Ensures that check_random_state passes numpydoc validation (#22787)
verakye Mar 12, 2022
b7cecf6
fix docstrings on preprocessing._data.normalize (#22795)
ducanne Mar 12, 2022
4c0ad08
chore(notebook_example): improve examples/cluster/plot_feature_agglom…
MarieSacksick Mar 12, 2022
af6b957
fix docstring of dict_learning.sparse_encode and multiclass.check_cl…
sakinaOuisrani Mar 12, 2022
a4f6b8a
Changing docstring for binzrize (#22801)
magalimorin18 Mar 12, 2022
902bb9c
Improved display of function docstring (#21247)
uditgt Mar 13, 2022
eb4be0c
DOC Use notebook style for plot_lasso_dense_vs_sparse_data (#22789)
chagaz Mar 13, 2022
5e5a6d9
DOC changed some typo of _shrunk_covariance.ledoit_wolf_shrinkage (#2…
victoirelouis Mar 13, 2022
260ed5d
fix sklearn.datasets._samples_generator.make_multilabel_classificatio…
mathieu-sabatier Mar 13, 2022
1b1e8f8
DOC fix numpydoc errors in classification_report (#22803)
WeijiaDu Mar 13, 2022
2c71eb6
DOC fix the docstring of sklearn.datasets._samples_generator.make_bic…
cdrig Mar 13, 2022
504e1de
FIX DBSCAN and TSNE are missing the pairwise estimator tag (#22814)
Mar 13, 2022
ed87b21
DOC fix typo in contributing guide (#22815)
ahmadjubair33 Mar 13, 2022
103e89d
MNT Replace if_delegate_has_method with available_if in feature_selec…
jackzyliu Mar 13, 2022
9a11c6d
TST Replaces pytest.warns in test_affinity_propagation (#22819)
ShanDeng123 Mar 13, 2022
3fb76a5
FIX Fixed `self.n_components` typo in `kernal_pca` (#22812)
Micky774 Mar 13, 2022
8e370c2
DOC: document classification example more readable (#22820)
GaelVaroquaux Mar 14, 2022
0926dfa
TST Convert warnings into errors in test_affinity_propgation (#22824)
thomasjpfan Mar 14, 2022
f9e036d
TST introducing the random_seed fixture (#22749)
ogrisel Mar 14, 2022
91ece8b
MNT Replace if_delegate_has_method with available_if in ensemble and …
jackzyliu Mar 14, 2022
86d9cbc
DOC corrected docstring on make_classification (#22797)
DeaMariaLeon Mar 14, 2022
346ddd9
ENH Add inverse_transform to random projection transformers (#21701)
ageron Mar 14, 2022
511d232
add num_threads in kmeans init_bounds (#22773)
jeremiedbb Mar 14, 2022
77ebe59
ENH Adds infrequent categories to OneHotEncoder (#16018)
thomasjpfan Mar 14, 2022
630cfb5
MNT Remove utf-8 encoding declarations (#21260)
DimitriPapadopoulos Mar 14, 2022
7362031
FIX Fixes visualization for nested meta-estimators (#21310)
thomasjpfan Mar 14, 2022
55eeb02
TST Replaces pytest.warns(None) in test_optics (#22831)
ShanDeng123 Mar 14, 2022
657c132
DOC Fix wrong link title for PCA in "Dim. red." (#22835)
jchazalon Mar 14, 2022
73b7795
TST Replaces pytest.warns(None) in test_pls (#22832)
ShanDeng123 Mar 15, 2022
1797108
DOC Fix the formatting for environment variables in docs (#22833)
ogrisel Mar 15, 2022
3771cbb
TST Replaces pytest.warns(None) in test_fastica (#22846)
ShanDeng123 Mar 15, 2022
be5c4d8
TST Replaces pytest.warns(None) in test_dict_learning (#22845)
ShanDeng123 Mar 15, 2022
c000059
TST Fixes global random seed with pytest-xdist (#22844)
thomasjpfan Mar 15, 2022
8fd4606
FIX Fix ColumnTransformer.get_feature_names_out with slices (#22775)
randomgeek78 Mar 15, 2022
cacd2a9
TST ensure that sklearn/_loss/tests/test_loss.py is seed insensitive …
ogrisel Mar 15, 2022
8abdbd7
DOC: Fix latex \max and not max in model selection (#22858)
agramfort Mar 16, 2022
7407d56
MNT Some clean-up in the random_projection module (#22761)
jeremiedbb Mar 16, 2022
f62a47a
ENH avoid unecessary memory copy in pdp (#21930)
glemaitre Mar 16, 2022
2a4f964
DOC Clarify the LS term in example (#22156)
JorgeFCS Mar 16, 2022
0001f65
TST Ensure that `sklearn/metrics/tests/test_pairwise_distances_reduct…
jjerphan Mar 17, 2022
eeab3d3
Removing pytest.warns(None) (#22873)
ShanDeng123 Mar 17, 2022
afe78e6
MAINT Fix plot_gallery warning when building docs (#22869)
thomasjpfan Mar 17, 2022
fb6b8ef
DOC update notebook-style example plot_cv_diabetes.py (#22740)
AmarCodes-22 Mar 17, 2022
5dd2372
MAINT Import from public SciPy in pubilc namespace (#22875)
thomasjpfan Mar 17, 2022
37633e0
DOC update notebook-style for plot_calibration.py (#22734)
AmarCodes-22 Mar 17, 2022
fb7135e
TST removing pytest.warns(None) in test_kernel_pca (#22872)
ShanDeng123 Mar 17, 2022
9ae357d
TST replace pytest.warns(None) in linear_model test_base.py (#22876)
glemaitre Apr 6, 2022
94a10ba
MAINT Open issue on tracker for pypy errors (#22870)
thomasjpfan Mar 17, 2022
787d23d
TST Replaces pytest.warns(None) in test_feature_agglomeration (#22871)
ShanDeng123 Mar 17, 2022
43b04b2
DOC Update notebook-style for example plot_image_denoising (#22739)
AmarCodes-22 Mar 17, 2022
25c307b
TST Add minimal setup to be able to run test suite on float32 (#22690)
jjerphan Mar 17, 2022
41cd366
API Deprecate if_delegate_has_method (#22830)
jeremiedbb Mar 17, 2022
80c2da1
MAINT `PairwiseDistancesReduction`: Do correctly warn on unused metri…
jjerphan Mar 17, 2022
0915b71
TST replace pytest.warns(None) in test_coordinate_descent.py (#22878)
richardt94 Mar 17, 2022
56b1874
Update plot_rbf_parameters.py (#22724)
zempleni Mar 17, 2022
5f1198f
DOC Use proper tags for get_feature_names_out in whats_new (#22883)
thomasjpfan Mar 17, 2022
abade87
MAINT Create a private extension for sorting utilities (#22760)
jjerphan Mar 17, 2022
aa3f2e8
TST Replace pytest.warns(None) in test_omp.py (#22886)
richardt94 Mar 18, 2022
6c5115b
MNT Removes externals._pilutil and uses Pillow directly (#22743)
thomasjpfan Mar 18, 2022
30fceba
DOC accelerate plot_kernel_ridge_regression.py (#21791)
melemo2 Mar 18, 2022
aaca3af
DOC use notebook-style for plot_bayesian_ridge.py (#22794)
fkaren27 Mar 18, 2022
6535407
format notebook plot_roc_crossval.py (#22799)
cdrig Mar 18, 2022
55b0cf3
DOC use notebook-style for plot_svm_anova.py (#22779)
JihaneBennis Mar 18, 2022
76abb9f
replace pytest.warns(None) in test_least_angle.py (#22889)
richardt94 Mar 18, 2022
4024241
TST remove pytest.warns(None) in test_logistic.py (#22877)
richardt94 Mar 18, 2022
26eadbf
TST Removes pytest.warns(None) in test_iforest (#22874)
ShanDeng123 Mar 18, 2022
fc914cb
DOC update notebook style for plot_lda_qda (#22528)
bijilsubhash Mar 18, 2022
30c0c2e
DOC use notebook-style for plot_sparse_cov.py (#22807)
fkaren27 Mar 18, 2022
905aa1a
FIX Fixes OneVsOneClassifier.predict for Estimators with only predict…
thomasjpfan Mar 18, 2022
a4704b4
FIX LinearRegression sparse + intercept + sample_weight (#22891)
jeremiedbb Mar 18, 2022
e29ea50
MNT remove sparse_lsqr from utils.fixes (#22894)
lorentzenchr Mar 18, 2022
94d7adc
DOC convert examples/cluster/plot_mini_batch_kmeans.py to notebook st…
jsilke Mar 19, 2022
28c99e6
API get_scorer returns a copy and introduce get_scorer_names (#22866)
adrinjalali Mar 19, 2022
fe6e5a7
ENH Adds better error message for GitHub in html repr (#22902)
thomasjpfan Mar 19, 2022
d5a32fa
DOC, MNT Typos found by codespell (#22906)
DimitriPapadopoulos Mar 20, 2022
5ea3eb1
MNT spelling fix (#22912)
DimitriPapadopoulos Mar 21, 2022
dc3a908
API Config: change default display to "diagram" (#22856)
jeremiedbb Mar 21, 2022
a5909c8
ENH Use simultaenous sort in tree splitter (#22868)
thomasjpfan Mar 21, 2022
eb7b0b1
DOC fetch_california_housing passes numpydoc validation (#22882)
DeaMariaLeon Mar 21, 2022
d2ebdf7
DOC Ensures that load_wine passes numpydoc (#22469)
sharmadharmpal Mar 21, 2022
b6e2ff9
DOC adds two dots from datasets._base.load_sample_image (#22805)
victoirelouis Mar 21, 2022
30aeff5
DOC Ensure that ledoit_wolf is passing numpydoc validation (#22496)
kungfudeuce Mar 21, 2022
8edf647
DOC Ensures that preprocessing._data.power_transform passes numpydoc …
ducanne Mar 21, 2022
7ba9a49
Doc make make_sparse_coded_signal pass numpydoc (#22817)
victoirelouis Mar 21, 2022
6f370f7
DOC Ensures that sklearn.metrics._plot.confusion_matrix.plot_confusio…
danifernandes-hub Mar 22, 2022
83b0641
DOC make min_max_axis pass numpydoc (#22839)
cdrig Mar 22, 2022
6b9b35f
ENH Allow `SelectFromModel`'s `max_features` to accept callables (#22…
Micky774 Mar 22, 2022
0ad4a7d
Update plot_label_propagation_digits.py (#22725)
zempleni Mar 22, 2022
01ce0ae
DOC Update plot_label_propagation_structure.py to notebook style (#22…
zempleni Mar 22, 2022
4b0f058
MNT Replace pytest.warns(None) in test_ridge.py (#22917)
richardt94 Mar 22, 2022
a99e4f9
API Add data_transposed argument and warning to make_sparse_coded_sig…
g4brielvs Mar 22, 2022
76ac5dd
MAINT use the default CPU_COUNT=2 for the macOS builds (#22919)
ogrisel Mar 22, 2022
d096e0b
MNT Clean deprecation of dtype='numeric' + array of strings in check_…
jeremiedbb Mar 22, 2022
54d57e1
Fix Ridge sparse + sample_weight + intercept (#22899)
jeremiedbb Mar 22, 2022
09d7c3f
DOC make fetch_covtype pass numpydoc (#22918)
DeaMariaLeon Mar 22, 2022
3eacbcc
DOC improve phrasing precision recall (#22924)
Kaminyou Mar 23, 2022
cd95bf9
ENH Adds encoded_missing_value to OrdinalEncoder (#21988)
thomasjpfan Mar 23, 2022
c041b15
API Deprecate max_feature=`auto` for tree classes (#22476)
MaxwellLZH Mar 23, 2022
8f8992e
TST Fix test failing scipy nightly (#22935)
jeremiedbb Mar 24, 2022
b5d7c62
TST replace pytest.warns(None) in metrics/test_classification.py (#22…
danifernandes-hub Mar 24, 2022
2bd4508
BLD Monkeypatch windows build to stablize build (#22693)
thomasjpfan Mar 24, 2022
200466f
FIX PowerTransformer Yeo-Johnson auto-tuning on significantly non-Gau…
thomasjpfan Mar 24, 2022
1a8c291
TST removed pytest.warns(None) in test_data.py (#22938)
danifernandes-hub Mar 24, 2022
3a053e6
TST replace pytest.warns(None) in test_function_transformer.py (#22937)
danifernandes-hub Mar 24, 2022
bc15a31
MNT fix typo in tree test name (#22943)
lorentzenchr Mar 24, 2022
bd6e198
[MAINT] Separate unit tests in `test_tree.py` for pickling and min_im…
adam2392 Mar 25, 2022
b38f955
FIX Removes warning in HGBT when fitting on dataframes (#22908)
thomasjpfan Mar 25, 2022
601a8df
ENH add sample_weight to sparse coordinade descent (#22808)
lorentzenchr Mar 25, 2022
3f9c7b5
FIX Fix recall in multilabel classification when true labels are all …
varunagrawal Mar 25, 2022
75d9f34
DOC Update comms team (#22942)
GaelVaroquaux Mar 25, 2022
db1a0e6
TST use global_dtype in feature_selection/tests/test_mutual_info.py …
jjerphan Mar 25, 2022
7f8c1dd
DOC Makes Sphinx reference to Bunch a class (#22948)
thomasjpfan Mar 25, 2022
8c31ddc
DOC fix typo (#22958)
JosephSM Mar 26, 2022
089608c
TST replace pytest.warns(None) in preprocessing/test_common.py (#22936)
danifernandes-hub Mar 26, 2022
d3d0dc1
TST replace pytest.warns(None) in metrics/tests/test_pairwise.py (#22…
danifernandes-hub Mar 26, 2022
e8a95fe
TST Replace pytest.warns(None) in test_gpr.py (#22959)
TheisFerre Mar 26, 2022
ae57788
TST replace pytest.warns(None) in metrics/cluster/test_unsupervised.p…
danifernandes-hub Mar 27, 2022
cdadbd7
TST replace pytest.warns(None) in metrics/cluster/test_supervised.py …
danifernandes-hub Mar 27, 2022
f5bdd24
TST Replace pytest.warns(None) in test_gpc.py (#22960)
TheisFerre Mar 27, 2022
ad53c73
TST Replace pytest.warns(None) in utils/tests/test_utils.py (#22961)
danifernandes-hub Mar 27, 2022
1a40697
TST Replace pytest.warns(None) in manifold/tests/test_t_sne.py (#22963)
yiyangq Mar 27, 2022
49b353b
TST Replace pytest.warns(None) in feature_extraction/tests/test_text.…
yiyangq Mar 27, 2022
b9d0c8c
FIX make coef_ in PLS estimator consistent with linear models (#22016)
glemaitre Mar 28, 2022
d4c03d5
ENH Preserving dtypes for ICA (#22806)
JihaneBennis Mar 28, 2022
b624e9f
ENH migrate GLMs / TweedieRegressor to linear loss (#22548)
lorentzenchr Mar 28, 2022
d5dcd4b
MNT Update to black 22.3.0 to resolve click error (#22983)
thomasjpfan Mar 29, 2022
fabe7c4
FEA Add DecisionBoundaryDisplay (#16061)
thomasjpfan Mar 29, 2022
cf6d47b
MNT accelerate examples/kernel_approximation/plot_scalable_poly_kerne…
jsilke Mar 29, 2022
f6932e7
MAINT Refactor vector sentinel into utils (#22728)
thomasjpfan Mar 29, 2022
f270c8b
DOC Fixes nav bar by dynamically changing searchbar size (#22954)
thomasjpfan Mar 29, 2022
b779407
MNT Uses memoryviews in tree criterion (#22921)
thomasjpfan Mar 29, 2022
8a829f0
DOC no longer funded by sydney university (#22980)
jnothman Mar 29, 2022
7a40462
ENH enable LSQR solver with intercept term in Ridge with sparse input…
lorentzenchr Mar 29, 2022
70a0785
DOC Link directly developer docs in the navbar (#22550)
thomasjpfan Mar 29, 2022
86e18a8
[MRG] Refactor MiniBatchDictionaryLearning and add stopping criterion…
jeremiedbb Mar 30, 2022
cf4296e
DOC: fix typo (#22994)
Mar 30, 2022
2429258
TST use global_dtype in sklearn/neighbors/tests/test_neighbors.py (#2…
jjerphan Mar 30, 2022
a4db3ac
DOC Update notebook style for plot_bayesian_ridge_curvefit (#22916)
2357juan Mar 30, 2022
f2b73c2
DOC fix docstring of EllipticEnvelope.fit (#22997)
Ben3940 Mar 30, 2022
34addd1
MNT Fix pytest random seed pluging with vscode test discovery (#22976)
adrinjalali Mar 30, 2022
e29a6ef
TST tight and clean tests for Ridge (#22910)
lorentzenchr Mar 31, 2022
f0ce884
DOC Switch to gender neutral terms for sister function (#23003)
inclusive-coding-bot Mar 31, 2022
37985a8
FIX ColumnTransformer.get_feature_names_out with string slices (#22913)
randomgeek78 Mar 31, 2022
0d90b45
Rename triage team to contributor experience team (#22970)
jeremiedbb Mar 31, 2022
9c17d0b
DOC Ensures that homogeneity_score passes numpydoc validation (#23006)
aj-white Mar 31, 2022
b5870a9
CI use circleci artifact redirector GH action (#22991)
jeremiedbb Apr 1, 2022
820bf63
MNT remove artifact_path file now unused (#23012)
jeremiedbb Apr 1, 2022
72e8d48
MAINT Convert OpenMP scheduling to 'static' in pairwise distances rad…
jjerphan Apr 1, 2022
eaacba6
DOC precise stopping criteria for coordinate descent
lorentzenchr Apr 1, 2022
9726d16
MNT Revert "DOC precise stopping criteria for coordinate descent"
lorentzenchr Apr 1, 2022
d228fd2
TST use global_dtype in sklearn/cluster/tests/test_optics.py (#22668)
jjerphan Apr 1, 2022
d88d886
TST use global_dtype in sklearn/manifold/tests/test_locally_linear.py…
jjerphan Apr 1, 2022
a4220d5
DOC Ensure completeness_score passes numpydoc validation (#23016)
aj-white Apr 1, 2022
78554d7
MNT ensure creation of dataset is deterministic in SGD (#19716)
PierreAttard Apr 2, 2022
988c811
TST replace pytest.warns(None) in test_label_propagation.py (#23010)
Ben3940 Apr 4, 2022
95a1808
TST Replace pytest.warns(None) in test_feature_select (#23041)
iasoon Apr 4, 2022
285db1b
TST remove pytest.warns(None) in test_svm.py (#23030)
iasoon Apr 4, 2022
0085147
TST remove pytest.warns(None) in utils/tests/test_validation.py (#23029)
MegaGonz Apr 4, 2022
e74b8a2
DOC Ensures that laplacian_kernel passes numpydoc validation (#23005)
gustavo-ren Apr 4, 2022
9bc0637
DOC Updates the MAxAbsScaler description in scalers example (#22951)
chalmerlowe Apr 4, 2022
66d659f
DOC Fixing a documentation issue on SVC parameter decision_function_s…
mehrdadmoradii Apr 5, 2022
5b561c9
FIX Feature Union: Checking if feautre union is fitted fails (#22953)
randomgeek78 Apr 5, 2022
c7809b2
CI Increases test time for pypy [pypy] (#23049)
thomasjpfan Apr 5, 2022
38c792d
DOC use notebook-style for plot_theilsen (#23002)
arthurmello Apr 6, 2022
d3fbe85
Merge remote-tracking branch 'origin/main' into pandas_arff_reader
glemaitre Apr 6, 2022
acc6b2a
iter
glemaitre Apr 6, 2022
257fc15
iter
glemaitre Apr 6, 2022
54daca0
change to warning
glemaitre Apr 6, 2022
23ced6b
update test_fetch_openml_consistency_parser
ogrisel Apr 6, 2022
b36ea90
Finer assertions in test_fetch_openml_consistency_parser
ogrisel Apr 7, 2022
94c9683
Trigger [doc build]
ogrisel Apr 7, 2022
d668bac
Merge remote-tracking branch 'origin/main' into pandas_arff_reader
glemaitre Apr 26, 2022
b743729
Merge remote-tracking branch 'origin/main' into pandas_arff_reader
glemaitre Apr 27, 2022
652d464
Apply suggestions from code review
glemaitre Apr 27, 2022
14a2890
Merge remote-tracking branch 'origin/main' into pandas_arff_reader
glemaitre Apr 29, 2022
781cc1c
rename variable for consistency
glemaitre Apr 29, 2022
f30fb32
iter
glemaitre Apr 29, 2022
0adfe89
iter
glemaitre Apr 29, 2022
0f3fbe1
Merge remote-tracking branch 'glemaitre/pandas_arff_reader' into pand…
glemaitre Apr 29, 2022
e9f4dfc
fix
glemaitre Apr 29, 2022
4638469
iter
glemaitre Apr 29, 2022
b50cc07
Apply suggestions from code review
glemaitre May 2, 2022
6c71a0f
Update sklearn/datasets/_openml.py
glemaitre May 2, 2022
046e4e3
Update sklearn/datasets/tests/test_openml.py
glemaitre May 2, 2022
61ed115
iter
glemaitre May 2, 2022
58f7c6b
iter
glemaitre May 2, 2022
c835c11
Merge branch 'main' into pandas_arff_reader
glemaitre May 2, 2022
ae94ef6
remove _cast_frame
glemaitre May 2, 2022
c753e3b
update asv benchmark
glemaitre May 2, 2022
9ac3ad7
update lof and isolation forest bench
glemaitre May 2, 2022
8a0cab3
update mnist bench
glemaitre May 2, 2022
8a5c14d
update bench randomized svd
glemaitre May 2, 2022
67c06a7
update tsne bench
glemaitre May 2, 2022
e4afde4
add more documentation
glemaitre May 2, 2022
8fa26df
update all examples
glemaitre May 2, 2022
18859a2
Update sklearn/datasets/_openml.py
glemaitre May 3, 2022
5493907
Update sklearn/datasets/_openml.py
glemaitre May 3, 2022
60c3881
fix encoding
glemaitre May 3, 2022
1a68491
better doc
glemaitre May 3, 2022
5a71cd1
add new test for parser difference
glemaitre May 3, 2022
9ab3c74
Merge branch 'main' into pandas_arff_reader
glemaitre May 4, 2022
1d94eb2
Apply suggestions from code review
glemaitre May 12, 2022
90898ca
Update sklearn/datasets/tests/test_openml.py
glemaitre May 12, 2022
961b581
Merge remote-tracking branch 'origin/main' into pandas_arff_reader
glemaitre May 12, 2022
a599a2f
change version
glemaitre May 12, 2022
e010572
Merge remote-tracking branch 'origin/main' into pandas_arff_reader
glemaitre May 12, 2022
a8add98
Update doc/datasets/loading_other_datasets.rst
glemaitre May 12, 2022
Changes from all commits
4 changes: 3 additions & 1 deletion asv_benchmarks/benchmarks/datasets.py
@@ -59,7 +59,9 @@ def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), dtype=np.float

@M.cache
def _mnist_dataset(dtype=np.float32):
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
X, y = fetch_openml(
"mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
)
X = X.astype(dtype, copy=False)
X = MaxAbsScaler().fit_transform(X)

23 changes: 18 additions & 5 deletions benchmarks/bench_hist_gradient_boosting_adult.py
@@ -2,12 +2,15 @@
from time import time

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble._hist_gradient_boosting.utils import get_equivalent_estimator
from sklearn.preprocessing import OrdinalEncoder


parser = argparse.ArgumentParser()
@@ -47,22 +50,32 @@ def predict(est, data_test, target_test):
print(f"predicted in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")


data = fetch_openml(data_id=179, as_frame=False) # adult dataset
data = fetch_openml(data_id=179, as_frame=True, parser="pandas") # adult dataset
X, y = data.data, data.target

# Ordinal encode the categories to use the native support available in HGBDT
cat_columns = make_column_selector(dtype_include="category")(X)
preprocessing = make_column_transformer(
(OrdinalEncoder(), cat_columns),
remainder="passthrough",
verbose_feature_names_out=False,
)
X = pd.DataFrame(
preprocessing.fit_transform(X),
columns=preprocessing.get_feature_names_out(),
)

n_classes = len(np.unique(y))
n_features = X.shape[1]
n_categorical_features = len(data.categories)
n_categorical_features = len(cat_columns)
n_numerical_features = n_features - n_categorical_features
print(f"Number of features: {n_features}")
print(f"Number of categorical features: {n_categorical_features}")
print(f"Number of numerical features: {n_numerical_features}")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Note: no need to use an OrdinalEncoder because categorical features are
# already clean
is_categorical = [name in data.categories for name in data.feature_names]
is_categorical = [True] * n_categorical_features + [False] * n_numerical_features
est = HistGradientBoostingClassifier(
loss="log_loss",
learning_rate=lr,
4 changes: 2 additions & 2 deletions benchmarks/bench_isolation_forest.py
@@ -64,9 +64,9 @@ def print_outlier_ratio(y):
y = dataset.target

if dat == "shuttle":
dataset = fetch_openml("shuttle")
dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
X = dataset.data
y = dataset.target
y = dataset.target.astype(np.int64)
X, y = sh(X, y, random_state=random_state)
# we remove data with label 4
# normal data are then those of class 1
4 changes: 2 additions & 2 deletions benchmarks/bench_lof.py
@@ -44,9 +44,9 @@
y = dataset.target

if dataset_name == "shuttle":
dataset = fetch_openml("shuttle")
dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
X = dataset.data
y = dataset.target
y = dataset.target.astype(np.int64)
# we remove data with label 4
# normal data are then those of class 1
s = y != 4
2 changes: 1 addition & 1 deletion benchmarks/bench_mnist.py
@@ -62,7 +62,7 @@ def load_data(dtype=np.float32, order="F"):
######################################################################
# Load dataset
print("Loading dataset...")
data = fetch_openml("mnist_784")
data = fetch_openml("mnist_784", as_frame=True, parser="pandas")
X = check_array(data["data"], dtype=dtype, order=order)
y = data["target"]

8 changes: 4 additions & 4 deletions benchmarks/bench_plot_randomized_svd.py
@@ -191,7 +191,7 @@ def get_data(dataset_name):
del row
del col
else:
X = fetch_openml(dataset_name).data
X = fetch_openml(dataset_name, parser="auto").data
return X


@@ -281,9 +281,9 @@ def svd_timing(
U, mu, V = randomized_svd(
X,
n_comps,
n_oversamples,
n_iter,
power_iteration_normalizer,
n_oversamples=n_oversamples,
n_iter=n_iter,
power_iteration_normalizer=power_iteration_normalizer,
random_state=random_state,
transpose=False,
)
2 changes: 1 addition & 1 deletion benchmarks/bench_tsne_mnist.py
@@ -35,7 +35,7 @@
def load_data(dtype=np.float32, order="C", shuffle=True, seed=0):
"""Load the data, then cache and memmap the train/test split"""
print("Loading dataset...")
data = fetch_openml("mnist_784")
data = fetch_openml("mnist_784", as_frame=True, parser="pandas")

X = check_array(data["data"], dtype=dtype, order=order)
y = data["target"]
55 changes: 47 additions & 8 deletions doc/datasets/loading_other_datasets.rst
@@ -99,7 +99,7 @@ from the repository using the function
For example, to download a dataset of gene expressions in mice brains::

>>> from sklearn.datasets import fetch_openml
>>> mice = fetch_openml(name='miceprotein', version=4)
>>> mice = fetch_openml(name='miceprotein', version=4, parser="auto")

To fully specify a dataset, you need to provide a name and a version, though
the version is optional, see :ref:`openml_versions` below.
@@ -147,7 +147,7 @@ dataset on the openml website::

The ``data_id`` also uniquely identifies a dataset from OpenML::

>>> mice = fetch_openml(data_id=40966)
>>> mice = fetch_openml(data_id=40966, parser="auto")
>>> mice.details # doctest: +SKIP
{'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
'creator': ...,
@@ -171,8 +171,8 @@ which can contain entirely different datasets.
If a particular version of a dataset has been found to contain significant
issues, it might be deactivated. Using a name to specify a dataset will yield
the earliest version of a dataset that is still active. That means that
``fetch_openml(name="miceprotein")`` can yield different results at different
times if earlier versions become inactive.
``fetch_openml(name="miceprotein", parser="auto")`` can yield different results
at different times if earlier versions become inactive.
You can see that the dataset with ``data_id`` 40966 that we fetched above is
the first version of the "miceprotein" dataset::

@@ -182,19 +182,19 @@ the first version of the "miceprotein" dataset::
In fact, this dataset only has one version. The iris dataset on the other hand
has multiple versions::

>>> iris = fetch_openml(name="iris")
>>> iris = fetch_openml(name="iris", parser="auto")
>>> iris.details['version'] #doctest: +SKIP
'1'
>>> iris.details['id'] #doctest: +SKIP
'61'

>>> iris_61 = fetch_openml(data_id=61)
>>> iris_61 = fetch_openml(data_id=61, parser="auto")
>>> iris_61.details['version']
'1'
>>> iris_61.details['id']
'61'

>>> iris_969 = fetch_openml(data_id=969)
>>> iris_969 = fetch_openml(data_id=969, parser="auto")
>>> iris_969.details['version']
'3'
>>> iris_969.details['id']
@@ -212,7 +212,7 @@ binarized version of the data::
You can also specify both the name and the version, which also uniquely
identifies the dataset::

>>> iris_version_3 = fetch_openml(name="iris", version=3)
>>> iris_version_3 = fetch_openml(name="iris", version=3, parser="auto")
>>> iris_version_3.details['version']
'3'
>>> iris_version_3.details['id']
@@ -225,6 +225,45 @@ identifies the dataset::
machine learning" ACM SIGKDD Explorations Newsletter, 15(2), 49-60, 2014.
<1407.7722>`

.. _openml_parser:

ARFF parser
~~~~~~~~~~~

From version 1.2, scikit-learn provides a new keyword argument `parser` that
selects how the ARFF files provided by OpenML are parsed. The legacy
parser (i.e. `parser="liac-arff"`) is based on the project
`LIAC-ARFF <https://github.com/renatopp/liac-arff>`_. This parser is, however,
slow and consumes more memory than required. A new parser based on pandas
(i.e. `parser="pandas"`) is both faster and more memory efficient.
However, this parser does not support sparse data.
Therefore, we recommend using `parser="auto"`, which uses the best parser
available for the requested dataset.
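
For example, a minimal usage sketch (an illustration only, assuming network
access to OpenML; `data_id=179` is the adult dataset used in the benchmarks
updated by this pull request)::

    from sklearn.datasets import fetch_openml

    # "auto" uses the pandas-based parser for dense datasets and falls back
    # to the LIAC-ARFF parser for sparse ones.
    adult = fetch_openml(data_id=179, as_frame=True, parser="auto")
    print(adult.frame.dtypes)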

The `"pandas"` and `"liac-arff"` parsers can lead to different data types in
the output. The notable differences are the following, and are illustrated in
the sketch after this list:

- The `"liac-arff"` parser always encodes categorical features as `str`
  objects. In contrast, the `"pandas"` parser infers the type while reading,
  and numerical categories are cast to integers whenever possible.
- The `"liac-arff"` parser uses float64 to encode numerical features tagged as
  'REAL' and 'NUMERICAL' in the metadata. The `"pandas"` parser instead infers
  whether these numerical features correspond to integers and, if so, uses
  pandas' Integer extension dtype.
- In particular, classification datasets with integer categories are typically
  loaded as such `(0, 1, ...)` with the `"pandas"` parser, while `"liac-arff"`
  forces the use of string-encoded class labels such as `"0"`, `"1"`, and so
  on.
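
A quick way to observe these differences is to load the same dataset with both
parsers and compare the resulting dtypes. The sketch below uses the iris
dataset (`data_id=61`, introduced above); the exact dtypes observed depend on
the dataset::

    from sklearn.datasets import fetch_openml

    iris_liac = fetch_openml(data_id=61, as_frame=True, parser="liac-arff")
    iris_pandas = fetch_openml(data_id=61, as_frame=True, parser="pandas")

    # The numerical columns and the target column may be typed differently.
    print(iris_liac.frame.dtypes)
    print(iris_pandas.frame.dtypes)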

In addition, when `as_frame=False` is used, the `"liac-arff"` parser returns
ordinally encoded data where the categories are provided in the attribute
`categories` of the `Bunch` instance. Instead, the `"pandas"` parser returns a
NumPy array in which the categories are not encoded. It is then up to the user
to design a feature engineering pipeline with an instance of `OneHotEncoder`
or `OrdinalEncoder`, typically wrapped in a `ColumnTransformer`, to preprocess
the categorical columns explicitly, as in the sketch below. See for instance:
:ref:`sphx_glr_auto_examples_compose_plot_column_transformer_mixed_types.py`.
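
A minimal sketch of such a pipeline, modeled on the adult benchmark updated in
this pull request (it uses `as_frame=True` so that categorical columns can be
selected by dtype; the dataset id and column handling are illustrative only)::

    import pandas as pd

    from sklearn.compose import make_column_selector, make_column_transformer
    from sklearn.datasets import fetch_openml
    from sklearn.preprocessing import OrdinalEncoder

    # data_id=179 is the adult dataset used in the benchmarks of this PR.
    X, y = fetch_openml(
        data_id=179, as_frame=True, return_X_y=True, parser="pandas"
    )

    # The pandas parser keeps categorical columns as pandas categories, so we
    # encode them explicitly before handing the data to an estimator.
    cat_columns = make_column_selector(dtype_include="category")(X)
    preprocessing = make_column_transformer(
        (OrdinalEncoder(), cat_columns),
        remainder="passthrough",
        verbose_feature_names_out=False,
    )
    X_encoded = pd.DataFrame(
        preprocessing.fit_transform(X),
        columns=preprocessing.get_feature_names_out(),
    )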

.. _external_datasets:

Loading from external datasets
4 changes: 2 additions & 2 deletions doc/whats_new/v1.1.rst
@@ -26,7 +26,7 @@ Changelog
classifier that always predicts the positive class: recall=100% and
precision=class balance.
:pr:`23214` by :user:`Stéphane Collot <stephanecollot>` and :user:`Max Baak <mbaak>`.

:mod:`sklearn.utils`
....................

@@ -208,7 +208,7 @@ Changelog
:pr:`23194` by `Thomas Fan`_.

- |Enhancement| Added an extension in doc/conf.py to automatically generate
the list of estimators that handle NaN values.
the list of estimators that handle NaN values.
:pr:`23198` by `Lise Kleiber <lisekleiber>`_, :user:`Zhehao Liu <MaxwellLZH>`
and :user:`Chiara Marmo <cmarmo>`.

Expand Down
14 changes: 14 additions & 0 deletions doc/whats_new/v1.2.rst
@@ -44,6 +44,20 @@ Changelog
- |Enhancement| :class:`cluster.Birch` now preserves dtype for `numpy.float32`
inputs. :pr:`22968` by `Meekail Zain <micky774>`.

:mod:`sklearn.datasets`
.......................

- |Enhancement| Introduce the new parameter `parser` in
  :func:`datasets.fetch_openml`. `parser="pandas"` makes it possible to use the
  very CPU- and memory-efficient `pandas.read_csv` parser to load dense
  ARFF-formatted dataset files. It is still possible to pass
  `parser="liac-arff"` to use the old LIAC parser.
  When `parser="auto"`, dense datasets are loaded with `"pandas"` and sparse
  datasets are loaded with `"liac-arff"`.
  Currently, `parser="liac-arff"` is the default and will change to
  `parser="auto"` in version 1.4.
  :pr:`21938` by :user:`Guillaume Lemaitre <glemaitre>`.

:mod:`sklearn.ensemble`
.......................

4 changes: 3 additions & 1 deletion examples/applications/plot_cyclical_feature_engineering.py
@@ -20,7 +20,9 @@
# We start by loading the data from the OpenML repository.
from sklearn.datasets import fetch_openml

bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
bike_sharing = fetch_openml(
"Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
)
df = bike_sharing.frame

# %%
2 changes: 1 addition & 1 deletion examples/applications/plot_digits_denoising.py
@@ -36,7 +36,7 @@
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True)
X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True, parser="pandas")
X = MinMaxScaler().fit_transform(X)

# %%
4 changes: 3 additions & 1 deletion examples/compose/plot_column_transformer_mixed_types.py
@@ -43,7 +43,9 @@

# %%
# Load data from https://www.openml.org/d/40945
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
X, y = fetch_openml(
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)

# Alternatively X and y can be obtained directly from the frame attribute:
# X = titanic.frame.drop('survived', axis=1)
2 changes: 1 addition & 1 deletion examples/compose/plot_transformed_target.py
@@ -128,7 +128,7 @@
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import QuantileTransformer, quantile_transform

ames = fetch_openml(name="house_prices", as_frame=True)
ames = fetch_openml(name="house_prices", as_frame=True, parser="pandas")
# Keep only numeric columns
X = ames.data.select_dtypes(np.number)
# Remove columns with NaN or Inf values
2 changes: 1 addition & 1 deletion examples/ensemble/plot_gradient_boosting_categorical.py
@@ -30,7 +30,7 @@
# are either categorical or numerical:
from sklearn.datasets import fetch_openml

X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True, parser="pandas")

# Select only a subset of features of X to make the example faster to run
categorical_columns_subset = [
6 changes: 4 additions & 2 deletions examples/ensemble/plot_stack_predictors.py
@@ -45,7 +45,7 @@


def load_ames_housing():
df = fetch_openml(name="house_prices", as_frame=True)
df = fetch_openml(name="house_prices", as_frame=True, parser="pandas")
X = df.data
y = df.target

@@ -117,7 +117,9 @@ def load_ames_housing():
from sklearn.preprocessing import OrdinalEncoder

cat_tree_processor = OrdinalEncoder(
handle_unknown="use_encoded_value", unknown_value=-1
handle_unknown="use_encoded_value",
unknown_value=-1,
encoded_missing_value=-2,
)
num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)

2 changes: 1 addition & 1 deletion examples/gaussian_process/plot_gpr_co2.py
@@ -36,7 +36,7 @@
# in OpenML.
from sklearn.datasets import fetch_openml

co2 = fetch_openml(data_id=41187, as_frame=True)
co2 = fetch_openml(data_id=41187, as_frame=True, parser="pandas")
co2.frame.head()

# %%
@@ -46,7 +46,7 @@

from sklearn.datasets import fetch_openml

survey = fetch_openml(data_id=534, as_frame=True)
survey = fetch_openml(data_id=534, as_frame=True, parser="pandas")

# %%
# Then, we identify features `X` and targets `y`: the column WAGE is our
4 changes: 3 additions & 1 deletion examples/inspection/plot_permutation_importance.py
@@ -43,7 +43,9 @@
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
X, y = fetch_openml(
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
rng = np.random.RandomState(seed=42)
X["random_cat"] = rng.randint(3, size=X.shape[0])
X["random_num"] = rng.randn(X.shape[0])
@@ -56,7 +56,7 @@
from sklearn.datasets import fetch_openml


df = fetch_openml(data_id=41214, as_frame=True).frame
df = fetch_openml(data_id=41214, as_frame=True, parser="pandas").frame
df

# %%
2 changes: 1 addition & 1 deletion examples/linear_model/plot_sgd_early_stopping.py
@@ -59,7 +59,7 @@
def load_mnist(n_samples=None, class_0="0", class_1="8"):
"""Load MNIST, select two classes, shuffle and return only n_samples."""
# Load data from http://openml.org/d/554
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
mnist = fetch_openml("mnist_784", version=1, as_frame=False, parser="pandas")

# take only two classes for binary classification
mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)