MAINT Update fetch_openml to use the auto parser by default #27802

Merged: 3 commits, Nov 20, 2023
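In practice, this change lets callers drop the explicit parser argument. A minimal sketch of the usage the PR enables (assuming a scikit-learn release that includes this change, where the default parser becomes "auto"; the adult dataset id 179 is taken from the diff below):

    from sklearn.datasets import fetch_openml

    # With parser="auto" as the default, the argument no longer needs to be
    # passed explicitly; the two calls below are expected to be equivalent.
    adult = fetch_openml(data_id=179, as_frame=True)
    adult_explicit = fetch_openml(data_id=179, as_frame=True, parser="auto")
    # "auto" is expected to use the pandas-based parser when pandas is
    # installed and fall back to liac-arff otherwise.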
4 changes: 1 addition & 3 deletions asv_benchmarks/benchmarks/datasets.py
@@ -60,9 +60,7 @@ def _20newsgroups_lowdim_dataset(n_components=100, ngrams=(1, 1), dtype=np.float

@M.cache
def _mnist_dataset(dtype=np.float32):
X, y = fetch_openml(
"mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
)
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
X = X.astype(dtype, copy=False)
X = MaxAbsScaler().fit_transform(X)

2 changes: 1 addition & 1 deletion benchmarks/bench_hist_gradient_boosting_adult.py
@@ -49,7 +49,7 @@ def predict(est, data_test, target_test):
print(f"predicted in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")


data = fetch_openml(data_id=179, as_frame=True, parser="pandas") # adult dataset
data = fetch_openml(data_id=179, as_frame=True) # adult dataset
X, y = data.data, data.target

# Ordinal encode the categories to use the native support available in HGBDT
2 changes: 1 addition & 1 deletion benchmarks/bench_isolation_forest.py
@@ -64,7 +64,7 @@ def print_outlier_ratio(y):
y = dataset.target

if dat == "shuttle":
dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
dataset = fetch_openml("shuttle", as_frame=False)
X = dataset.data
y = dataset.target.astype(np.int64)
X, y = sh(X, y, random_state=random_state)
2 changes: 1 addition & 1 deletion benchmarks/bench_lof.py
@@ -46,7 +46,7 @@
y = dataset.target

if dataset_name == "shuttle":
dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
dataset = fetch_openml("shuttle", as_frame=False)
X = dataset.data
y = dataset.target.astype(np.int64)
# we remove data with label 4
2 changes: 1 addition & 1 deletion benchmarks/bench_mnist.py
@@ -60,7 +60,7 @@ def load_data(dtype=np.float32, order="F"):
######################################################################
# Load dataset
print("Loading dataset...")
data = fetch_openml("mnist_784", as_frame=True, parser="pandas")
data = fetch_openml("mnist_784", as_frame=True)
X = check_array(data["data"], dtype=dtype, order=order)
y = data["target"]

2 changes: 1 addition & 1 deletion benchmarks/bench_plot_randomized_svd.py
@@ -192,7 +192,7 @@ def get_data(dataset_name):
del row
del col
else:
X = fetch_openml(dataset_name, parser="auto").data
X = fetch_openml(dataset_name).data
return X


2 changes: 1 addition & 1 deletion benchmarks/bench_tsne_mnist.py
@@ -36,7 +36,7 @@
def load_data(dtype=np.float32, order="C", shuffle=True, seed=0):
"""Load the data, then cache and memmap the train/test split"""
print("Loading dataset...")
data = fetch_openml("mnist_784", as_frame=True, parser="pandas")
data = fetch_openml("mnist_784", as_frame=True)

X = check_array(data["data"], dtype=dtype, order=order)
y = data["target"]
14 changes: 7 additions & 7 deletions doc/datasets/loading_other_datasets.rst
@@ -99,7 +99,7 @@ from the repository using the function
For example, to download a dataset of gene expressions in mice brains::

>>> from sklearn.datasets import fetch_openml
>>> mice = fetch_openml(name='miceprotein', version=4, parser="auto")
>>> mice = fetch_openml(name='miceprotein', version=4)

To fully specify a dataset, you need to provide a name and a version, though
the version is optional, see :ref:`openml_versions` below.
@@ -147,7 +147,7 @@ dataset on the openml website::

The ``data_id`` also uniquely identifies a dataset from OpenML::

>>> mice = fetch_openml(data_id=40966, parser="auto")
>>> mice = fetch_openml(data_id=40966)
>>> mice.details # doctest: +SKIP
{'id': '4550', 'name': 'MiceProtein', 'version': '1', 'format': 'ARFF',
'creator': ...,
@@ -171,7 +171,7 @@ which can contain entirely different datasets.
If a particular version of a dataset has been found to contain significant
issues, it might be deactivated. Using a name to specify a dataset will yield
the earliest version of a dataset that is still active. That means that
``fetch_openml(name="miceprotein", parser="auto")`` can yield different results
``fetch_openml(name="miceprotein")`` can yield different results
at different times if earlier versions become inactive.
You can see that the dataset with ``data_id`` 40966 that we fetched above is
the first version of the "miceprotein" dataset::
@@ -182,19 +182,19 @@ the first version of the "miceprotein" dataset::
In fact, this dataset only has one version. The iris dataset on the other hand
has multiple versions::

>>> iris = fetch_openml(name="iris", parser="auto")
>>> iris = fetch_openml(name="iris")
>>> iris.details['version'] #doctest: +SKIP
'1'
>>> iris.details['id'] #doctest: +SKIP
'61'

>>> iris_61 = fetch_openml(data_id=61, parser="auto")
>>> iris_61 = fetch_openml(data_id=61)
>>> iris_61.details['version']
'1'
>>> iris_61.details['id']
'61'

>>> iris_969 = fetch_openml(data_id=969, parser="auto")
>>> iris_969 = fetch_openml(data_id=969)
>>> iris_969.details['version']
'3'
>>> iris_969.details['id']
@@ -212,7 +212,7 @@ binarized version of the data::
You can also specify both the name and the version, which also uniquely
identifies the dataset::

>>> iris_version_3 = fetch_openml(name="iris", version=3, parser="auto")
>>> iris_version_3 = fetch_openml(name="iris", version=3)
>>> iris_version_3.details['version']
'3'
>>> iris_version_3.details['id']
4 changes: 1 addition & 3 deletions examples/applications/plot_cyclical_feature_engineering.py
@@ -20,9 +20,7 @@
# We start by loading the data from the OpenML repository.
from sklearn.datasets import fetch_openml

bike_sharing = fetch_openml(
"Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas"
)
bike_sharing = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
df = bike_sharing.frame

# %%
2 changes: 1 addition & 1 deletion examples/applications/plot_digits_denoising.py
@@ -37,7 +37,7 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True, parser="pandas")
X, y = fetch_openml(data_id=41082, as_frame=False, return_X_y=True)
X = MinMaxScaler().fit_transform(X)

# %%
4 changes: 1 addition & 3 deletions examples/compose/plot_column_transformer_mixed_types.py
@@ -45,9 +45,7 @@

# %%
# Load data from https://www.openml.org/d/40945
X, y = fetch_openml(
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

# Alternatively X and y can be obtained directly from the frame attribute:
# X = titanic.frame.drop('survived', axis=1)
2 changes: 1 addition & 1 deletion examples/compose/plot_transformed_target.py
@@ -131,7 +131,7 @@ def compute_score(y_true, y_pred):
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import quantile_transform

ames = fetch_openml(name="house_prices", as_frame=True, parser="pandas")
ames = fetch_openml(name="house_prices", as_frame=True)
# Keep only numeric columns
X = ames.data.select_dtypes(np.number)
# Remove columns with NaN or Inf values
2 changes: 1 addition & 1 deletion examples/ensemble/plot_gradient_boosting_categorical.py
@@ -30,7 +30,7 @@
# are either categorical or numerical:
from sklearn.datasets import fetch_openml

X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True, parser="pandas")
X, y = fetch_openml(data_id=42165, as_frame=True, return_X_y=True)

# Select only a subset of features of X to make the example faster to run
categorical_columns_subset = [
2 changes: 1 addition & 1 deletion examples/ensemble/plot_stack_predictors.py
@@ -45,7 +45,7 @@


def load_ames_housing():
df = fetch_openml(name="house_prices", as_frame=True, parser="pandas")
df = fetch_openml(name="house_prices", as_frame=True)
X = df.data
y = df.target

2 changes: 1 addition & 1 deletion examples/gaussian_process/plot_gpr_co2.py
@@ -36,7 +36,7 @@
# in OpenML.
from sklearn.datasets import fetch_openml

co2 = fetch_openml(data_id=41187, as_frame=True, parser="pandas")
co2 = fetch_openml(data_id=41187, as_frame=True)
co2.frame.head()

# %%
@@ -55,7 +55,7 @@
# as a pandas dataframe.
from sklearn.datasets import fetch_openml

survey = fetch_openml(data_id=534, as_frame=True, parser="pandas")
survey = fetch_openml(data_id=534, as_frame=True)

# %%
# Then, we identify features `X` and targets `y`: the column WAGE is our
2 changes: 1 addition & 1 deletion examples/inspection/plot_partial_dependence.py
@@ -42,7 +42,7 @@
# rentals using weather and season data as well as the datetime information.
from sklearn.datasets import fetch_openml

bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True, parser="pandas")
bikes = fetch_openml("Bike_Sharing_Demand", version=2, as_frame=True)
# Make an explicit copy to avoid "SettingWithCopyWarning" from pandas
X, y = bikes.data.copy(), bikes.target

4 changes: 1 addition & 3 deletions examples/inspection/plot_permutation_importance.py
@@ -43,9 +43,7 @@
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

X, y = fetch_openml(
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
rng = np.random.RandomState(seed=42)
X["random_cat"] = rng.randint(3, size=X.shape[0])
X["random_num"] = rng.randn(X.shape[0])
@@ -53,7 +53,7 @@
# https://www.openml.org/d/41214
from sklearn.datasets import fetch_openml

df = fetch_openml(data_id=41214, as_frame=True, parser="pandas").frame
df = fetch_openml(data_id=41214, as_frame=True).frame
df

# %%
2 changes: 1 addition & 1 deletion examples/linear_model/plot_sgd_early_stopping.py
@@ -59,7 +59,7 @@
def load_mnist(n_samples=None, class_0="0", class_1="8"):
"""Load MNIST, select two classes, shuffle and return only n_samples."""
# Load data from http://openml.org/d/554
mnist = fetch_openml("mnist_784", version=1, as_frame=False, parser="pandas")
mnist = fetch_openml("mnist_784", version=1, as_frame=False)

# take only two classes for binary classification
mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)
@@ -36,9 +36,7 @@
train_samples = 5000

# Load data from https://www.openml.org/d/554
X, y = fetch_openml(
"mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
)
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)

random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])
@@ -68,12 +68,12 @@ def load_mtpl2(n_samples=None):
678013 samples.
"""
# freMTPL2freq dataset from https://www.openml.org/d/41214
df_freq = fetch_openml(data_id=41214, as_frame=True, parser="pandas").data
df_freq = fetch_openml(data_id=41214, as_frame=True).data
df_freq["IDpol"] = df_freq["IDpol"].astype(int)
df_freq.set_index("IDpol", inplace=True)

# freMTPL2sev dataset from https://www.openml.org/d/41215
df_sev = fetch_openml(data_id=41215, as_frame=True, parser="pandas").data
df_sev = fetch_openml(data_id=41215, as_frame=True).data

# sum ClaimAmount over identical IDs
df_sev = df_sev.groupby("IDpol").sum()
@@ -29,7 +29,7 @@
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = fetch_openml(data_id=1464, return_X_y=True, parser="pandas")
X, y = fetch_openml(data_id=1464, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

clf = make_pipeline(StandardScaler(), LogisticRegression(random_state=0))
8 changes: 2 additions & 6 deletions examples/miscellaneous/plot_outlier_detection_bench.py
@@ -201,9 +201,7 @@ def fit_predict(estimator, X):

from sklearn.datasets import fetch_openml

X, y = fetch_openml(
name="ames_housing", version=1, return_X_y=True, as_frame=True, parser="pandas"
)
X, y = fetch_openml(name="ames_housing", version=1, return_X_y=True, as_frame=True)
y = y.div(X["Lot_Area"])

# None values in pandas 1.5.1 were mapped to np.nan in pandas 2.0.1
@@ -256,9 +254,7 @@ def fit_predict(estimator, X):
# which are binary encoded and some are continuous.

# %%
X, y = fetch_openml(
name="cardiotocography", version=1, return_X_y=True, as_frame=False, parser="pandas"
)
X, y = fetch_openml(name="cardiotocography", version=1, return_X_y=True, as_frame=False)
X_cardiotocography = X # save X for later use
s = y == "3"
y = s.astype(np.int32)
4 changes: 1 addition & 3 deletions examples/miscellaneous/plot_set_output.py
@@ -68,9 +68,7 @@
# :class:`compose.ColumnTransformer` and heterogeneous data.
from sklearn.datasets import fetch_openml

X, y = fetch_openml(
"titanic", version=1, as_frame=True, return_X_y=True, parser="pandas"
)
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

# %%
2 changes: 1 addition & 1 deletion examples/multiclass/plot_multiclass_overview.py
@@ -29,7 +29,7 @@
# the dataset from OpenML.
from sklearn.datasets import fetch_openml

X, y = fetch_openml(data_id=181, as_frame=True, return_X_y=True, parser="pandas")
X, y = fetch_openml(data_id=181, as_frame=True, return_X_y=True)

# %%
# To know the type of data science problem we are dealing with, we can check
2 changes: 1 addition & 1 deletion examples/multioutput/plot_classifier_chain_yeast.py
@@ -41,7 +41,7 @@
from sklearn.model_selection import train_test_split

# Load a multi-label dataset from https://www.openml.org/d/40597
X, Y = fetch_openml("yeast", version=4, return_X_y=True, parser="pandas")
X, Y = fetch_openml("yeast", version=4, return_X_y=True)
Y = Y == "TRUE"
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

2 changes: 1 addition & 1 deletion examples/neighbors/approximate_nearest_neighbors.py
@@ -103,7 +103,7 @@ def transform(self, X):

def load_mnist(n_samples):
"""Load MNIST, shuffle the data, and return only n_samples."""
mnist = fetch_openml("mnist_784", as_frame=False, parser="pandas")
mnist = fetch_openml("mnist_784", as_frame=False)
X, y = shuffle(mnist.data, mnist.target, random_state=2)
return X[:n_samples] / 255, y[:n_samples]

4 changes: 1 addition & 3 deletions examples/neural_networks/plot_mnist_filters.py
@@ -34,9 +34,7 @@
from sklearn.neural_network import MLPClassifier

# Load data from https://www.openml.org/d/554
X, y = fetch_openml(
"mnist_784", version=1, return_X_y=True, as_frame=False, parser="pandas"
)
X, y = fetch_openml("mnist_784", version=1, return_X_y=True, as_frame=False)
X = X / 255.0

# Split data into train partition and test partition
2 changes: 1 addition & 1 deletion examples/preprocessing/plot_target_encoder.py
@@ -23,7 +23,7 @@
# be a reviewer:
from sklearn.datasets import fetch_openml

wine_reviews = fetch_openml(data_id=42074, as_frame=True, parser="pandas")
wine_reviews = fetch_openml(data_id=42074, as_frame=True)

df = wine_reviews.frame
df.head()