Skip to content

Commit d92fba7

Browse files
authored
FIX add retry mechanism to handle quotechar in read_csv (#25511)
1 parent eae3f29 commit d92fba7

File tree

9 files changed

+116
-30
lines changed

9 files changed

+116
-30
lines changed

doc/whats_new/v1.2.rst

+5-1
Original file line numberDiff line numberDiff line change
@@ -146,11 +146,15 @@ Changelog
146146
:mod:`sklearn.datasets`
147147
.......................
148148

149-
- |Fix| Fix an inconsistency in :func:`datasets.fetch_openml` between liac-arff
149+
- |Fix| Fixes an inconsistency in :func:`datasets.fetch_openml` between liac-arff
150150
and pandas parser when a leading space is introduced after the delimiter.
151151
The ARFF specs require ignoring the leading space.
152152
:pr:`25312` by :user:`Guillaume Lemaitre <glemaitre>`.
153153

154+
- |Fix| Fixes a bug in :func:`datasets.fetch_openml` when using `parser="pandas"`
155+
where single quote and backslash escape characters were not properly handled.
156+
:pr:`25511` by :user:`Guillaume Lemaitre <glemaitre>`.
157+
154158
:mod:`sklearn.decomposition`
155159
............................
156160

sklearn/datasets/_arff_parser.py

+41-11
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ def _pandas_arff_parser(
302302
openml_columns_info,
303303
feature_names_to_select,
304304
target_names_to_select,
305+
read_csv_kwargs=None,
305306
):
306307
"""ARFF parser using `pandas.read_csv`.
307308
@@ -331,6 +332,10 @@ def _pandas_arff_parser(
331332
target_names_to_select : list of str
332333
A list of the target names to be selected to build `y`.
333334
335+
read_csv_kwargs : dict, default=None
336+
Keyword arguments to pass to `pandas.read_csv`. They allow overriding
337+
the default options.
338+
334339
Returns
335340
-------
336341
X : {ndarray, sparse matrix, dataframe}
@@ -363,18 +368,37 @@ def _pandas_arff_parser(
363368
dtypes[name] = "Int64"
364369
elif column_dtype.lower() == "nominal":
365370
dtypes[name] = "category"
371+
# since we will not pass `names` when reading the ARFF file, we need to translate
372+
# `dtypes` from column names to column indices to pass to `pandas.read_csv`
373+
dtypes_positional = {
374+
col_idx: dtypes[name]
375+
for col_idx, name in enumerate(openml_columns_info)
376+
if name in dtypes
377+
}
366378

367-
# ARFF represents missing values with "?"
368-
frame = pd.read_csv(
369-
gzip_file,
370-
header=None,
371-
na_values=["?"], # missing values are represented by `?`
372-
comment="%", # skip line starting by `%` since they are comments
373-
quotechar='"', # delimiter to use for quoted strings
374-
names=[name for name in openml_columns_info],
375-
dtype=dtypes,
376-
skipinitialspace=True, # skip spaces after delimiter to follow ARFF specs
377-
)
379+
default_read_csv_kwargs = {
380+
"header": None,
381+
"index_col": False, # always force pandas to not use the first column as index
382+
"na_values": ["?"], # missing values are represented by `?`
383+
"comment": "%", # skip line starting by `%` since they are comments
384+
"quotechar": '"', # delimiter to use for quoted strings
385+
"skipinitialspace": True, # skip spaces after delimiter to follow ARFF specs
386+
"escapechar": "\\",
387+
"dtype": dtypes_positional,
388+
}
389+
read_csv_kwargs = {**default_read_csv_kwargs, **(read_csv_kwargs or {})}
390+
frame = pd.read_csv(gzip_file, **read_csv_kwargs)
391+
try:
392+
# Setting the columns while reading the file will select the N first columns
393+
# and not raise a ParserError. Instead, we set the columns after reading the
394+
# file and raise a ParserError if the number of columns does not match the
395+
# number of columns in the metadata given by OpenML.
396+
frame.columns = [name for name in openml_columns_info]
397+
except ValueError as exc:
398+
raise pd.errors.ParserError(
399+
"The number of columns provided by OpenML does not match the number of "
400+
"columns inferred by pandas when reading the file."
401+
) from exc
378402

379403
columns_to_select = feature_names_to_select + target_names_to_select
380404
columns_to_keep = [col for col in frame.columns if col in columns_to_select]
@@ -431,6 +455,7 @@ def load_arff_from_gzip_file(
431455
feature_names_to_select,
432456
target_names_to_select,
433457
shape=None,
458+
read_csv_kwargs=None,
434459
):
435460
"""Load a compressed ARFF file using a given parser.
436461
@@ -461,6 +486,10 @@ def load_arff_from_gzip_file(
461486
target_names_to_select : list of str
462487
A list of the target names to be selected.
463488
489+
read_csv_kwargs : dict, default=None
490+
Keyword arguments to pass to `pandas.read_csv`. They allow overriding
491+
the default options.
492+
464493
Returns
465494
-------
466495
X : {ndarray, sparse matrix, dataframe}
@@ -493,6 +522,7 @@ def load_arff_from_gzip_file(
493522
openml_columns_info,
494523
feature_names_to_select,
495524
target_names_to_select,
525+
read_csv_kwargs,
496526
)
497527
else:
498528
raise ValueError(

sklearn/datasets/_openml.py

+54-18
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,15 @@ def _get_local_path(openml_path: str, data_home: str) -> str:
3737
return os.path.join(data_home, "openml.org", openml_path + ".gz")
3838

3939

40-
def _retry_with_clean_cache(openml_path: str, data_home: Optional[str]) -> Callable:
40+
def _retry_with_clean_cache(
41+
openml_path: str,
42+
data_home: Optional[str],
43+
no_retry_exception: Optional[Exception] = None,
44+
) -> Callable:
4145
"""If the first call to the decorated function fails, the local cached
4246
file is removed, and the function is called again. If ``data_home`` is
43-
``None``, then the function is called once.
47+
``None``, then the function is called once. We can provide a specific
48+
exception to not retry on using the `no_retry_exception` parameter.
4449
"""
4550

4651
def decorator(f):
@@ -52,7 +57,11 @@ def wrapper(*args, **kw):
5257
return f(*args, **kw)
5358
except URLError:
5459
raise
55-
except Exception:
60+
except Exception as exc:
61+
if no_retry_exception is not None and isinstance(
62+
exc, no_retry_exception
63+
):
64+
raise
5665
warn("Invalid cache, redownloading file", RuntimeWarning)
5766
local_path = _get_local_path(openml_path, data_home)
5867
if os.path.exists(local_path):
@@ -216,7 +225,7 @@ def _get_json_content_from_openml_api(
216225
An exception otherwise.
217226
"""
218227

219-
@_retry_with_clean_cache(url, data_home)
228+
@_retry_with_clean_cache(url, data_home=data_home)
220229
def _load_json():
221230
with closing(
222231
_open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
@@ -492,20 +501,39 @@ def _load_arff_response(
492501
"and retry..."
493502
)
494503

495-
gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
496-
with closing(gzip_file):
504+
def _open_url_and_load_gzip_file(url, data_home, n_retries, delay, arff_params):
505+
gzip_file = _open_openml_url(url, data_home, n_retries=n_retries, delay=delay)
506+
with closing(gzip_file):
507+
return load_arff_from_gzip_file(gzip_file, **arff_params)
497508

498-
X, y, frame, categories = load_arff_from_gzip_file(
499-
gzip_file,
500-
parser=parser,
501-
output_type=output_type,
502-
openml_columns_info=openml_columns_info,
503-
feature_names_to_select=feature_names_to_select,
504-
target_names_to_select=target_names_to_select,
505-
shape=shape,
509+
arff_params = dict(
510+
parser=parser,
511+
output_type=output_type,
512+
openml_columns_info=openml_columns_info,
513+
feature_names_to_select=feature_names_to_select,
514+
target_names_to_select=target_names_to_select,
515+
shape=shape,
516+
)
517+
try:
518+
X, y, frame, categories = _open_url_and_load_gzip_file(
519+
url, data_home, n_retries, delay, arff_params
506520
)
521+
except Exception as exc:
522+
if parser == "pandas":
523+
from pandas.errors import ParserError
524+
525+
if isinstance(exc, ParserError):
526+
# A parsing error could come from providing the wrong quotechar
527+
# to pandas. By default, we use a double quote. Thus, we retry
528+
# with a single quote before raising the error.
529+
arff_params["read_csv_kwargs"] = {"quotechar": "'"}
530+
X, y, frame, categories = _open_url_and_load_gzip_file(
531+
url, data_home, n_retries, delay, arff_params
532+
)
533+
else:
534+
raise
507535

508-
return X, y, frame, categories
536+
return X, y, frame, categories
509537

510538

511539
def _download_data_to_bunch(
@@ -605,9 +633,17 @@ def _download_data_to_bunch(
605633
"values. Missing values are not supported for target columns."
606634
)
607635

608-
X, y, frame, categories = _retry_with_clean_cache(url, data_home)(
609-
_load_arff_response
610-
)(
636+
no_retry_exception = None
637+
if parser == "pandas":
638+
# If we get a ParserError with pandas, then we don't want to retry and we raise
639+
# early.
640+
from pandas.errors import ParserError
641+
642+
no_retry_exception = ParserError
643+
644+
X, y, frame, categories = _retry_with_clean_cache(
645+
url, data_home, no_retry_exception
646+
)(_load_arff_response)(
611647
url,
612648
data_home,
613649
parser=parser,

sklearn/datasets/tests/data/openml/id_42074/__init__.py

Whitespace-only changes.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

sklearn/datasets/tests/test_openml.py

+16
Original file line numberDiff line numberDiff line change
@@ -1617,6 +1617,22 @@ def test_fetch_openml_leading_whitespace(monkeypatch):
16171617
)
16181618

16191619

1620+
def test_fetch_openml_quotechar_escapechar(monkeypatch):
1621+
"""Check that we can handle escapechar and single/double quotechar.
1622+
1623+
Non-regression test for:
1624+
https://github.com/scikit-learn/scikit-learn/issues/25478
1625+
"""
1626+
pd = pytest.importorskip("pandas")
1627+
data_id = 42074
1628+
_monkey_patch_webbased_functions(monkeypatch, data_id=data_id, gzip_response=False)
1629+
1630+
common_params = {"as_frame": True, "cache": False, "data_id": data_id}
1631+
adult_pandas = fetch_openml(parser="pandas", **common_params)
1632+
adult_liac_arff = fetch_openml(parser="liac-arff", **common_params)
1633+
pd.testing.assert_frame_equal(adult_pandas.frame, adult_liac_arff.frame)
1634+
1635+
16201636
###############################################################################
16211637
# Deprecation-changed parameters
16221638

0 commit comments

Comments
 (0)