Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions doc/whats_new/upcoming_changes/sklearn.datasets/31685.fix.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- Fixed a regression that prevented extracting the downloaded dataset in
:func:`datasets.fetch_20newsgroups`, :func:`datasets.fetch_20newsgroups_vectorized`,
:func:`datasets.fetch_lfw_people` and :func:`datasets.fetch_lfw_pairs`. This
only affects Python versions `>=3.10.0,<=3.10.11` and `>=3.11.0,<=3.11.3`.
By :user:`Jérémie du Boisberranger <jeremiedbb>`.
6 changes: 2 additions & 4 deletions sklearn/datasets/_lfw.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from ..utils import Bunch
from ..utils._param_validation import Hidden, Interval, StrOptions, validate_params
from ..utils.fixes import tarfile_extractall
from ._base import (
RemoteFileMetadata,
_fetch_remote,
Expand Down Expand Up @@ -117,10 +118,7 @@

logger.debug("Decompressing the data archive to %s", data_folder_path)
with tarfile.open(archive_path, "r:gz") as fp:
# Use filter="data" to prevent the most dangerous security issues.
# For more details, see
# https://docs.python.org/3.9/library/tarfile.html#tarfile.TarFile.extractall
fp.extractall(path=lfw_home, filter="data")
tarfile_extractall(fp, path=lfw_home)

Check warning on line 121 in sklearn/datasets/_lfw.py

View check run for this annotation

Codecov / codecov/patch

sklearn/datasets/_lfw.py#L121

Added line #L121 was not covered by tests

remove(archive_path)

Expand Down
6 changes: 2 additions & 4 deletions sklearn/datasets/_twenty_newsgroups.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from ..feature_extraction.text import CountVectorizer
from ..utils import Bunch, check_random_state
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.fixes import tarfile_extractall
from . import get_data_home, load_files
from ._base import (
RemoteFileMetadata,
Expand Down Expand Up @@ -81,10 +82,7 @@

logger.debug("Decompressing %s", archive_path)
with tarfile.open(archive_path, "r:gz") as fp:
# Use filter="data" to prevent the most dangerous security issues.
# For more details, see
# https://docs.python.org/3.9/library/tarfile.html#tarfile.TarFile.extractall
fp.extractall(path=target_dir, filter="data")
tarfile_extractall(fp, path=target_dir)

Check warning on line 85 in sklearn/datasets/_twenty_newsgroups.py

View check run for this annotation

Codecov / codecov/patch

sklearn/datasets/_twenty_newsgroups.py#L85

Added line #L85 was not covered by tests

with suppress(FileNotFoundError):
os.remove(archive_path)
Expand Down
11 changes: 11 additions & 0 deletions sklearn/utils/fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,17 @@
)


# TODO: Remove when Python min version >= 3.12.
def tarfile_extractall(tarfile, path):
    """Extract every member of an open tar archive into ``path``.

    The ``"data"`` extraction filter is used whenever the running
    interpreter provides it; otherwise the archive is extracted without a
    filter.

    Parameters
    ----------
    tarfile : tarfile.TarFile
        Open archive to extract. This parameter shadows the standard-library
        module of the same name inside this function.

    path : str or path-like
        Directory the archive members are extracted into.
    """
    # The parameter is named like the stdlib module, hence the aliased import.
    import tarfile as tarfile_module

    # Detect filter support with the feature check recommended by the tarfile
    # documentation instead of catching TypeError: a TypeError raised for an
    # unrelated reason during the filtered extraction must not silently
    # trigger a second, unfiltered extraction.
    if hasattr(tarfile_module, "data_filter"):
        # Use filter="data" to prevent the most dangerous security issues.
        # For more details, see
        # https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.extractall
        tarfile.extractall(path, filter="data")
    else:
        tarfile.extractall(path)

Check warning on line 372 in sklearn/utils/fixes.py

View check run for this annotation

Codecov / codecov/patch

sklearn/utils/fixes.py#L370-L372

Added lines #L370 - L372 were not covered by tests


def _in_unstable_openblas_configuration():
"""Return True if in an unstable configuration for OpenBLAS"""

Expand Down