Skip to content

MAINT Parameters validation for sklearn.datasets.load_files #26203

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Apr 25, 2023
15 changes: 14 additions & 1 deletion sklearn/datasets/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from ..utils import check_random_state
from ..utils import check_pandas_support
from ..utils.fixes import _open_binary, _open_text, _read_text, _contents
from ..utils._param_validation import validate_params, Interval
from ..utils._param_validation import validate_params, Interval, StrOptions

import numpy as np

Expand Down Expand Up @@ -99,6 +99,19 @@ def _convert_data_dataframe(
return combined_df, X, y


@validate_params(
{
"container_path": [str, os.PathLike],
"description": [str, None],
"categories": [list, None],
"load_content": ["boolean"],
"shuffle": ["boolean"],
"encoding": [str, None],
"decode_error": [StrOptions({"strict", "ignore", "replace"})],
"random_state": ["random_state"],
"allowed_extensions": [list, None],
}
)
def load_files(
container_path,
*,
Expand Down
5 changes: 3 additions & 2 deletions sklearn/datasets/tests/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,11 @@ def test_default_load_files(test_category_dir_1, test_category_dir_2, load_files
def test_load_files_w_categories_desc_and_encoding(
test_category_dir_1, test_category_dir_2, load_files_root
):
category = os.path.abspath(test_category_dir_1).split("/").pop()
category = os.path.abspath(test_category_dir_1).split(os.sep).pop()
res = load_files(
load_files_root, description="test", categories=category, encoding="utf-8"
load_files_root, description="test", categories=[category], encoding="utf-8"
)

assert len(res.filenames) == 1
assert len(res.target_names) == 1
assert res.DESCR == "test"
Expand Down
1 change: 1 addition & 0 deletions sklearn/tests/test_public_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def _check_function_param_validation(
"sklearn.datasets.load_breast_cancer",
"sklearn.datasets.load_diabetes",
"sklearn.datasets.load_digits",
"sklearn.datasets.load_files",
"sklearn.datasets.load_iris",
"sklearn.datasets.load_linnerud",
"sklearn.datasets.load_svmlight_file",
Expand Down