From 075d7e3ef51479bfddacb733a663c66dbf76126c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 30 Jun 2020 22:45:56 -0400 Subject: [PATCH 01/38] WIP Adds typing support for automatic docstring generation --- sklearn/_build_utils/min_dependencies.py | 1 + sklearn/base.py | 16 +++ sklearn/linear_model/_logistic.py | 78 ++++++++++----- sklearn/utils/_typing.py | 120 +++++++++++++++++++++++ 4 files changed, 189 insertions(+), 26 deletions(-) create mode 100644 sklearn/utils/_typing.py diff --git a/sklearn/_build_utils/min_dependencies.py b/sklearn/_build_utils/min_dependencies.py index 67fef99880e4a..f663c4c5463e4 100644 --- a/sklearn/_build_utils/min_dependencies.py +++ b/sklearn/_build_utils/min_dependencies.py @@ -40,6 +40,7 @@ 'sphinx-gallery': ('0.7.0', 'docs'), 'numpydoc': ('1.0.0', 'docs'), 'Pillow': ('7.1.2', 'docs'), + 'typing-extensions': ('3.7.4.2', 'build, install') } diff --git a/sklearn/base.py b/sklearn/base.py index 46398baabfd3a..9edd5e368cafe 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -19,6 +19,8 @@ from .utils.validation import check_array from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args +from .utils._typing import add_types_to_docstring + _DEFAULT_TAGS = { 'non_deterministic': False, @@ -155,6 +157,20 @@ class BaseEstimator: arguments (no ``*args`` or ``**kwargs``). """ + def __init_subclass__(cls, inject_docstring=False, **kwargs): + if inject_docstring: + all_annotations = {} + if hasattr(cls, '__annotations__'): + all_annotations.update(cls.__annotations__) + if hasattr(cls.__init__, '__annotations__'): + all_annotations.update(cls.__init__.__annotations__) + params = inspect.signature(cls.__init__).parameters + defaults = {p: v.default for p, v in params.items() + if v.default != inspect.Parameter.empty} + cls.__doc__ = add_types_to_docstring(cls.__doc__, + all_annotations, + defaults) + @classmethod def _get_param_names(cls): """Get parameter names for the estimator""" diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c9451c749aaea..dd460683fe158 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -18,6 +18,12 @@ from scipy.special import expit, logsumexp from joblib import Parallel, delayed, effective_n_jobs +from ..utils._typing import Literal +from ..utils._typing import RandomState +from ..utils._typing import Annotated +from ..utils._typing import Shape +from typing import Union +from typing import Optional from ._base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator from ._sag import sag_solver from ..preprocessing import LabelEncoder, LabelBinarizer @@ -1009,7 +1015,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, class LogisticRegression(BaseEstimator, LinearClassifierMixin, - SparseCoefMixin): + SparseCoefMixin, inject_docstring=True): """ Logistic Regression (aka logit, MaxEnt) classifier. @@ -1036,7 +1042,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, Parameters ---------- - penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2' + penalty : Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. If 'none' (not supported by the @@ -1045,24 +1051,24 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionadded:: 0.19 l1 penalty with SAGA solver (allowing 'multinomial' + L1) - dual : bool, default=False + dual : Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - tol : float, default=1e-4 + tol : Tolerance for stopping criteria. - C : float, default=1.0 + C : Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. - fit_intercept : bool, default=True + fit_intercept : Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - intercept_scaling : float, default=1 + intercept_scaling : Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], @@ -1075,7 +1081,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - class_weight : dict or 'balanced', default=None + class_weight : Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -1089,12 +1095,11 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionadded:: 0.17 *class_weight='balanced'* - random_state : int, RandomState instance, default=None + random_state : Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details. - solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ - default='lbfgs' + solver : Algorithm to use in the optimization problem. @@ -1119,10 +1124,10 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionchanged:: 0.22 The default solver changed from 'liblinear' to 'lbfgs' in 0.22. - max_iter : int, default=100 + max_iter : Maximum number of iterations taken for the solvers to converge. - multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' + multi_class : If the option chosen is 'ovr', then a binary problem is fit for each label. For 'multinomial' the loss minimised is the multinomial loss fit across the entire probability distribution, *even when the data is @@ -1135,11 +1140,11 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionchanged:: 0.22 Default changed from 'ovr' to 'auto' in 0.22. - verbose : int, default=0 + verbose : For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. - warm_start : bool, default=False + warm_start : When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. See :term:`the Glossary `. @@ -1147,7 +1152,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, .. versionadded:: 0.17 *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers. - n_jobs : int, default=None + n_jobs : Number of CPU cores used when parallelizing over classes if multi_class='ovr'". This parameter is ignored when the ``solver`` is set to 'liblinear' regardless of whether 'multi_class' is specified or @@ -1155,7 +1160,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - l1_ratio : float, default=None + l1_ratio : The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent @@ -1165,17 +1170,17 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, Attributes ---------- - classes_ : ndarray of shape (n_classes, ) + classes_ : A list of class labels known to the classifier. - coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) + coef_ : Coefficient of the features in the decision function. `coef_` is of shape (1, n_features) when the given problem is binary. In particular, when `multi_class='multinomial'`, `coef_` corresponds to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False). - intercept_ : ndarray of shape (1,) or (n_classes,) + intercept_ : Intercept (a.k.a. bias) added to the decision function. If `fit_intercept` is set to False, the intercept is set to zero. @@ -1184,7 +1189,7 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, corresponds to outcome 1 (True) and `-intercept_` corresponds to outcome 0 (False). - n_iter_ : ndarray of shape (n_classes,) or (1, ) + n_iter_ : Actual number of iterations for all classes. If binary or multinomial, it returns only 1 element. For liblinear solver, only the maximum number of iteration across all classes is given. @@ -1249,12 +1254,33 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ + classes_: Annotated[np.ndarray, Shape(("n_classes",))] + coef_: Annotated[np.ndarray, Shape((1, "n_features"), + ("n_classes", "n_features"))] + intercept_: Annotated[np.ndarray, Shape((1,), + ("n_classes",))] + n_iter_: Annotated[np.ndarray, Shape(("n_classes",), + (1,))] + @_deprecate_positional_args - def __init__(self, penalty='l2', *, dual=False, tol=1e-4, C=1.0, - fit_intercept=True, intercept_scaling=1, class_weight=None, - random_state=None, solver='lbfgs', max_iter=100, - multi_class='auto', verbose=0, warm_start=False, n_jobs=None, - l1_ratio=None): + def __init__(self, + penalty: Literal['l1', 'l2', 'elasticnet', 'none'] = 'l2', + *, + dual: bool = False, + tol: float = 1e-4, + C: float = 1.0, + fit_intercept: bool = True, + intercept_scaling: float = 1, + class_weight: Union[dict, Literal['balanced']] = None, + random_state: RandomState = None, + solver: Literal['newton-cg', 'lbfgs', 'liblinear', 'sag', + 'saga'] = 'lbfgs', + max_iter: int = 100, + multi_class: Literal['auto', 'ovr', 'multinomial'] = 'auto', + verbose: int = 0, + warm_start: bool = False, + n_jobs: Optional[int] = None, + l1_ratio: float = None): self.penalty = penalty self.dual = dual diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py new file mode 100644 index 0000000000000..b1209d280c449 --- /dev/null +++ b/sklearn/utils/_typing.py @@ -0,0 +1,120 @@ +import re +import numpy as np + +from typing import Union +try: + from typing import Literal +except ImportError: + from typing_extensions import Literal # noqa + +try: + from typing import Annotated # python 3.9 +except ImportError: + from typing_extensions import Annotated # noqa + + +RandomState = Union[int, np.random.RandomState, None] + + +class Shape: + def __init__(self, *shapes): + self.shapes = [] + for shape in shapes: + if not isinstance(shape, (tuple, list)): + self.shapes.append((shape, )) + else: + self.shapes.append(shape) + + def _join_one(self, shape): + if len(shape) == 1: + return f"({shape[0]},)" + inner = ', '.join(str(s) for s in shape) + return f"({inner})" + + def __repr__(self): + output = ' or '.join(self._join_one(shape) for shape in self.shapes) + return f"of shape {output}" + + +def format_annotation(annotation): + """Convert annotation to docstring""" + if annotation is None or annotation is type(None): # noqa + return 'None' + + if annotation in [int, bool, float, str, dict, np.ndarray]: + return annotation.__qualname__ + + if hasattr(annotation, '__name__'): + name = annotation.__name__ + if name == 'BaseEstimator': + return 'estimator instance' + elif name == 'RandomState': + return 'int, RandomState instance, or None' + elif name == 'ArrayLike': + return 'array-like' + + if hasattr(annotation, '__origin__'): + origin = annotation.__origin__ + if hasattr(annotation, '__metadata__'): # Annotated + metadata = ', '.join(str(t) for t in annotation.__metadata__) + type_info = format_annotation(origin) + return f'{type_info} {metadata}' + + if getattr(origin, '__qualname__', None): + name = origin.__qualname__ + elif getattr(origin, '_name', None): + # Required for Union on Python 3.7+ + name = origin._name + else: + # Required for Union on Python < 3.7 + name = origin.__class__.__qualname__.lstrip('_') + + if name == 'Union': + values = [format_annotation(t) for t in annotation.__args__] + if len(values) == 2: + return ' or '.join(values) + # greater than 2 + first = ', '.join(values[:-1]) + return f'{first}, or {values[-1]}' + + elif name == "Literal": + values = ', '.join(format_annotation(t) + for t in annotation.__args__) + return f'{{{values}}}' + elif name == 'list': + values = ', '.join(format_annotation(t) + for t in annotation.__args__) + return f'list of {values}' + + return repr(annotation) + + +def add_types_to_docstring(docstring, annotations, defaults): + + indent_regex = r"^( +)Parameters\s*\n +[-=]{10}" + indent_match = re.search(indent_regex, docstring, flags=re.MULTILINE) + if not indent_match: + return docstring + n_indent = len(indent_match.group(1)) + + indent = " " * n_indent + param_regex = re.compile(f"{indent}(\\w+) :") + lines = docstring.split('\n') + + for lineno, line in enumerate(lines): + found_param = param_regex.match(line) + if not found_param: + continue + name = found_param.group(1) + + if name not in annotations: + continue + + annotation = annotations[name] + type_str = format_annotation(annotation) + new_line = f"{indent}{name} : {type_str}" + if name in defaults: + new_line += f", (default={defaults[name]})" + lines[lineno] = new_line + + return "\n".join(lines) From 6b35ee1e973b58f63bdf0e4eea3ecab391d7b6f4 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 30 Jun 2020 23:13:56 -0400 Subject: [PATCH 02/38] WIP Temporary fix --- sklearn/linear_model/_logistic.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index dd460683fe158..24d4b7cec245c 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1254,13 +1254,12 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ - classes_: Annotated[np.ndarray, Shape(("n_classes",))] + classes_: Annotated[np.ndarray, Shape(("n_classes",))] # noqa coef_: Annotated[np.ndarray, Shape((1, "n_features"), - ("n_classes", "n_features"))] + ("n_classes", "n_features"))] # noqa intercept_: Annotated[np.ndarray, Shape((1,), - ("n_classes",))] - n_iter_: Annotated[np.ndarray, Shape(("n_classes",), - (1,))] + ("n_classes",))] # noqa + n_iter_: Annotated[np.ndarray, Shape(("n_classes",), (1,))] # noqa @_deprecate_positional_args def __init__(self, From e1584406c701096c9cff83bc13c2dcc61c7bac84 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 30 Jun 2020 23:20:05 -0400 Subject: [PATCH 03/38] WIP Temporary fix --- sklearn/linear_model/_logistic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 24d4b7cec245c..4d32aa229a83f 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1255,9 +1255,9 @@ class LogisticRegression(BaseEstimator, LinearClassifierMixin, 0.97... """ classes_: Annotated[np.ndarray, Shape(("n_classes",))] # noqa - coef_: Annotated[np.ndarray, Shape((1, "n_features"), + coef_: Annotated[np.ndarray, Shape((1, "n_features"), # noqa ("n_classes", "n_features"))] # noqa - intercept_: Annotated[np.ndarray, Shape((1,), + intercept_: Annotated[np.ndarray, Shape((1,), # noqa ("n_classes",))] # noqa n_iter_: Annotated[np.ndarray, Shape(("n_classes",), (1,))] # noqa From 4a925e1f7bc6fc32ae823f065cf61b3b0ef2c15a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 30 Jun 2020 23:25:42 -0400 Subject: [PATCH 04/38] WIP Install typing extensions for linting --- azure-pipelines.yml | 2 +- sklearn/utils/_typing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index b58283b718bd1..a5914e2ab3d49 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,7 +17,7 @@ jobs: inputs: versionSpec: '3.8' - bash: | - pip install flake8 mypy==0.780 + pip install flake8 mypy==0.780 typing-extensions displayName: Install linters - bash: | set -ex diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index b1209d280c449..47c6e73d5a6dd 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -3,12 +3,12 @@ from typing import Union try: - from typing import Literal + from typing import Literal # noqa except ImportError: from typing_extensions import Literal # noqa try: - from typing import Annotated # python 3.9 + from typing import Annotated # noqa except ImportError: from typing_extensions import Annotated # noqa From 6cb1e792340cfe41a2bd395d8ac51f93188b45dc Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 30 Jun 2020 23:29:04 -0400 Subject: [PATCH 05/38] WIP Try again --- sklearn/utils/_typing.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 47c6e73d5a6dd..52f6a8cab6caa 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -2,15 +2,8 @@ import numpy as np from typing import Union -try: - from typing import Literal # noqa -except ImportError: - from typing_extensions import Literal # noqa - -try: - from typing import Annotated # noqa -except ImportError: - from typing_extensions import Annotated # noqa +from typing_extensions import Literal # noqa +from typing_extensions import Annotated # noqa RandomState = Union[int, np.random.RandomState, None] From 87f6b076972d530bb3151f9ae8513e7ce7a29da0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 8 Jul 2020 00:19:42 -0400 Subject: [PATCH 06/38] ENH Adds tests with docstring type generator tool --- build_tools/get_formatted_docstring_types.py | 41 +++++ sklearn/base.py | 15 -- sklearn/linear_model/_logistic.py | 54 +++--- sklearn/tests/test_common.py | 97 +++++++++++ sklearn/utils/_typing.py | 163 +++++++++---------- 5 files changed, 243 insertions(+), 127 deletions(-) create mode 100644 build_tools/get_formatted_docstring_types.py diff --git a/build_tools/get_formatted_docstring_types.py b/build_tools/get_formatted_docstring_types.py new file mode 100644 index 0000000000000..930b5c8056cbc --- /dev/null +++ b/build_tools/get_formatted_docstring_types.py @@ -0,0 +1,41 @@ +"""Helper script to get the docstring type of estimator or function.""" +from importlib import import_module +import argparse +import inspect + + +from sklearn.utils._typing import get_annotations + + +parser = argparse.ArgumentParser( + description=("Generates typed docstring for a specific scikit-learn " + "class or function")) +parser.add_argument('object', help=("scikit-learn object, for example " + "linear_model.LogisticRegression")) + +args = parser.parse_args() +object_input = args.object +object_split = object_input.split(".") + +module = "sklearn." + ".".join(object_split[:-1]) +instance_str = object_split[-1] +instance = getattr(import_module(module), instance_str) + + +print("Parameters") +print("----------") +if inspect.isclass(instance): + formatted_annotations = get_annotations(instance.__init__) +else: + formatted_annotations = get_annotations(instance) +for name, annotation in formatted_annotations.items(): + print(f"{name} : {annotation}") + + +if inspect.isclass(instance): + print() + print("Attributes") + print("----------") + formatted_annotations = get_annotations(instance) + for name, annotation in formatted_annotations.items(): + print(f"{name} : {annotation}") diff --git a/sklearn/base.py b/sklearn/base.py index 9edd5e368cafe..d451d9e3f387a 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -19,7 +19,6 @@ from .utils.validation import check_array from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args -from .utils._typing import add_types_to_docstring _DEFAULT_TAGS = { @@ -157,20 +156,6 @@ class BaseEstimator: arguments (no ``*args`` or ``**kwargs``). """ - def __init_subclass__(cls, inject_docstring=False, **kwargs): - if inject_docstring: - all_annotations = {} - if hasattr(cls, '__annotations__'): - all_annotations.update(cls.__annotations__) - if hasattr(cls.__init__, '__annotations__'): - all_annotations.update(cls.__init__.__annotations__) - params = inspect.signature(cls.__init__).parameters - defaults = {p: v.default for p, v in params.items() - if v.default != inspect.Parameter.empty} - cls.__doc__ = add_types_to_docstring(cls.__doc__, - all_annotations, - defaults) - @classmethod def _get_param_names(cls): """Get parameter names for the estimator""" diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c4cc30f859074..7ef3d851fa109 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1016,7 +1016,7 @@ def _log_reg_scoring_path(X, y, train, test, pos_class=None, Cs=10, class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, - BaseEstimator, inject_docstring=True): + BaseEstimator): """ Logistic Regression (aka logit, MaxEnt) classifier. @@ -1043,7 +1043,7 @@ class LogisticRegression(LinearClassifierMixin, Parameters ---------- - penalty : + penalty : {'l1', 'l2', 'elasticnet', 'none'}, default='l2' Used to specify the norm used in the penalization. The 'newton-cg', 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is only supported by the 'saga' solver. If 'none' (not supported by the @@ -1052,24 +1052,24 @@ class LogisticRegression(LinearClassifierMixin, .. versionadded:: 0.19 l1 penalty with SAGA solver (allowing 'multinomial' + L1) - dual : + dual : bool, default=False Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - tol : + tol : float, default=0.0001 Tolerance for stopping criteria. - C : + C : float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. - fit_intercept : + fit_intercept : bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - intercept_scaling : + intercept_scaling : float, default=1 Useful only when the solver 'liblinear' is used and self.fit_intercept is set to True. In this case, x becomes [x, self.intercept_scaling], @@ -1082,7 +1082,7 @@ class LogisticRegression(LinearClassifierMixin, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - class_weight : + class_weight : dict or {'balanced'}, default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. @@ -1096,11 +1096,12 @@ class LogisticRegression(LinearClassifierMixin, .. versionadded:: 0.17 *class_weight='balanced'* - random_state : + random_state : int, RandomState instance or None, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details. - solver : + solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ + default='lbfgs' Algorithm to use in the optimization problem. @@ -1125,10 +1126,10 @@ class LogisticRegression(LinearClassifierMixin, .. versionchanged:: 0.22 The default solver changed from 'liblinear' to 'lbfgs' in 0.22. - max_iter : + max_iter : int, default=100 Maximum number of iterations taken for the solvers to converge. - multi_class : + multi_class : {'auto', 'ovr', 'multinomial'}, default='auto' If the option chosen is 'ovr', then a binary problem is fit for each label. For 'multinomial' the loss minimised is the multinomial loss fit across the entire probability distribution, *even when the data is @@ -1141,11 +1142,11 @@ class LogisticRegression(LinearClassifierMixin, .. versionchanged:: 0.22 Default changed from 'ovr' to 'auto' in 0.22. - verbose : + verbose : int, default=0 For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. - warm_start : + warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. See :term:`the Glossary `. @@ -1153,7 +1154,7 @@ class LogisticRegression(LinearClassifierMixin, .. versionadded:: 0.17 *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers. - n_jobs : + n_jobs : int or None, default=None Number of CPU cores used when parallelizing over classes if multi_class='ovr'". This parameter is ignored when the ``solver`` is set to 'liblinear' regardless of whether 'multi_class' is specified or @@ -1161,7 +1162,7 @@ class LogisticRegression(LinearClassifierMixin, context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - l1_ratio : + l1_ratio : float or None, default=None The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent @@ -1171,17 +1172,17 @@ class LogisticRegression(LinearClassifierMixin, Attributes ---------- - classes_ : + classes_ : ndarray of shape (n_classes,) A list of class labels known to the classifier. - coef_ : + coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) Coefficient of the features in the decision function. `coef_` is of shape (1, n_features) when the given problem is binary. In particular, when `multi_class='multinomial'`, `coef_` corresponds to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False). - intercept_ : + intercept_ : ndarray of shape (1,) or (n_classes,) Intercept (a.k.a. bias) added to the decision function. If `fit_intercept` is set to False, the intercept is set to zero. @@ -1190,7 +1191,7 @@ class LogisticRegression(LinearClassifierMixin, corresponds to outcome 1 (True) and `-intercept_` corresponds to outcome 0 (False). - n_iter_ : + n_iter_ : ndarray of shape (n_classes,) or (1,) Actual number of iterations for all classes. If binary or multinomial, it returns only 1 element. For liblinear solver, only the maximum number of iteration across all classes is given. @@ -1255,12 +1256,11 @@ class LogisticRegression(LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ - classes_: Annotated[np.ndarray, Shape(("n_classes",))] # noqa - coef_: Annotated[np.ndarray, Shape((1, "n_features"), # noqa - ("n_classes", "n_features"))] # noqa - intercept_: Annotated[np.ndarray, Shape((1,), # noqa - ("n_classes",))] # noqa - n_iter_: Annotated[np.ndarray, Shape(("n_classes",), (1,))] # noqa + classes_: Annotated[np.ndarray, Shape(("n_classes",))] + coef_: Annotated[np.ndarray, Shape((1, "n_features"), + ("n_classes", "n_features"))] + intercept_: Annotated[np.ndarray, Shape((1,), ("n_classes",))] + n_iter_: Annotated[np.ndarray, Shape(("n_classes",), (1,))] @_deprecate_positional_args def __init__(self, @@ -1280,7 +1280,7 @@ def __init__(self, verbose: int = 0, warm_start: bool = False, n_jobs: Optional[int] = None, - l1_ratio: float = None): + l1_ratio: Optional[float] = None): self.penalty = penalty self.dual = dual diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index b9f50a76f7b30..649797f3ff171 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -29,6 +29,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.utils import IS_PYPY from sklearn.utils._testing import SkipTest +from sklearn.utils._typing import get_annotations from sklearn.utils.estimator_checks import ( _construct_instance, _set_checking_parameters, @@ -204,3 +205,99 @@ def test_class_support_removed(): with pytest.raises(TypeError, match=msg): parametrize_with_checks([LogisticRegression]) + + + +TYPING_IGNORED = { + 'ARDRegression', 'AdaBoostClassifier', 'AdaBoostRegressor', + 'AdditiveChi2Sampler', 'AffinityPropagation', + 'AgglomerativeClustering', 'BaggingClassifier', 'BaggingRegressor', + 'BayesianGaussianMixture', 'BayesianRidge', 'BernoulliNB', + 'BernoulliRBM', 'Binarizer', 'Birch', 'CCA', 'CalibratedClassifierCV', + 'CategoricalNB', 'ClassifierChain', 'ColumnTransformer', + 'ComplementNB', 'CountVectorizer', 'DBSCAN', 'DecisionTreeClassifier', + 'DecisionTreeRegressor', 'DictVectorizer', 'DictionaryLearning', + 'DummyClassifier', 'DummyRegressor', 'ElasticNet', 'ElasticNetCV', + 'EllipticEnvelope', 'EmpiricalCovariance', 'ExtraTreeClassifier', + 'ExtraTreeRegressor', 'ExtraTreesClassifier', 'ExtraTreesRegressor', + 'FactorAnalysis', 'FastICA', 'FeatureAgglomeration', 'FeatureHasher', + 'FeatureUnion', 'FunctionTransformer', 'GammaRegressor', + 'GaussianMixture', 'GaussianNB', 'GaussianProcessClassifier', + 'GaussianProcessRegressor', 'GaussianRandomProjection', + 'GenericUnivariateSelect', 'GradientBoostingClassifier', + 'GradientBoostingRegressor', 'GraphicalLasso', 'GraphicalLassoCV', + 'GridSearchCV', 'HashingVectorizer', 'HistGradientBoostingClassifier', + 'HistGradientBoostingRegressor', 'HuberRegressor', 'IncrementalPCA', + 'IsolationForest', 'Isomap', 'IsotonicRegression', 'IterativeImputer', + 'KBinsDiscretizer', 'KMeans', 'KNNImputer', 'KNeighborsClassifier', + 'KNeighborsRegressor', 'KNeighborsTransformer', 'KernelCenterer', + 'KernelDensity', 'KernelPCA', 'KernelRidge', 'LabelBinarizer', + 'LabelEncoder', 'LabelPropagation', 'LabelSpreading', 'Lars', 'LarsCV', + 'Lasso', 'LassoCV', 'LassoLars', 'LassoLarsCV', 'LassoLarsIC', + 'LatentDirichletAllocation', 'LedoitWolf', + 'LinearDiscriminantAnalysis', 'LinearRegression', 'LinearSVC', + 'LinearSVR', 'LocalOutlierFactor', 'LocallyLinearEmbedding', + 'LogisticRegressionCV', 'MDS', 'MLPClassifier', + 'MLPRegressor', 'MaxAbsScaler', 'MeanShift', 'MinCovDet', + 'MinMaxScaler', 'MiniBatchDictionaryLearning', 'MiniBatchKMeans', + 'MiniBatchSparsePCA', 'MissingIndicator', 'MultiLabelBinarizer', + 'MultiOutputClassifier', 'MultiOutputRegressor', 'MultiTaskElasticNet', + 'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV', + 'MultinomialNB', 'NMF', 'NearestCentroid', 'NearestNeighbors', + 'NeighborhoodComponentsAnalysis', 'Normalizer', 'NuSVC', 'NuSVR', + 'Nystroem', 'OAS', 'OPTICS', 'OneClassSVM', 'OneHotEncoder', + 'OneVsOneClassifier', 'OneVsRestClassifier', 'OrdinalEncoder', + 'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV', + 'OutputCodeClassifier', 'PCA', 'PLSCanonical', 'PLSRegression', + 'PLSSVD', 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', + 'PatchExtractor', 'Perceptron', 'Pipeline', 'PoissonRegressor', + 'PolynomialFeatures', 'PowerTransformer', + 'QuadraticDiscriminantAnalysis', 'QuantileTransformer', + 'RANSACRegressor', 'RBFSampler', 'RFE', 'RFECV', + 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', + 'RadiusNeighborsTransformer', 'RandomForestClassifier', + 'RandomForestRegressor', 'RandomTreesEmbedding', 'RandomizedSearchCV', + 'RegressorChain', 'Ridge', 'RidgeCV', 'RidgeClassifier', + 'RidgeClassifierCV', 'RobustScaler', 'SGDClassifier', 'SGDRegressor', + 'SVC', 'SVR', 'SelectFdr', 'SelectFpr', 'SelectFromModel', 'SelectFwe', + 'SelectKBest', 'SelectPercentile', 'ShrunkCovariance', 'SimpleImputer', + 'SkewedChi2Sampler', 'SparseCoder', 'SparsePCA', + 'SparseRandomProjection', 'SpectralBiclustering', 'SpectralClustering', + 'SpectralCoclustering', 'SpectralEmbedding', 'StackingClassifier', + 'StackingRegressor', 'StandardScaler', 'TSNE', 'TfidfTransformer', + 'TfidfVectorizer', 'TheilSenRegressor', 'TransformedTargetRegressor', + 'TruncatedSVD', 'TweedieRegressor', 'VarianceThreshold', + 'VotingClassifier', 'VotingRegressor' +} + + +@pytest.mark.parametrize( + 'name, Estimator', [ + pytest.param( + name, Estimator, marks=pytest.mark.skipif( + name in TYPING_IGNORED, + reason="Estimator does not have annotations")) + for name, Estimator in all_estimators()]) +def test_estimators_typestring(name, Estimator): + # Check that docstring's type is formated correctly + docscrape = pytest.importorskip('numpydoc.docscrape') + + doc = docscrape.ClassDoc(Estimator) + parameters = doc['Parameters'] + parameter_annnotations = get_annotations(Estimator.__init__) + + for parameter in parameters: + name, type_str = parameter.name, parameter.type + # whitespaces are collapsed to one whitespace + type_str = ' '.join(parameter.type.split()) + assert parameter_annnotations[parameter.name] == type_str, ( + f"{name} has incorrectly formated docstring") + + attributes = doc['Attributes'] + attribute_annotations = get_annotations(Estimator) + for attribute in attributes: + name, type_str = attribute.name, attribute.type + # whitespaces are collapsed to one whitespace + type_str = ' '.join(attribute.type.split()) + assert attribute_annotations[name] == type_str, ( + f"{name} has incorrectly formated docstring") diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 52f6a8cab6caa..86dc617a892a2 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -1,22 +1,23 @@ -import re -import numpy as np +import inspect from typing import Union +from typing import Any +from typing import TypeVar from typing_extensions import Literal # noqa from typing_extensions import Annotated # noqa +import numpy as np + RandomState = Union[int, np.random.RandomState, None] +ArrayLike = TypeVar('ArrayLike') class Shape: def __init__(self, *shapes): - self.shapes = [] - for shape in shapes: - if not isinstance(shape, (tuple, list)): - self.shapes.append((shape, )) - else: - self.shapes.append(shape) + if any(not isinstance(s, tuple) for s in shapes): + raise ValueError("All shapes must be tuple") + self.shapes = shapes def _join_one(self, shape): if len(shape) == 1: @@ -29,85 +30,77 @@ def __repr__(self): return f"of shape {output}" -def format_annotation(annotation): - """Convert annotation to docstring""" - if annotation is None or annotation is type(None): # noqa +def get_annotation_class_name(annotation) -> str: + if annotation is None: return 'None' + elif annotation is Any: + return 'Any' + elif hasattr(annotation, '__metadata__'): + return 'Annotated' - if annotation in [int, bool, float, str, dict, np.ndarray]: + if getattr(annotation, '__qualname__', None): return annotation.__qualname__ + elif getattr(annotation, '_name', None): + # generic for >= 3.7 + return annotation._name - if hasattr(annotation, '__name__'): - name = annotation.__name__ - if name == 'BaseEstimator': - return 'estimator instance' - elif name == 'RandomState': - return 'int, RandomState instance, or None' - elif name == 'ArrayLike': - return 'array-like' - - if hasattr(annotation, '__origin__'): - origin = annotation.__origin__ - if hasattr(annotation, '__metadata__'): # Annotated - metadata = ', '.join(str(t) for t in annotation.__metadata__) - type_info = format_annotation(origin) - return f'{type_info} {metadata}' - - if getattr(origin, '__qualname__', None): - name = origin.__qualname__ - elif getattr(origin, '_name', None): - # Required for Union on Python 3.7+ - name = origin._name - else: - # Required for Union on Python < 3.7 - name = origin.__class__.__qualname__.lstrip('_') - - if name == 'Union': - values = [format_annotation(t) for t in annotation.__args__] - if len(values) == 2: - return ' or '.join(values) - # greater than 2 - first = ', '.join(values[:-1]) - return f'{first}, or {values[-1]}' - - elif name == "Literal": - values = ', '.join(format_annotation(t) - for t in annotation.__args__) - return f'{{{values}}}' - elif name == 'list': - values = ', '.join(format_annotation(t) - for t in annotation.__args__) - return f'list of {values}' - - return repr(annotation) - - -def add_types_to_docstring(docstring, annotations, defaults): - - indent_regex = r"^( +)Parameters\s*\n +[-=]{10}" - indent_match = re.search(indent_regex, docstring, flags=re.MULTILINE) - if not indent_match: - return docstring - n_indent = len(indent_match.group(1)) - - indent = " " * n_indent - param_regex = re.compile(f"{indent}(\\w+) :") - lines = docstring.split('\n') - - for lineno, line in enumerate(lines): - found_param = param_regex.match(line) - if not found_param: - continue - name = found_param.group(1) - - if name not in annotations: - continue - - annotation = annotations[name] - type_str = format_annotation(annotation) - new_line = f"{indent}{name} : {type_str}" - if name in defaults: - new_line += f", (default={defaults[name]})" - lines[lineno] = new_line + origin = getattr(annotation, '__origin__', None) + if origin: + return get_annotation_class_name(annotation.__origin__) + + if inspect.isclass(annotation): + annotation = annotation.__class__ + return annotation.__qualname__.lstrip('_') - return "\n".join(lines) + +def format_annotation(annotation): + """Convert annotation to docstring""" + class_name = get_annotation_class_name(annotation) + + if class_name == 'BaseEstimator': + return 'estimator instance' + elif class_name == 'ArrayLike': + return 'array-like' + elif class_name == 'NoneType': + return 'None' + elif class_name == 'RandomState': + return 'RandomState instance' + elif class_name == 'Annotated': + inner_annotation = format_annotation(annotation.__origin__) + args = ', '.join(repr(t) for t in annotation.__metadata__) + return f'{inner_annotation} {args}' + elif class_name == 'Union': + values = [format_annotation(t) for t in annotation.__args__] + if len(values) == 2: + return ' or '.join(values) + # greater than 2 + first = ', '.join(values[:-1]) + return f'{first} or {values[-1]}' + elif class_name == 'Literal': + values = ', '.join(repr(t) for t in annotation.__args__) + return f'{{{values}}}' + elif class_name in ('list', 'List'): + values = ', '.join(format_annotation(t) + for t in annotation.__args__) + return f'list of {values}' + + return class_name + + +def get_annotations(instance): + if not hasattr(instance, '__annotations__'): + raise ValueError(f"{instance} does not have annotations") + + annotations = instance.__annotations__ + # get defaults + params = inspect.signature(instance).parameters + defaults = {p: v.default for p, v in params.items() + if v.default != inspect.Parameter.empty} + + output = {} + for name, annotation in annotations.items(): + anno = format_annotation(annotation) + if name in defaults: + anno += f", default={repr(defaults[name])}" + output[name] = anno + return output From 4c477760bb7496bc20160ee74fa5f5d071d4e3a6 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 8 Jul 2020 09:21:32 -0400 Subject: [PATCH 07/38] STY Lint error --- sklearn/tests/test_common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index 649797f3ff171..ef81d0cf02414 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -207,7 +207,6 @@ def test_class_support_removed(): parametrize_with_checks([LogisticRegression]) - TYPING_IGNORED = { 'ARDRegression', 'AdaBoostClassifier', 'AdaBoostRegressor', 'AdditiveChi2Sampler', 'AffinityPropagation', From d8c85d057c0db1e482693fcda6608bf618a9b639 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 8 Jul 2020 10:42:53 -0400 Subject: [PATCH 08/38] MNT Adds F821 to ignore flag --- setup.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index f086993b26a29..81f9482e19d8d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,4 +26,5 @@ artifact_indexes= [flake8] # Default flake8 3.5 ignored flags -ignore=E121,E123,E126,E226,E24,E704,W503,W504 +# F821 is added when using Annotated for python typing +ignore=E121,E123,E126,E226,E24,E704,W503,W504,F821 From 2e67bc4c9740671a17a9a65f388acb0619f867a1 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 8 Jul 2020 20:08:02 -0400 Subject: [PATCH 09/38] BLD Try to fix typing-extensions --- sklearn/_build_utils/min_dependencies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_build_utils/min_dependencies.py b/sklearn/_build_utils/min_dependencies.py index f663c4c5463e4..992849f22f0db 100644 --- a/sklearn/_build_utils/min_dependencies.py +++ b/sklearn/_build_utils/min_dependencies.py @@ -40,7 +40,7 @@ 'sphinx-gallery': ('0.7.0', 'docs'), 'numpydoc': ('1.0.0', 'docs'), 'Pillow': ('7.1.2', 'docs'), - 'typing-extensions': ('3.7.4.2', 'build, install') + 'typing-extensions': ('3.7.4', 'build, install') } From cf9ff0caffe9a5f8c60581f7c46fa2bc25abc6c6 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Wed, 8 Jul 2020 21:16:04 -0400 Subject: [PATCH 10/38] BUG Fix link --- build_tools/azure/install.cmd | 4 ++-- sklearn/_build_utils/min_dependencies.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index caf28261dcc9f..5046bcd802300 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -15,7 +15,7 @@ IF "%PYTHON_ARCH%"=="64" ( call activate %VIRTUALENV% - pip install threadpoolctl + pip install threadpoolctl typing-extensions IF "%PYTEST_VERSION%"=="*" ( pip install pytest @@ -23,7 +23,7 @@ IF "%PYTHON_ARCH%"=="64" ( pip install pytest==%PYTEST_VERSION% ) ) else ( - pip install numpy scipy cython pytest wheel pillow joblib threadpoolctl + pip install numpy scipy cython pytest wheel pillow joblib threadpoolctl typing-extensions ) IF "%PYTEST_XDIST%" == "true" ( diff --git a/sklearn/_build_utils/min_dependencies.py b/sklearn/_build_utils/min_dependencies.py index 992849f22f0db..f663c4c5463e4 100644 --- a/sklearn/_build_utils/min_dependencies.py +++ b/sklearn/_build_utils/min_dependencies.py @@ -40,7 +40,7 @@ 'sphinx-gallery': ('0.7.0', 'docs'), 'numpydoc': ('1.0.0', 'docs'), 'Pillow': ('7.1.2', 'docs'), - 'typing-extensions': ('3.7.4', 'build, install') + 'typing-extensions': ('3.7.4.2', 'build, install') } From 2535e9f51c3706f4e3713bb0110a34616970181d Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 9 Jul 2020 18:07:24 -0400 Subject: [PATCH 11/38] CI Fixes version? --- sklearn/_build_utils/min_dependencies.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/_build_utils/min_dependencies.py b/sklearn/_build_utils/min_dependencies.py index f663c4c5463e4..992849f22f0db 100644 --- a/sklearn/_build_utils/min_dependencies.py +++ b/sklearn/_build_utils/min_dependencies.py @@ -40,7 +40,7 @@ 'sphinx-gallery': ('0.7.0', 'docs'), 'numpydoc': ('1.0.0', 'docs'), 'Pillow': ('7.1.2', 'docs'), - 'typing-extensions': ('3.7.4.2', 'build, install') + 'typing-extensions': ('3.7.4', 'build, install') } From d861af41b05c6ef8557f00ea70bce4ea438f145a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 3 Aug 2020 12:45:16 -0400 Subject: [PATCH 12/38] CLN Update function names --- sklearn/utils/_typing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 86dc617a892a2..01e25262fda47 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -19,14 +19,14 @@ def __init__(self, *shapes): raise ValueError("All shapes must be tuple") self.shapes = shapes - def _join_one(self, shape): + def _join(self, shape): if len(shape) == 1: return f"({shape[0]},)" inner = ', '.join(str(s) for s in shape) return f"({inner})" def __repr__(self): - output = ' or '.join(self._join_one(shape) for shape in self.shapes) + output = ' or '.join(self._join(shape) for shape in self.shapes) return f"of shape {output}" From 292c6bafa07f2089eeba059ac6d4427d8a64a19c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 3 Aug 2020 14:43:59 -0400 Subject: [PATCH 13/38] WIP Adds tests --- build_tools/get_formatted_docstring_types.py | 22 ++---- setup.cfg | 3 +- sklearn/linear_model/_logistic.py | 8 --- sklearn/utils/_typing.py | 74 ++++++++------------ sklearn/utils/tests/test_typing.py | 38 ++++++++++ 5 files changed, 74 insertions(+), 71 deletions(-) create mode 100644 sklearn/utils/tests/test_typing.py diff --git a/build_tools/get_formatted_docstring_types.py b/build_tools/get_formatted_docstring_types.py index 930b5c8056cbc..64146299e5ab6 100644 --- a/build_tools/get_formatted_docstring_types.py +++ b/build_tools/get_formatted_docstring_types.py @@ -18,24 +18,14 @@ object_split = object_input.split(".") module = "sklearn." + ".".join(object_split[:-1]) -instance_str = object_split[-1] -instance = getattr(import_module(module), instance_str) - +obj_str = object_split[-1] +obj = getattr(import_module(module), obj_str) print("Parameters") print("----------") -if inspect.isclass(instance): - formatted_annotations = get_annotations(instance.__init__) -else: - formatted_annotations = get_annotations(instance) +if inspect.isclass(obj): + formatted_annotations = get_annotations(obj.__init__) +else: # function + formatted_annotations = get_annotations(obj) for name, annotation in formatted_annotations.items(): print(f"{name} : {annotation}") - - -if inspect.isclass(instance): - print() - print("Attributes") - print("----------") - formatted_annotations = get_annotations(instance) - for name, annotation in formatted_annotations.items(): - print(f"{name} : {annotation}") diff --git a/setup.cfg b/setup.cfg index 6a91561d47726..1a09d8872e9b1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,5 +27,4 @@ artifact_indexes= [flake8] # Default flake8 3.5 ignored flags -# F821 is added when using Annotated for python typing -ignore=E121,E123,E126,E226,E24,E704,W503,W504,F821 +ignore=E121,E123,E126,E226,E24,E704,W503,W504 diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 7ef3d851fa109..dc46a5386ad6b 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -20,8 +20,6 @@ from ..utils._typing import Literal from ..utils._typing import RandomState -from ..utils._typing import Annotated -from ..utils._typing import Shape from typing import Union from typing import Optional from ._base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator @@ -1256,12 +1254,6 @@ class LogisticRegression(LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ - classes_: Annotated[np.ndarray, Shape(("n_classes",))] - coef_: Annotated[np.ndarray, Shape((1, "n_features"), - ("n_classes", "n_features"))] - intercept_: Annotated[np.ndarray, Shape((1,), ("n_classes",))] - n_iter_: Annotated[np.ndarray, Shape(("n_classes",), (1,))] - @_deprecate_positional_args def __init__(self, penalty: Literal['l1', 'l2', 'elasticnet', 'none'] = 'l2', diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 01e25262fda47..30e386552db39 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -2,75 +2,47 @@ from typing import Union from typing import Any -from typing import TypeVar from typing_extensions import Literal # noqa -from typing_extensions import Annotated # noqa import numpy as np RandomState = Union[int, np.random.RandomState, None] -ArrayLike = TypeVar('ArrayLike') -class Shape: - def __init__(self, *shapes): - if any(not isinstance(s, tuple) for s in shapes): - raise ValueError("All shapes must be tuple") - self.shapes = shapes - - def _join(self, shape): - if len(shape) == 1: - return f"({shape[0]},)" - inner = ', '.join(str(s) for s in shape) - return f"({inner})" - - def __repr__(self): - output = ' or '.join(self._join(shape) for shape in self.shapes) - return f"of shape {output}" - - -def get_annotation_class_name(annotation) -> str: +def _get_annotation_class_name(annotation): + """Get class name for annnotation""" if annotation is None: return 'None' elif annotation is Any: return 'Any' - elif hasattr(annotation, '__metadata__'): - return 'Annotated' if getattr(annotation, '__qualname__', None): return annotation.__qualname__ elif getattr(annotation, '_name', None): - # generic for >= 3.7 + # generic for >= 3.7 return annotation._name origin = getattr(annotation, '__origin__', None) if origin: - return get_annotation_class_name(annotation.__origin__) + return _get_annotation_class_name(annotation.__origin__) - if inspect.isclass(annotation): - annotation = annotation.__class__ - return annotation.__qualname__.lstrip('_') + # generic for < 3.7 (Literal) + return annotation.__class__.__qualname__.lstrip('_') -def format_annotation(annotation): - """Convert annotation to docstring""" - class_name = get_annotation_class_name(annotation) +def _format_annotation(annotation): + """Convert annotation to docstring.""" + class_name = _get_annotation_class_name(annotation) if class_name == 'BaseEstimator': return 'estimator instance' - elif class_name == 'ArrayLike': - return 'array-like' elif class_name == 'NoneType': return 'None' elif class_name == 'RandomState': return 'RandomState instance' - elif class_name == 'Annotated': - inner_annotation = format_annotation(annotation.__origin__) - args = ', '.join(repr(t) for t in annotation.__metadata__) - return f'{inner_annotation} {args}' elif class_name == 'Union': - values = [format_annotation(t) for t in annotation.__args__] + values = [_format_annotation(t) for t in annotation.__args__] if len(values) == 2: return ' or '.join(values) # greater than 2 @@ -80,26 +52,38 @@ def format_annotation(annotation): values = ', '.join(repr(t) for t in annotation.__args__) return f'{{{values}}}' elif class_name in ('list', 'List'): - values = ', '.join(format_annotation(t) + values = ', '.join(_format_annotation(t) for t in annotation.__args__) return f'list of {values}' return class_name -def get_annotations(instance): - if not hasattr(instance, '__annotations__'): - raise ValueError(f"{instance} does not have annotations") +def get_annotations(obj): + """Get human readable docstring for types for a function or an estimator + with annotations. + + Parameters + ---------- + obj: callable or estimator class + + Returns + ------- + output: dict + dictionary mapping from name to human-readable docstring. + """ + if not hasattr(obj, '__annotations__'): + raise ValueError(f"{obj} does not have annotations") - annotations = instance.__annotations__ + annotations = obj.__annotations__ # get defaults - params = inspect.signature(instance).parameters + params = inspect.signature(obj).parameters defaults = {p: v.default for p, v in params.items() if v.default != inspect.Parameter.empty} output = {} for name, annotation in annotations.items(): - anno = format_annotation(annotation) + anno = _format_annotation(annotation) if name in defaults: anno += f", default={repr(defaults[name])}" output[name] = anno diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py new file mode 100644 index 0000000000000..1d72c98c62b3e --- /dev/null +++ b/sklearn/utils/tests/test_typing.py @@ -0,0 +1,38 @@ +from typing import Dict +from typing import Any +from typing import List +from typing import Union +from typing import Callable + +import pytest + +from sklearn.base import BaseEstimator +from sklearn.utils._typing import Literal +from sklearn.utils._typing import _get_annotation_class_name +# from sklearn.utils._typing import _format_annotation + + +@pytest.mark.parametrize("annotation, expected_class", [ + (None, 'None'), + (Any, 'Any'), + (str, 'str'), + (int, 'int'), + (float, 'float'), + (list, 'list'), + (BaseEstimator, 'BaseEstimator'), + (List[int], 'List'), + (Union[int, float], 'Union'), + (Dict, 'Dict'), + (Literal['a', 'b'], 'Literal'), + (Callable, 'Callable'), + (Callable[[str], str], 'Callable'), +]) +def test_get_annotation_class_name(annotation, expected_class): + assert _get_annotation_class_name(annotation) == expected_class + + +# @pytest.mark.parametrize("annotation", "expected_str", [ +# (None, 'None') +# ]) +# def test_format_annotation(annotation, expected_str): +# pass From 751a57830df221d4d3ed09d0137e5698d52a5657 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 4 Aug 2020 13:24:06 -0400 Subject: [PATCH 14/38] ENH Adds tests for typing --- sklearn/linear_model/_logistic.py | 5 +++ sklearn/tests/test_common.py | 29 ++++++++++------- sklearn/utils/_typing.py | 9 +++--- sklearn/utils/tests/test_typing.py | 50 +++++++++++++++++++++++++----- 4 files changed, 70 insertions(+), 23 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index dc46a5386ad6b..44b47c178c295 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1254,6 +1254,11 @@ class LogisticRegression(LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ + classes_: np.ndarray + coef_: np.ndarray + intercept_: np.ndarray + n_iter_: np.ndarray + @_deprecate_positional_args def __init__(self, penalty: Literal['l1', 'l2', 'elasticnet', 'none'] = 'l2', diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index ef81d0cf02414..e55378398c435 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -284,19 +284,26 @@ def test_estimators_typestring(name, Estimator): doc = docscrape.ClassDoc(Estimator) parameters = doc['Parameters'] parameter_annnotations = get_annotations(Estimator.__init__) - - for parameter in parameters: - name, type_str = parameter.name, parameter.type - # whitespaces are collapsed to one whitespace - type_str = ' '.join(parameter.type.split()) - assert parameter_annnotations[parameter.name] == type_str, ( - f"{name} has incorrectly formated docstring") + _check_annotations(parameters, parameter_annnotations) attributes = doc['Attributes'] attribute_annotations = get_annotations(Estimator) - for attribute in attributes: - name, type_str = attribute.name, attribute.type + _check_annotations(attributes, attribute_annotations) + + +def _check_annotations(docstring_items, expected_annotations): + + assert len(docstring_items) == len(expected_annotations) + + for item in docstring_items: + name, type_str = item.name, item.type + + # skip annotations with "shape of" for now, this can be added when + # we support Annotated + if "of shape" in type_str: + continue + # whitespaces are collapsed to one whitespace - type_str = ' '.join(attribute.type.split()) - assert attribute_annotations[name] == type_str, ( + type_str = ' '.join(item.type.split()) + assert expected_annotations[name] == type_str, ( f"{name} has incorrectly formated docstring") diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 30e386552db39..24d6a0c52aafa 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -51,7 +51,7 @@ def _format_annotation(annotation): elif class_name == 'Literal': values = ', '.join(repr(t) for t in annotation.__args__) return f'{{{values}}}' - elif class_name in ('list', 'List'): + elif class_name == 'List': values = ', '.join(_format_annotation(t) for t in annotation.__args__) return f'list of {values}' @@ -60,12 +60,11 @@ def _format_annotation(annotation): def get_annotations(obj): - """Get human readable docstring for types for a function or an estimator - with annotations. + """Get human readable docstring for types for a obj with annotations. Parameters ---------- - obj: callable or estimator class + obj: object Returns ------- @@ -73,7 +72,7 @@ def get_annotations(obj): dictionary mapping from name to human-readable docstring. """ if not hasattr(obj, '__annotations__'): - raise ValueError(f"{obj} does not have annotations") + return {} annotations = obj.__annotations__ # get defaults diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py index 1d72c98c62b3e..6de32346239bd 100644 --- a/sklearn/utils/tests/test_typing.py +++ b/sklearn/utils/tests/test_typing.py @@ -3,13 +3,17 @@ from typing import List from typing import Union from typing import Callable +from typing import Optional +from typing_extensions import Literal import pytest +import numpy as np from sklearn.base import BaseEstimator -from sklearn.utils._typing import Literal +from sklearn.utils._typing import RandomState from sklearn.utils._typing import _get_annotation_class_name -# from sklearn.utils._typing import _format_annotation +from sklearn.utils._typing import _format_annotation +from sklearn.utils._typing import get_annotations @pytest.mark.parametrize("annotation, expected_class", [ @@ -31,8 +35,40 @@ def test_get_annotation_class_name(annotation, expected_class): assert _get_annotation_class_name(annotation) == expected_class -# @pytest.mark.parametrize("annotation", "expected_str", [ -# (None, 'None') -# ]) -# def test_format_annotation(annotation, expected_str): -# pass +@pytest.mark.parametrize("annotation, expected_str", [ + (None, 'None'), + (BaseEstimator, 'estimator instance'), + (np.random.RandomState, 'RandomState instance'), + (int, 'int'), + (float, 'float'), + (list, 'list'), + (str, 'str'), + (List[int], 'list of int'), + (Optional[List[int]], 'list of int or None'), + (List[BaseEstimator], 'list of estimator instance'), + (Optional[BaseEstimator], 'estimator instance or None'), + (Union[int, float], 'int or float'), + (Literal['cat', 'dog'], '{\'cat\', \'dog\'}'), + (RandomState, 'int, RandomState instance or None') +]) +def test_format_annotation(annotation, expected_str): + assert _format_annotation(annotation) == expected_str + + +class TestObject: + def __init__(self, + estimator: BaseEstimator, + num: int = 10, union_num: Union[int, float] = 1.4, + pet: Literal['cat', 'dog'] = 'dog', + random_state: RandomState = None): + pass + + +def test_get_annotations(): + annotations = get_annotations(TestObject.__init__) + assert annotations['estimator'] == 'estimator instance' + assert annotations['num'] == 'int, default=10' + assert annotations['union_num'] == 'int or float, default=1.4' + assert annotations['pet'] == '{\'cat\', \'dog\'}, default=\'dog\'' + assert annotations['random_state'] == ('int, RandomState instance or None' + ', default=None') From ddb421ac42df2a1f400b375c642dedceab5f8ae3 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 11 Aug 2020 12:06:37 -0400 Subject: [PATCH 15/38] CLN Imports from typing extensions --- sklearn/linear_model/_logistic.py | 6 +++--- sklearn/utils/_typing.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 67c846c191905..a7a6a26b340b2 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -12,16 +12,16 @@ import numbers import warnings +from typing import Union +from typing import Optional +from typing_extensions import Literal import numpy as np from scipy import optimize, sparse from scipy.special import expit, logsumexp from joblib import Parallel, delayed, effective_n_jobs -from ..utils._typing import Literal from ..utils._typing import RandomState -from typing import Union -from typing import Optional from ._base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator from ._sag import sag_solver from ..preprocessing import LabelEncoder, LabelBinarizer diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 24d6a0c52aafa..725c5efbe2dbc 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -2,7 +2,6 @@ from typing import Union from typing import Any -from typing_extensions import Literal # noqa import numpy as np From 5999a87695cff7f10c8ad533429dd68539679a2a Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 11 Aug 2020 15:14:51 -0400 Subject: [PATCH 16/38] REV --- sklearn/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sklearn/base.py b/sklearn/base.py index d451d9e3f387a..46398baabfd3a 100644 --- a/sklearn/base.py +++ b/sklearn/base.py @@ -20,7 +20,6 @@ from .utils._estimator_html_repr import estimator_html_repr from .utils.validation import _deprecate_positional_args - _DEFAULT_TAGS = { 'non_deterministic': False, 'requires_positive_X': False, From 0cff327ee07b1bed8114f3aca972d5bb9acb3cae Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 11 Aug 2020 15:30:44 -0400 Subject: [PATCH 17/38] MNT Moves tests --- build_tools/get_formatted_docstring_types.py | 6 +- sklearn/linear_model/_logistic.py | 2 +- sklearn/tests/test_common.py | 103 ------------------- sklearn/tests/test_docstring_parameters.py | 103 +++++++++++++++++++ sklearn/utils/_typing.py | 18 ++-- sklearn/utils/tests/test_typing.py | 16 +-- 6 files changed, 128 insertions(+), 120 deletions(-) diff --git a/build_tools/get_formatted_docstring_types.py b/build_tools/get_formatted_docstring_types.py index 64146299e5ab6..03320a926fb33 100644 --- a/build_tools/get_formatted_docstring_types.py +++ b/build_tools/get_formatted_docstring_types.py @@ -4,7 +4,7 @@ import inspect -from sklearn.utils._typing import get_annotations +from sklearn.utils._typing import get_docstring_annotations parser = argparse.ArgumentParser( @@ -24,8 +24,8 @@ print("Parameters") print("----------") if inspect.isclass(obj): - formatted_annotations = get_annotations(obj.__init__) + formatted_annotations = get_docstring_annotations(obj.__init__) else: # function - formatted_annotations = get_annotations(obj) + formatted_annotations = get_docstring_annotations(obj) for name, annotation in formatted_annotations.items(): print(f"{name} : {annotation}") diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index a7a6a26b340b2..c90a9d555ed7f 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1055,7 +1055,7 @@ class LogisticRegression(LinearClassifierMixin, l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. - tol : float, default=0.0001 + tol : float, default=1e-4 Tolerance for stopping criteria. C : float, default=1.0 diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py index c76586592dd1b..c41bdb1116a6c 100644 --- a/sklearn/tests/test_common.py +++ b/sklearn/tests/test_common.py @@ -32,7 +32,6 @@ from sklearn.svm import NuSVC from sklearn.utils import IS_PYPY from sklearn.utils._testing import SkipTest -from sklearn.utils._typing import get_annotations from sklearn.utils.estimator_checks import ( _construct_instance, _set_checking_parameters, @@ -210,108 +209,6 @@ def test_class_support_removed(): parametrize_with_checks([LogisticRegression]) -TYPING_IGNORED = { - 'ARDRegression', 'AdaBoostClassifier', 'AdaBoostRegressor', - 'AdditiveChi2Sampler', 'AffinityPropagation', - 'AgglomerativeClustering', 'BaggingClassifier', 'BaggingRegressor', - 'BayesianGaussianMixture', 'BayesianRidge', 'BernoulliNB', - 'BernoulliRBM', 'Binarizer', 'Birch', 'CCA', 'CalibratedClassifierCV', - 'CategoricalNB', 'ClassifierChain', 'ColumnTransformer', - 'ComplementNB', 'CountVectorizer', 'DBSCAN', 'DecisionTreeClassifier', - 'DecisionTreeRegressor', 'DictVectorizer', 'DictionaryLearning', - 'DummyClassifier', 'DummyRegressor', 'ElasticNet', 'ElasticNetCV', - 'EllipticEnvelope', 'EmpiricalCovariance', 'ExtraTreeClassifier', - 'ExtraTreeRegressor', 'ExtraTreesClassifier', 'ExtraTreesRegressor', - 'FactorAnalysis', 'FastICA', 'FeatureAgglomeration', 'FeatureHasher', - 'FeatureUnion', 'FunctionTransformer', 'GammaRegressor', - 'GaussianMixture', 'GaussianNB', 'GaussianProcessClassifier', - 'GaussianProcessRegressor', 'GaussianRandomProjection', - 'GenericUnivariateSelect', 'GradientBoostingClassifier', - 'GradientBoostingRegressor', 'GraphicalLasso', 'GraphicalLassoCV', - 'GridSearchCV', 'HashingVectorizer', 'HistGradientBoostingClassifier', - 'HistGradientBoostingRegressor', 'HuberRegressor', 'IncrementalPCA', - 'IsolationForest', 'Isomap', 'IsotonicRegression', 'IterativeImputer', - 'KBinsDiscretizer', 'KMeans', 'KNNImputer', 'KNeighborsClassifier', - 'KNeighborsRegressor', 'KNeighborsTransformer', 'KernelCenterer', - 'KernelDensity', 'KernelPCA', 'KernelRidge', 'LabelBinarizer', - 'LabelEncoder', 'LabelPropagation', 'LabelSpreading', 'Lars', 'LarsCV', - 'Lasso', 'LassoCV', 'LassoLars', 'LassoLarsCV', 'LassoLarsIC', - 'LatentDirichletAllocation', 'LedoitWolf', - 'LinearDiscriminantAnalysis', 'LinearRegression', 'LinearSVC', - 'LinearSVR', 'LocalOutlierFactor', 'LocallyLinearEmbedding', - 'LogisticRegressionCV', 'MDS', 'MLPClassifier', - 'MLPRegressor', 'MaxAbsScaler', 'MeanShift', 'MinCovDet', - 'MinMaxScaler', 'MiniBatchDictionaryLearning', 'MiniBatchKMeans', - 'MiniBatchSparsePCA', 'MissingIndicator', 'MultiLabelBinarizer', - 'MultiOutputClassifier', 'MultiOutputRegressor', 'MultiTaskElasticNet', - 'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV', - 'MultinomialNB', 'NMF', 'NearestCentroid', 'NearestNeighbors', - 'NeighborhoodComponentsAnalysis', 'Normalizer', 'NuSVC', 'NuSVR', - 'Nystroem', 'OAS', 'OPTICS', 'OneClassSVM', 'OneHotEncoder', - 'OneVsOneClassifier', 'OneVsRestClassifier', 'OrdinalEncoder', - 'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV', - 'OutputCodeClassifier', 'PCA', 'PLSCanonical', 'PLSRegression', - 'PLSSVD', 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', - 'PatchExtractor', 'Perceptron', 'Pipeline', 'PoissonRegressor', - 'PolynomialFeatures', 'PowerTransformer', - 'QuadraticDiscriminantAnalysis', 'QuantileTransformer', - 'RANSACRegressor', 'RBFSampler', 'RFE', 'RFECV', - 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', - 'RadiusNeighborsTransformer', 'RandomForestClassifier', - 'RandomForestRegressor', 'RandomTreesEmbedding', 'RandomizedSearchCV', - 'RegressorChain', 'Ridge', 'RidgeCV', 'RidgeClassifier', - 'RidgeClassifierCV', 'RobustScaler', 'SGDClassifier', 'SGDRegressor', - 'SVC', 'SVR', 'SelectFdr', 'SelectFpr', 'SelectFromModel', 'SelectFwe', - 'SelectKBest', 'SelectPercentile', 'ShrunkCovariance', 'SimpleImputer', - 'SkewedChi2Sampler', 'SparseCoder', 'SparsePCA', - 'SparseRandomProjection', 'SpectralBiclustering', 'SpectralClustering', - 'SpectralCoclustering', 'SpectralEmbedding', 'StackingClassifier', - 'StackingRegressor', 'StandardScaler', 'TSNE', 'TfidfTransformer', - 'TfidfVectorizer', 'TheilSenRegressor', 'TransformedTargetRegressor', - 'TruncatedSVD', 'TweedieRegressor', 'VarianceThreshold', - 'VotingClassifier', 'VotingRegressor' -} - - -@pytest.mark.parametrize( - 'name, Estimator', [ - pytest.param( - name, Estimator, marks=pytest.mark.skipif( - name in TYPING_IGNORED, - reason="Estimator does not have annotations")) - for name, Estimator in all_estimators()]) -def test_estimators_typestring(name, Estimator): - # Check that docstring's type is formated correctly - docscrape = pytest.importorskip('numpydoc.docscrape') - - doc = docscrape.ClassDoc(Estimator) - parameters = doc['Parameters'] - parameter_annnotations = get_annotations(Estimator.__init__) - _check_annotations(parameters, parameter_annnotations) - - attributes = doc['Attributes'] - attribute_annotations = get_annotations(Estimator) - _check_annotations(attributes, attribute_annotations) - - -def _check_annotations(docstring_items, expected_annotations): - - assert len(docstring_items) == len(expected_annotations) - - for item in docstring_items: - name, type_str = item.name, item.type - - # skip annotations with "shape of" for now, this can be added when - # we support Annotated - if "of shape" in type_str: - continue - - # whitespaces are collapsed to one whitespace - type_str = ' '.join(item.type.split()) - assert expected_annotations[name] == type_str, ( - f"{name} has incorrectly formated docstring") - - class MyNMFWithBadErrorMessage(NMF): # Same as NMF but raises an uninformative error message if X has negative # value. This estimator would fail the check suite in strict mode, diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index a48af83b15a7a..392ce70088e98 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -23,6 +23,7 @@ from sklearn.utils.deprecation import _is_deprecated from sklearn.externals._pep562 import Pep562 from sklearn.datasets import make_classification +from sklearn.utils._typing import get_docstring_annotations import pytest @@ -248,3 +249,105 @@ def test_fit_docstring_attributes(name, Estimator): undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes) assert not undocumented_attrs,\ "Undocumented attributes: {}".format(undocumented_attrs) + + +TYPING_IGNORED = { + 'ARDRegression', 'AdaBoostClassifier', 'AdaBoostRegressor', + 'AdditiveChi2Sampler', 'AffinityPropagation', + 'AgglomerativeClustering', 'BaggingClassifier', 'BaggingRegressor', + 'BayesianGaussianMixture', 'BayesianRidge', 'BernoulliNB', + 'BernoulliRBM', 'Binarizer', 'Birch', 'CCA', 'CalibratedClassifierCV', + 'CategoricalNB', 'ClassifierChain', 'ColumnTransformer', + 'ComplementNB', 'CountVectorizer', 'DBSCAN', 'DecisionTreeClassifier', + 'DecisionTreeRegressor', 'DictVectorizer', 'DictionaryLearning', + 'DummyClassifier', 'DummyRegressor', 'ElasticNet', 'ElasticNetCV', + 'EllipticEnvelope', 'EmpiricalCovariance', 'ExtraTreeClassifier', + 'ExtraTreeRegressor', 'ExtraTreesClassifier', 'ExtraTreesRegressor', + 'FactorAnalysis', 'FastICA', 'FeatureAgglomeration', 'FeatureHasher', + 'FeatureUnion', 'FunctionTransformer', 'GammaRegressor', + 'GaussianMixture', 'GaussianNB', 'GaussianProcessClassifier', + 'GaussianProcessRegressor', 'GaussianRandomProjection', + 'GenericUnivariateSelect', 'GradientBoostingClassifier', + 'GradientBoostingRegressor', 'GraphicalLasso', 'GraphicalLassoCV', + 'GridSearchCV', 'HashingVectorizer', 'HistGradientBoostingClassifier', + 'HistGradientBoostingRegressor', 'HuberRegressor', 'IncrementalPCA', + 'IsolationForest', 'Isomap', 'IsotonicRegression', 'IterativeImputer', + 'KBinsDiscretizer', 'KMeans', 'KNNImputer', 'KNeighborsClassifier', + 'KNeighborsRegressor', 'KNeighborsTransformer', 'KernelCenterer', + 'KernelDensity', 'KernelPCA', 'KernelRidge', 'LabelBinarizer', + 'LabelEncoder', 'LabelPropagation', 'LabelSpreading', 'Lars', 'LarsCV', + 'Lasso', 'LassoCV', 'LassoLars', 'LassoLarsCV', 'LassoLarsIC', + 'LatentDirichletAllocation', 'LedoitWolf', + 'LinearDiscriminantAnalysis', 'LinearRegression', 'LinearSVC', + 'LinearSVR', 'LocalOutlierFactor', 'LocallyLinearEmbedding', + 'LogisticRegressionCV', 'MDS', 'MLPClassifier', + 'MLPRegressor', 'MaxAbsScaler', 'MeanShift', 'MinCovDet', + 'MinMaxScaler', 'MiniBatchDictionaryLearning', 'MiniBatchKMeans', + 'MiniBatchSparsePCA', 'MissingIndicator', 'MultiLabelBinarizer', + 'MultiOutputClassifier', 'MultiOutputRegressor', 'MultiTaskElasticNet', + 'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV', + 'MultinomialNB', 'NMF', 'NearestCentroid', 'NearestNeighbors', + 'NeighborhoodComponentsAnalysis', 'Normalizer', 'NuSVC', 'NuSVR', + 'Nystroem', 'OAS', 'OPTICS', 'OneClassSVM', 'OneHotEncoder', + 'OneVsOneClassifier', 'OneVsRestClassifier', 'OrdinalEncoder', + 'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV', + 'OutputCodeClassifier', 'PCA', 'PLSCanonical', 'PLSRegression', + 'PLSSVD', 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', + 'PatchExtractor', 'Perceptron', 'Pipeline', 'PoissonRegressor', + 'PolynomialFeatures', 'PowerTransformer', + 'QuadraticDiscriminantAnalysis', 'QuantileTransformer', + 'RANSACRegressor', 'RBFSampler', 'RFE', 'RFECV', + 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', + 'RadiusNeighborsTransformer', 'RandomForestClassifier', + 'RandomForestRegressor', 'RandomTreesEmbedding', 'RandomizedSearchCV', + 'RegressorChain', 'Ridge', 'RidgeCV', 'RidgeClassifier', + 'RidgeClassifierCV', 'RobustScaler', 'SGDClassifier', 'SGDRegressor', + 'SVC', 'SVR', 'SelectFdr', 'SelectFpr', 'SelectFromModel', 'SelectFwe', + 'SelectKBest', 'SelectPercentile', 'ShrunkCovariance', 'SimpleImputer', + 'SkewedChi2Sampler', 'SparseCoder', 'SparsePCA', + 'SparseRandomProjection', 'SpectralBiclustering', 'SpectralClustering', + 'SpectralCoclustering', 'SpectralEmbedding', 'StackingClassifier', + 'StackingRegressor', 'StandardScaler', 'TSNE', 'TfidfTransformer', + 'TfidfVectorizer', 'TheilSenRegressor', 'TransformedTargetRegressor', + 'TruncatedSVD', 'TweedieRegressor', 'VarianceThreshold', + 'VotingClassifier', 'VotingRegressor' +} + + +@pytest.mark.parametrize( + 'name, Estimator', [ + pytest.param( + name, Estimator, marks=pytest.mark.skipif( + name in TYPING_IGNORED, + reason="Estimator does not have annotations")) + for name, Estimator in all_estimators()]) +def test_estimators_typestring(name, Estimator): + # Check that docstring's type is formated correctly + docscrape = pytest.importorskip('numpydoc.docscrape') + + doc = docscrape.ClassDoc(Estimator) + parameters = doc['Parameters'] + parameter_annnotations = get_docstring_annotations(Estimator.__init__) + _check_annotations(parameters, parameter_annnotations) + + attributes = doc['Attributes'] + attribute_annotations = get_docstring_annotations(Estimator) + _check_annotations(attributes, attribute_annotations) + + +def _check_annotations(docstring_items, expected_annotations): + + assert len(docstring_items) == len(expected_annotations) + + for item in docstring_items: + name, type_str = item.name, item.type + + # skip annotations with "shape of" for now, this can be added when + # we support Annotated + if "of shape" in type_str: + continue + + # whitespaces are collapsed to one whitespace + type_str = ' '.join(item.type.split()) + assert type_str.startswith(expected_annotations[name]), ( + f"{name} has incorrectly formated docstring") diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 725c5efbe2dbc..d291eeb9d0805 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -1,4 +1,5 @@ import inspect +import numbers from typing import Union from typing import Any @@ -30,7 +31,7 @@ def _get_annotation_class_name(annotation): return annotation.__class__.__qualname__.lstrip('_') -def _format_annotation(annotation): +def _format_docstring_annotation(annotation): """Convert annotation to docstring.""" class_name = _get_annotation_class_name(annotation) @@ -41,7 +42,7 @@ def _format_annotation(annotation): elif class_name == 'RandomState': return 'RandomState instance' elif class_name == 'Union': - values = [_format_annotation(t) for t in annotation.__args__] + values = [_format_docstring_annotation(t) for t in annotation.__args__] if len(values) == 2: return ' or '.join(values) # greater than 2 @@ -51,14 +52,14 @@ def _format_annotation(annotation): values = ', '.join(repr(t) for t in annotation.__args__) return f'{{{values}}}' elif class_name == 'List': - values = ', '.join(_format_annotation(t) + values = ', '.join(_format_docstring_annotation(t) for t in annotation.__args__) return f'list of {values}' return class_name -def get_annotations(obj): +def get_docstring_annotations(obj): """Get human readable docstring for types for a obj with annotations. Parameters @@ -81,8 +82,13 @@ def get_annotations(obj): output = {} for name, annotation in annotations.items(): - anno = _format_annotation(annotation) + anno = _format_docstring_annotation(annotation) if name in defaults: - anno += f", default={repr(defaults[name])}" + default = defaults[name] + if (isinstance(default, numbers.Real) and + not isinstance(default, numbers.Integral)): + anno += ", default=" + else: + anno += f", default={repr(default)}" output[name] = anno return output diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py index 6de32346239bd..cce4baf4d615d 100644 --- a/sklearn/utils/tests/test_typing.py +++ b/sklearn/utils/tests/test_typing.py @@ -12,8 +12,8 @@ from sklearn.base import BaseEstimator from sklearn.utils._typing import RandomState from sklearn.utils._typing import _get_annotation_class_name -from sklearn.utils._typing import _format_annotation -from sklearn.utils._typing import get_annotations +from sklearn.utils._typing import _format_docstring_annotation +from sklearn.utils._typing import get_docstring_annotations @pytest.mark.parametrize("annotation, expected_class", [ @@ -51,24 +51,26 @@ def test_get_annotation_class_name(annotation, expected_class): (Literal['cat', 'dog'], '{\'cat\', \'dog\'}'), (RandomState, 'int, RandomState instance or None') ]) -def test_format_annotation(annotation, expected_str): - assert _format_annotation(annotation) == expected_str +def test_format_docstring_annotation(annotation, expected_str): + assert _format_docstring_annotation(annotation) == expected_str class TestObject: def __init__(self, estimator: BaseEstimator, num: int = 10, union_num: Union[int, float] = 1.4, + float_num: float = 1e-4, pet: Literal['cat', 'dog'] = 'dog', random_state: RandomState = None): pass -def test_get_annotations(): - annotations = get_annotations(TestObject.__init__) +def test_get_docstring_annotations(): + annotations = get_docstring_annotations(TestObject.__init__) assert annotations['estimator'] == 'estimator instance' assert annotations['num'] == 'int, default=10' - assert annotations['union_num'] == 'int or float, default=1.4' + assert annotations['float_num'] == 'float, default=' + assert annotations['union_num'] == 'int or float, default=' assert annotations['pet'] == '{\'cat\', \'dog\'}, default=\'dog\'' assert annotations['random_state'] == ('int, RandomState instance or None' ', default=None') From 7b6ad2c47fa3210abbc1c73cbdbb69cd2b678841 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 11 Aug 2020 15:36:59 -0400 Subject: [PATCH 18/38] MNT Move test to own file --- sklearn/tests/test_docstring_parameters.py | 103 -------------------- sklearn/tests/test_docstring_types.py | 106 +++++++++++++++++++++ 2 files changed, 106 insertions(+), 103 deletions(-) create mode 100644 sklearn/tests/test_docstring_types.py diff --git a/sklearn/tests/test_docstring_parameters.py b/sklearn/tests/test_docstring_parameters.py index 392ce70088e98..a48af83b15a7a 100644 --- a/sklearn/tests/test_docstring_parameters.py +++ b/sklearn/tests/test_docstring_parameters.py @@ -23,7 +23,6 @@ from sklearn.utils.deprecation import _is_deprecated from sklearn.externals._pep562 import Pep562 from sklearn.datasets import make_classification -from sklearn.utils._typing import get_docstring_annotations import pytest @@ -249,105 +248,3 @@ def test_fit_docstring_attributes(name, Estimator): undocumented_attrs = set(undocumented_attrs).difference(skipped_attributes) assert not undocumented_attrs,\ "Undocumented attributes: {}".format(undocumented_attrs) - - -TYPING_IGNORED = { - 'ARDRegression', 'AdaBoostClassifier', 'AdaBoostRegressor', - 'AdditiveChi2Sampler', 'AffinityPropagation', - 'AgglomerativeClustering', 'BaggingClassifier', 'BaggingRegressor', - 'BayesianGaussianMixture', 'BayesianRidge', 'BernoulliNB', - 'BernoulliRBM', 'Binarizer', 'Birch', 'CCA', 'CalibratedClassifierCV', - 'CategoricalNB', 'ClassifierChain', 'ColumnTransformer', - 'ComplementNB', 'CountVectorizer', 'DBSCAN', 'DecisionTreeClassifier', - 'DecisionTreeRegressor', 'DictVectorizer', 'DictionaryLearning', - 'DummyClassifier', 'DummyRegressor', 'ElasticNet', 'ElasticNetCV', - 'EllipticEnvelope', 'EmpiricalCovariance', 'ExtraTreeClassifier', - 'ExtraTreeRegressor', 'ExtraTreesClassifier', 'ExtraTreesRegressor', - 'FactorAnalysis', 'FastICA', 'FeatureAgglomeration', 'FeatureHasher', - 'FeatureUnion', 'FunctionTransformer', 'GammaRegressor', - 'GaussianMixture', 'GaussianNB', 'GaussianProcessClassifier', - 'GaussianProcessRegressor', 'GaussianRandomProjection', - 'GenericUnivariateSelect', 'GradientBoostingClassifier', - 'GradientBoostingRegressor', 'GraphicalLasso', 'GraphicalLassoCV', - 'GridSearchCV', 'HashingVectorizer', 'HistGradientBoostingClassifier', - 'HistGradientBoostingRegressor', 'HuberRegressor', 'IncrementalPCA', - 'IsolationForest', 'Isomap', 'IsotonicRegression', 'IterativeImputer', - 'KBinsDiscretizer', 'KMeans', 'KNNImputer', 'KNeighborsClassifier', - 'KNeighborsRegressor', 'KNeighborsTransformer', 'KernelCenterer', - 'KernelDensity', 'KernelPCA', 'KernelRidge', 'LabelBinarizer', - 'LabelEncoder', 'LabelPropagation', 'LabelSpreading', 'Lars', 'LarsCV', - 'Lasso', 'LassoCV', 'LassoLars', 'LassoLarsCV', 'LassoLarsIC', - 'LatentDirichletAllocation', 'LedoitWolf', - 'LinearDiscriminantAnalysis', 'LinearRegression', 'LinearSVC', - 'LinearSVR', 'LocalOutlierFactor', 'LocallyLinearEmbedding', - 'LogisticRegressionCV', 'MDS', 'MLPClassifier', - 'MLPRegressor', 'MaxAbsScaler', 'MeanShift', 'MinCovDet', - 'MinMaxScaler', 'MiniBatchDictionaryLearning', 'MiniBatchKMeans', - 'MiniBatchSparsePCA', 'MissingIndicator', 'MultiLabelBinarizer', - 'MultiOutputClassifier', 'MultiOutputRegressor', 'MultiTaskElasticNet', - 'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV', - 'MultinomialNB', 'NMF', 'NearestCentroid', 'NearestNeighbors', - 'NeighborhoodComponentsAnalysis', 'Normalizer', 'NuSVC', 'NuSVR', - 'Nystroem', 'OAS', 'OPTICS', 'OneClassSVM', 'OneHotEncoder', - 'OneVsOneClassifier', 'OneVsRestClassifier', 'OrdinalEncoder', - 'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV', - 'OutputCodeClassifier', 'PCA', 'PLSCanonical', 'PLSRegression', - 'PLSSVD', 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', - 'PatchExtractor', 'Perceptron', 'Pipeline', 'PoissonRegressor', - 'PolynomialFeatures', 'PowerTransformer', - 'QuadraticDiscriminantAnalysis', 'QuantileTransformer', - 'RANSACRegressor', 'RBFSampler', 'RFE', 'RFECV', - 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', - 'RadiusNeighborsTransformer', 'RandomForestClassifier', - 'RandomForestRegressor', 'RandomTreesEmbedding', 'RandomizedSearchCV', - 'RegressorChain', 'Ridge', 'RidgeCV', 'RidgeClassifier', - 'RidgeClassifierCV', 'RobustScaler', 'SGDClassifier', 'SGDRegressor', - 'SVC', 'SVR', 'SelectFdr', 'SelectFpr', 'SelectFromModel', 'SelectFwe', - 'SelectKBest', 'SelectPercentile', 'ShrunkCovariance', 'SimpleImputer', - 'SkewedChi2Sampler', 'SparseCoder', 'SparsePCA', - 'SparseRandomProjection', 'SpectralBiclustering', 'SpectralClustering', - 'SpectralCoclustering', 'SpectralEmbedding', 'StackingClassifier', - 'StackingRegressor', 'StandardScaler', 'TSNE', 'TfidfTransformer', - 'TfidfVectorizer', 'TheilSenRegressor', 'TransformedTargetRegressor', - 'TruncatedSVD', 'TweedieRegressor', 'VarianceThreshold', - 'VotingClassifier', 'VotingRegressor' -} - - -@pytest.mark.parametrize( - 'name, Estimator', [ - pytest.param( - name, Estimator, marks=pytest.mark.skipif( - name in TYPING_IGNORED, - reason="Estimator does not have annotations")) - for name, Estimator in all_estimators()]) -def test_estimators_typestring(name, Estimator): - # Check that docstring's type is formated correctly - docscrape = pytest.importorskip('numpydoc.docscrape') - - doc = docscrape.ClassDoc(Estimator) - parameters = doc['Parameters'] - parameter_annnotations = get_docstring_annotations(Estimator.__init__) - _check_annotations(parameters, parameter_annnotations) - - attributes = doc['Attributes'] - attribute_annotations = get_docstring_annotations(Estimator) - _check_annotations(attributes, attribute_annotations) - - -def _check_annotations(docstring_items, expected_annotations): - - assert len(docstring_items) == len(expected_annotations) - - for item in docstring_items: - name, type_str = item.name, item.type - - # skip annotations with "shape of" for now, this can be added when - # we support Annotated - if "of shape" in type_str: - continue - - # whitespaces are collapsed to one whitespace - type_str = ' '.join(item.type.split()) - assert type_str.startswith(expected_annotations[name]), ( - f"{name} has incorrectly formated docstring") diff --git a/sklearn/tests/test_docstring_types.py b/sklearn/tests/test_docstring_types.py new file mode 100644 index 0000000000000..59821ff5b9b56 --- /dev/null +++ b/sklearn/tests/test_docstring_types.py @@ -0,0 +1,106 @@ +import pytest + +from sklearn.utils import all_estimators +from sklearn.utils._typing import get_docstring_annotations + + +TYPING_IGNORED = { + 'ARDRegression', 'AdaBoostClassifier', 'AdaBoostRegressor', + 'AdditiveChi2Sampler', 'AffinityPropagation', + 'AgglomerativeClustering', 'BaggingClassifier', 'BaggingRegressor', + 'BayesianGaussianMixture', 'BayesianRidge', 'BernoulliNB', + 'BernoulliRBM', 'Binarizer', 'Birch', 'CCA', 'CalibratedClassifierCV', + 'CategoricalNB', 'ClassifierChain', 'ColumnTransformer', + 'ComplementNB', 'CountVectorizer', 'DBSCAN', 'DecisionTreeClassifier', + 'DecisionTreeRegressor', 'DictVectorizer', 'DictionaryLearning', + 'DummyClassifier', 'DummyRegressor', 'ElasticNet', 'ElasticNetCV', + 'EllipticEnvelope', 'EmpiricalCovariance', 'ExtraTreeClassifier', + 'ExtraTreeRegressor', 'ExtraTreesClassifier', 'ExtraTreesRegressor', + 'FactorAnalysis', 'FastICA', 'FeatureAgglomeration', 'FeatureHasher', + 'FeatureUnion', 'FunctionTransformer', 'GammaRegressor', + 'GaussianMixture', 'GaussianNB', 'GaussianProcessClassifier', + 'GaussianProcessRegressor', 'GaussianRandomProjection', + 'GenericUnivariateSelect', 'GradientBoostingClassifier', + 'GradientBoostingRegressor', 'GraphicalLasso', 'GraphicalLassoCV', + 'GridSearchCV', 'HashingVectorizer', 'HistGradientBoostingClassifier', + 'HistGradientBoostingRegressor', 'HuberRegressor', 'IncrementalPCA', + 'IsolationForest', 'Isomap', 'IsotonicRegression', 'IterativeImputer', + 'KBinsDiscretizer', 'KMeans', 'KNNImputer', 'KNeighborsClassifier', + 'KNeighborsRegressor', 'KNeighborsTransformer', 'KernelCenterer', + 'KernelDensity', 'KernelPCA', 'KernelRidge', 'LabelBinarizer', + 'LabelEncoder', 'LabelPropagation', 'LabelSpreading', 'Lars', 'LarsCV', + 'Lasso', 'LassoCV', 'LassoLars', 'LassoLarsCV', 'LassoLarsIC', + 'LatentDirichletAllocation', 'LedoitWolf', + 'LinearDiscriminantAnalysis', 'LinearRegression', 'LinearSVC', + 'LinearSVR', 'LocalOutlierFactor', 'LocallyLinearEmbedding', + 'LogisticRegressionCV', 'MDS', 'MLPClassifier', + 'MLPRegressor', 'MaxAbsScaler', 'MeanShift', 'MinCovDet', + 'MinMaxScaler', 'MiniBatchDictionaryLearning', 'MiniBatchKMeans', + 'MiniBatchSparsePCA', 'MissingIndicator', 'MultiLabelBinarizer', + 'MultiOutputClassifier', 'MultiOutputRegressor', 'MultiTaskElasticNet', + 'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV', + 'MultinomialNB', 'NMF', 'NearestCentroid', 'NearestNeighbors', + 'NeighborhoodComponentsAnalysis', 'Normalizer', 'NuSVC', 'NuSVR', + 'Nystroem', 'OAS', 'OPTICS', 'OneClassSVM', 'OneHotEncoder', + 'OneVsOneClassifier', 'OneVsRestClassifier', 'OrdinalEncoder', + 'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV', + 'OutputCodeClassifier', 'PCA', 'PLSCanonical', 'PLSRegression', + 'PLSSVD', 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', + 'PatchExtractor', 'Perceptron', 'Pipeline', 'PoissonRegressor', + 'PolynomialFeatures', 'PowerTransformer', + 'QuadraticDiscriminantAnalysis', 'QuantileTransformer', + 'RANSACRegressor', 'RBFSampler', 'RFE', 'RFECV', + 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', + 'RadiusNeighborsTransformer', 'RandomForestClassifier', + 'RandomForestRegressor', 'RandomTreesEmbedding', 'RandomizedSearchCV', + 'RegressorChain', 'Ridge', 'RidgeCV', 'RidgeClassifier', + 'RidgeClassifierCV', 'RobustScaler', 'SGDClassifier', 'SGDRegressor', + 'SVC', 'SVR', 'SelectFdr', 'SelectFpr', 'SelectFromModel', 'SelectFwe', + 'SelectKBest', 'SelectPercentile', 'ShrunkCovariance', 'SimpleImputer', + 'SkewedChi2Sampler', 'SparseCoder', 'SparsePCA', + 'SparseRandomProjection', 'SpectralBiclustering', 'SpectralClustering', + 'SpectralCoclustering', 'SpectralEmbedding', 'StackingClassifier', + 'StackingRegressor', 'StandardScaler', 'TSNE', 'TfidfTransformer', + 'TfidfVectorizer', 'TheilSenRegressor', 'TransformedTargetRegressor', + 'TruncatedSVD', 'TweedieRegressor', 'VarianceThreshold', + 'VotingClassifier', 'VotingRegressor' +} + + +@pytest.mark.parametrize( + 'name, Estimator', [ + pytest.param( + name, Estimator, marks=pytest.mark.skipif( + name in TYPING_IGNORED, + reason="Estimator does not have annotations")) + for name, Estimator in all_estimators()]) +def test_estimators_typestring(name, Estimator): + # Check that docstring's type is formated correctly + docscrape = pytest.importorskip('numpydoc.docscrape') + + doc = docscrape.ClassDoc(Estimator) + parameters = doc['Parameters'] + parameter_annnotations = get_docstring_annotations(Estimator.__init__) + _check_annotations(parameters, parameter_annnotations) + + attributes = doc['Attributes'] + attribute_annotations = get_docstring_annotations(Estimator) + _check_annotations(attributes, attribute_annotations) + + +def _check_annotations(docstring_items, expected_annotations): + + assert len(docstring_items) == len(expected_annotations) + + for item in docstring_items: + name, type_str = item.name, item.type + + # skip annotations with "shape of" for now, this can be added when + # we support Annotated + if "of shape" in type_str: + continue + + # whitespaces are collapsed to one whitespace + type_str = ' '.join(item.type.split()) + assert type_str.startswith(expected_annotations[name]), ( + f"{name} has incorrectly formated docstring") From e4ea4d8ce6161c78e556b247ac624e7bb306c191 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 11 Aug 2020 15:54:08 -0400 Subject: [PATCH 19/38] DOC Adds comment --- sklearn/utils/_typing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index d291eeb9d0805..8f0d9a0410c05 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -87,6 +87,8 @@ def get_docstring_annotations(obj): default = defaults[name] if (isinstance(default, numbers.Real) and not isinstance(default, numbers.Integral)): + # For floats the representation can vary, i.e: + # default=np.inf or default=1e-4 anno += ", default=" else: anno += f", default={repr(default)}" From cadb711eaf293a5edb78749aa04766cd327fad52 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 11 Aug 2020 15:59:04 -0400 Subject: [PATCH 20/38] ENH Single literal --- sklearn/linear_model/_logistic.py | 2 +- sklearn/utils/_typing.py | 5 ++++- sklearn/utils/tests/test_typing.py | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c90a9d555ed7f..019b53be0b91e 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1080,7 +1080,7 @@ class LogisticRegression(LinearClassifierMixin, To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) intercept_scaling has to be increased. - class_weight : dict or {'balanced'}, default=None + class_weight : dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 8f0d9a0410c05..fef0ab47e101d 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -49,7 +49,10 @@ def _format_docstring_annotation(annotation): first = ', '.join(values[:-1]) return f'{first} or {values[-1]}' elif class_name == 'Literal': - values = ', '.join(repr(t) for t in annotation.__args__) + items = [repr(t) for t in annotation.__args__] + if len(items) == 1: + return items[0] + values = ', '.join(items) return f'{{{values}}}' elif class_name == 'List': values = ', '.join(_format_docstring_annotation(t) diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py index cce4baf4d615d..f38ea0b1f708a 100644 --- a/sklearn/utils/tests/test_typing.py +++ b/sklearn/utils/tests/test_typing.py @@ -49,6 +49,7 @@ def test_get_annotation_class_name(annotation, expected_class): (Optional[BaseEstimator], 'estimator instance or None'), (Union[int, float], 'int or float'), (Literal['cat', 'dog'], '{\'cat\', \'dog\'}'), + (Literal['cat'], '\'cat\''), (RandomState, 'int, RandomState instance or None') ]) def test_format_docstring_annotation(annotation, expected_str): From e790ad8343f53849e86178adbc7e363f3c49520e Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 Aug 2020 17:32:32 -0400 Subject: [PATCH 21/38] BUG Fixes bug in python 3.6 --- sklearn/utils/_typing.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index fef0ab47e101d..20e56b135f494 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -45,11 +45,15 @@ def _format_docstring_annotation(annotation): values = [_format_docstring_annotation(t) for t in annotation.__args__] if len(values) == 2: return ' or '.join(values) - # greater than 2 first = ', '.join(values[:-1]) return f'{first} or {values[-1]}' elif class_name == 'Literal': - items = [repr(t) for t in annotation.__args__] + if hasattr(annotation, '__values__'): + # For Python == 3.6 support + args = annotation.__values__ + else: + args = annotation.__args__ + items = [repr(t) for t in args] if len(items) == 1: return items[0] values = ', '.join(items) From 74c58e1e92760f31ac819212ad4ca78df676fdce Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 Aug 2020 20:22:07 -0400 Subject: [PATCH 22/38] FIX Adds PolynomialCountSketch to ignore list --- sklearn/tests/test_docstring_types.py | 73 +++++++++++++-------------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/sklearn/tests/test_docstring_types.py b/sklearn/tests/test_docstring_types.py index 59821ff5b9b56..9885ea55fde40 100644 --- a/sklearn/tests/test_docstring_types.py +++ b/sklearn/tests/test_docstring_types.py @@ -6,18 +6,17 @@ TYPING_IGNORED = { 'ARDRegression', 'AdaBoostClassifier', 'AdaBoostRegressor', - 'AdditiveChi2Sampler', 'AffinityPropagation', - 'AgglomerativeClustering', 'BaggingClassifier', 'BaggingRegressor', - 'BayesianGaussianMixture', 'BayesianRidge', 'BernoulliNB', - 'BernoulliRBM', 'Binarizer', 'Birch', 'CCA', 'CalibratedClassifierCV', - 'CategoricalNB', 'ClassifierChain', 'ColumnTransformer', - 'ComplementNB', 'CountVectorizer', 'DBSCAN', 'DecisionTreeClassifier', - 'DecisionTreeRegressor', 'DictVectorizer', 'DictionaryLearning', - 'DummyClassifier', 'DummyRegressor', 'ElasticNet', 'ElasticNetCV', - 'EllipticEnvelope', 'EmpiricalCovariance', 'ExtraTreeClassifier', - 'ExtraTreeRegressor', 'ExtraTreesClassifier', 'ExtraTreesRegressor', - 'FactorAnalysis', 'FastICA', 'FeatureAgglomeration', 'FeatureHasher', - 'FeatureUnion', 'FunctionTransformer', 'GammaRegressor', + 'AdditiveChi2Sampler', 'AffinityPropagation', 'AgglomerativeClustering', + 'BaggingClassifier', 'BaggingRegressor', 'BayesianGaussianMixture', + 'BayesianRidge', 'BernoulliNB', 'BernoulliRBM', 'Binarizer', 'Birch', + 'CCA', 'CalibratedClassifierCV', 'CategoricalNB', 'ClassifierChain', + 'ColumnTransformer', 'ComplementNB', 'CountVectorizer', 'DBSCAN', + 'DecisionTreeClassifier', 'DecisionTreeRegressor', 'DictVectorizer', + 'DictionaryLearning', 'DummyClassifier', 'DummyRegressor', 'ElasticNet', + 'ElasticNetCV', 'EllipticEnvelope', 'EmpiricalCovariance', + 'ExtraTreeClassifier', 'ExtraTreeRegressor', 'ExtraTreesClassifier', + 'ExtraTreesRegressor', 'FactorAnalysis', 'FastICA', 'FeatureAgglomeration', + 'FeatureHasher', 'FeatureUnion', 'FunctionTransformer', 'GammaRegressor', 'GaussianMixture', 'GaussianNB', 'GaussianProcessClassifier', 'GaussianProcessRegressor', 'GaussianRandomProjection', 'GenericUnivariateSelect', 'GradientBoostingClassifier', @@ -30,26 +29,24 @@ 'KernelDensity', 'KernelPCA', 'KernelRidge', 'LabelBinarizer', 'LabelEncoder', 'LabelPropagation', 'LabelSpreading', 'Lars', 'LarsCV', 'Lasso', 'LassoCV', 'LassoLars', 'LassoLarsCV', 'LassoLarsIC', - 'LatentDirichletAllocation', 'LedoitWolf', - 'LinearDiscriminantAnalysis', 'LinearRegression', 'LinearSVC', - 'LinearSVR', 'LocalOutlierFactor', 'LocallyLinearEmbedding', - 'LogisticRegressionCV', 'MDS', 'MLPClassifier', - 'MLPRegressor', 'MaxAbsScaler', 'MeanShift', 'MinCovDet', - 'MinMaxScaler', 'MiniBatchDictionaryLearning', 'MiniBatchKMeans', - 'MiniBatchSparsePCA', 'MissingIndicator', 'MultiLabelBinarizer', - 'MultiOutputClassifier', 'MultiOutputRegressor', 'MultiTaskElasticNet', - 'MultiTaskElasticNetCV', 'MultiTaskLasso', 'MultiTaskLassoCV', - 'MultinomialNB', 'NMF', 'NearestCentroid', 'NearestNeighbors', - 'NeighborhoodComponentsAnalysis', 'Normalizer', 'NuSVC', 'NuSVR', - 'Nystroem', 'OAS', 'OPTICS', 'OneClassSVM', 'OneHotEncoder', - 'OneVsOneClassifier', 'OneVsRestClassifier', 'OrdinalEncoder', - 'OrthogonalMatchingPursuit', 'OrthogonalMatchingPursuitCV', - 'OutputCodeClassifier', 'PCA', 'PLSCanonical', 'PLSRegression', - 'PLSSVD', 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', + 'LatentDirichletAllocation', 'LedoitWolf', 'LinearDiscriminantAnalysis', + 'LinearRegression', 'LinearSVC', 'LinearSVR', 'LocalOutlierFactor', + 'LocallyLinearEmbedding', 'LogisticRegressionCV', 'MDS', 'MLPClassifier', + 'MLPRegressor', 'MaxAbsScaler', 'MeanShift', 'MinCovDet', 'MinMaxScaler', + 'MiniBatchDictionaryLearning', 'MiniBatchKMeans', 'MiniBatchSparsePCA', + 'MissingIndicator', 'MultiLabelBinarizer', 'MultiOutputClassifier', + 'MultiOutputRegressor', 'MultiTaskElasticNet', 'MultiTaskElasticNetCV', + 'MultiTaskLasso', 'MultiTaskLassoCV', 'MultinomialNB', 'NMF', + 'NearestCentroid', 'NearestNeighbors', 'NeighborhoodComponentsAnalysis', + 'Normalizer', 'NuSVC', 'NuSVR', 'Nystroem', 'OAS', 'OPTICS', 'OneClassSVM', + 'OneHotEncoder', 'OneVsOneClassifier', 'OneVsRestClassifier', + 'OrdinalEncoder', 'OrthogonalMatchingPursuit', + 'OrthogonalMatchingPursuitCV', 'OutputCodeClassifier', + 'PolynomialCountSketch', 'PCA', 'PLSCanonical', 'PLSRegression', 'PLSSVD', + 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', 'PatchExtractor', 'Perceptron', 'Pipeline', 'PoissonRegressor', - 'PolynomialFeatures', 'PowerTransformer', - 'QuadraticDiscriminantAnalysis', 'QuantileTransformer', - 'RANSACRegressor', 'RBFSampler', 'RFE', 'RFECV', + 'PolynomialFeatures', 'PowerTransformer', 'QuadraticDiscriminantAnalysis', + 'QuantileTransformer', 'RANSACRegressor', 'RBFSampler', 'RFE', 'RFECV', 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', 'RadiusNeighborsTransformer', 'RandomForestClassifier', 'RandomForestRegressor', 'RandomTreesEmbedding', 'RandomizedSearchCV', @@ -57,13 +54,13 @@ 'RidgeClassifierCV', 'RobustScaler', 'SGDClassifier', 'SGDRegressor', 'SVC', 'SVR', 'SelectFdr', 'SelectFpr', 'SelectFromModel', 'SelectFwe', 'SelectKBest', 'SelectPercentile', 'ShrunkCovariance', 'SimpleImputer', - 'SkewedChi2Sampler', 'SparseCoder', 'SparsePCA', - 'SparseRandomProjection', 'SpectralBiclustering', 'SpectralClustering', - 'SpectralCoclustering', 'SpectralEmbedding', 'StackingClassifier', - 'StackingRegressor', 'StandardScaler', 'TSNE', 'TfidfTransformer', - 'TfidfVectorizer', 'TheilSenRegressor', 'TransformedTargetRegressor', - 'TruncatedSVD', 'TweedieRegressor', 'VarianceThreshold', - 'VotingClassifier', 'VotingRegressor' + 'SkewedChi2Sampler', 'SparseCoder', 'SparsePCA', 'SparseRandomProjection', + 'SpectralBiclustering', 'SpectralClustering', 'SpectralCoclustering', + 'SpectralEmbedding', 'StackingClassifier', 'StackingRegressor', + 'StandardScaler', 'TSNE', 'TfidfTransformer', 'TfidfVectorizer', + 'TheilSenRegressor', 'TransformedTargetRegressor', 'TruncatedSVD', + 'TweedieRegressor', 'VarianceThreshold', 'VotingClassifier', + 'VotingRegressor' } From 8e0ee080965fb4ddebaec52305ee7c7c3ab40fce Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Tue, 18 Aug 2020 20:42:54 -0400 Subject: [PATCH 23/38] CLN Make it easier to merge with future PRs --- sklearn/tests/test_docstring_types.py | 251 ++++++++++++++++++++------ 1 file changed, 195 insertions(+), 56 deletions(-) diff --git a/sklearn/tests/test_docstring_types.py b/sklearn/tests/test_docstring_types.py index 9885ea55fde40..ee925aba4e322 100644 --- a/sklearn/tests/test_docstring_types.py +++ b/sklearn/tests/test_docstring_types.py @@ -5,62 +5,201 @@ TYPING_IGNORED = { - 'ARDRegression', 'AdaBoostClassifier', 'AdaBoostRegressor', - 'AdditiveChi2Sampler', 'AffinityPropagation', 'AgglomerativeClustering', - 'BaggingClassifier', 'BaggingRegressor', 'BayesianGaussianMixture', - 'BayesianRidge', 'BernoulliNB', 'BernoulliRBM', 'Binarizer', 'Birch', - 'CCA', 'CalibratedClassifierCV', 'CategoricalNB', 'ClassifierChain', - 'ColumnTransformer', 'ComplementNB', 'CountVectorizer', 'DBSCAN', - 'DecisionTreeClassifier', 'DecisionTreeRegressor', 'DictVectorizer', - 'DictionaryLearning', 'DummyClassifier', 'DummyRegressor', 'ElasticNet', - 'ElasticNetCV', 'EllipticEnvelope', 'EmpiricalCovariance', - 'ExtraTreeClassifier', 'ExtraTreeRegressor', 'ExtraTreesClassifier', - 'ExtraTreesRegressor', 'FactorAnalysis', 'FastICA', 'FeatureAgglomeration', - 'FeatureHasher', 'FeatureUnion', 'FunctionTransformer', 'GammaRegressor', - 'GaussianMixture', 'GaussianNB', 'GaussianProcessClassifier', - 'GaussianProcessRegressor', 'GaussianRandomProjection', - 'GenericUnivariateSelect', 'GradientBoostingClassifier', - 'GradientBoostingRegressor', 'GraphicalLasso', 'GraphicalLassoCV', - 'GridSearchCV', 'HashingVectorizer', 'HistGradientBoostingClassifier', - 'HistGradientBoostingRegressor', 'HuberRegressor', 'IncrementalPCA', - 'IsolationForest', 'Isomap', 'IsotonicRegression', 'IterativeImputer', - 'KBinsDiscretizer', 'KMeans', 'KNNImputer', 'KNeighborsClassifier', - 'KNeighborsRegressor', 'KNeighborsTransformer', 'KernelCenterer', - 'KernelDensity', 'KernelPCA', 'KernelRidge', 'LabelBinarizer', - 'LabelEncoder', 'LabelPropagation', 'LabelSpreading', 'Lars', 'LarsCV', - 'Lasso', 'LassoCV', 'LassoLars', 'LassoLarsCV', 'LassoLarsIC', - 'LatentDirichletAllocation', 'LedoitWolf', 'LinearDiscriminantAnalysis', - 'LinearRegression', 'LinearSVC', 'LinearSVR', 'LocalOutlierFactor', - 'LocallyLinearEmbedding', 'LogisticRegressionCV', 'MDS', 'MLPClassifier', - 'MLPRegressor', 'MaxAbsScaler', 'MeanShift', 'MinCovDet', 'MinMaxScaler', - 'MiniBatchDictionaryLearning', 'MiniBatchKMeans', 'MiniBatchSparsePCA', - 'MissingIndicator', 'MultiLabelBinarizer', 'MultiOutputClassifier', - 'MultiOutputRegressor', 'MultiTaskElasticNet', 'MultiTaskElasticNetCV', - 'MultiTaskLasso', 'MultiTaskLassoCV', 'MultinomialNB', 'NMF', - 'NearestCentroid', 'NearestNeighbors', 'NeighborhoodComponentsAnalysis', - 'Normalizer', 'NuSVC', 'NuSVR', 'Nystroem', 'OAS', 'OPTICS', 'OneClassSVM', - 'OneHotEncoder', 'OneVsOneClassifier', 'OneVsRestClassifier', - 'OrdinalEncoder', 'OrthogonalMatchingPursuit', - 'OrthogonalMatchingPursuitCV', 'OutputCodeClassifier', - 'PolynomialCountSketch', 'PCA', 'PLSCanonical', 'PLSRegression', 'PLSSVD', - 'PassiveAggressiveClassifier', 'PassiveAggressiveRegressor', - 'PatchExtractor', 'Perceptron', 'Pipeline', 'PoissonRegressor', - 'PolynomialFeatures', 'PowerTransformer', 'QuadraticDiscriminantAnalysis', - 'QuantileTransformer', 'RANSACRegressor', 'RBFSampler', 'RFE', 'RFECV', - 'RadiusNeighborsClassifier', 'RadiusNeighborsRegressor', - 'RadiusNeighborsTransformer', 'RandomForestClassifier', - 'RandomForestRegressor', 'RandomTreesEmbedding', 'RandomizedSearchCV', - 'RegressorChain', 'Ridge', 'RidgeCV', 'RidgeClassifier', - 'RidgeClassifierCV', 'RobustScaler', 'SGDClassifier', 'SGDRegressor', - 'SVC', 'SVR', 'SelectFdr', 'SelectFpr', 'SelectFromModel', 'SelectFwe', - 'SelectKBest', 'SelectPercentile', 'ShrunkCovariance', 'SimpleImputer', - 'SkewedChi2Sampler', 'SparseCoder', 'SparsePCA', 'SparseRandomProjection', - 'SpectralBiclustering', 'SpectralClustering', 'SpectralCoclustering', - 'SpectralEmbedding', 'StackingClassifier', 'StackingRegressor', - 'StandardScaler', 'TSNE', 'TfidfTransformer', 'TfidfVectorizer', - 'TheilSenRegressor', 'TransformedTargetRegressor', 'TruncatedSVD', - 'TweedieRegressor', 'VarianceThreshold', 'VotingClassifier', - 'VotingRegressor' + "ARDRegression", + "AdaBoostClassifier", + "AdaBoostRegressor", + "AdditiveChi2Sampler", + "AffinityPropagation", + "AgglomerativeClustering", + "BaggingClassifier", + "BaggingRegressor", + "BayesianGaussianMixture", + "BayesianRidge", + "BernoulliNB", + "BernoulliRBM", + "Binarizer", + "Birch", + "CCA", + "CalibratedClassifierCV", + "CategoricalNB", + "ClassifierChain", + "ColumnTransformer", + "ComplementNB", + "CountVectorizer", + "DBSCAN", + "DecisionTreeClassifier", + "DecisionTreeRegressor", + "DictVectorizer", + "DictionaryLearning", + "DummyClassifier", + "DummyRegressor", + "ElasticNet", + "ElasticNetCV", + "EllipticEnvelope", + "EmpiricalCovariance", + "ExtraTreeClassifier", + "ExtraTreeRegressor", + "ExtraTreesClassifier", + "ExtraTreesRegressor", + "FactorAnalysis", + "FastICA", + "FeatureAgglomeration", + "FeatureHasher", + "FeatureUnion", + "FunctionTransformer", + "GammaRegressor", + "GaussianMixture", + "GaussianNB", + "GaussianProcessClassifier", + "GaussianProcessRegressor", + "GaussianRandomProjection", + "GenericUnivariateSelect", + "GradientBoostingClassifier", + "GradientBoostingRegressor", + "GraphicalLasso", + "GraphicalLassoCV", + "GridSearchCV", + "HashingVectorizer", + "HistGradientBoostingClassifier", + "HistGradientBoostingRegressor", + "HuberRegressor", + "IncrementalPCA", + "IsolationForest", + "Isomap", + "IsotonicRegression", + "IterativeImputer", + "KBinsDiscretizer", + "KMeans", + "KNNImputer", + "KNeighborsClassifier", + "KNeighborsRegressor", + "KNeighborsTransformer", + "KernelCenterer", + "KernelDensity", + "KernelPCA", + "KernelRidge", + "LabelBinarizer", + "LabelEncoder", + "LabelPropagation", + "LabelSpreading", + "Lars", + "LarsCV", + "Lasso", + "LassoCV", + "LassoLars", + "LassoLarsCV", + "LassoLarsIC", + "LatentDirichletAllocation", + "LedoitWolf", + "LinearDiscriminantAnalysis", + "LinearRegression", + "LinearSVC", + "LinearSVR", + "LocalOutlierFactor", + "LocallyLinearEmbedding", + "LogisticRegressionCV", + "MDS", + "MLPClassifier", + "MLPRegressor", + "MaxAbsScaler", + "MeanShift", + "MinCovDet", + "MinMaxScaler", + "MiniBatchDictionaryLearning", + "MiniBatchKMeans", + "MiniBatchSparsePCA", + "MissingIndicator", + "MultiLabelBinarizer", + "MultiOutputClassifier", + "MultiOutputRegressor", + "MultiTaskElasticNet", + "MultiTaskElasticNetCV", + "MultiTaskLasso", + "MultiTaskLassoCV", + "MultinomialNB", + "NMF", + "NearestCentroid", + "NearestNeighbors", + "NeighborhoodComponentsAnalysis", + "Normalizer", + "NuSVC", + "NuSVR", + "Nystroem", + "OAS", + "OPTICS", + "OneClassSVM", + "OneHotEncoder", + "OneVsOneClassifier", + "OneVsRestClassifier", + "OrdinalEncoder", + "OrthogonalMatchingPursuit", + "OrthogonalMatchingPursuitCV", + "OutputCodeClassifier", + "PolynomialCountSketch", + "PCA", + "PLSCanonical", + "PLSRegression", + "PLSSVD", + "PassiveAggressiveClassifier", + "PassiveAggressiveRegressor", + "PatchExtractor", + "Perceptron", + "Pipeline", + "PoissonRegressor", + "PolynomialFeatures", + "PowerTransformer", + "QuadraticDiscriminantAnalysis", + "QuantileTransformer", + "RANSACRegressor", + "RBFSampler", + "RFE", + "RFECV", + "RadiusNeighborsClassifier", + "RadiusNeighborsRegressor", + "RadiusNeighborsTransformer", + "RandomForestClassifier", + "RandomForestRegressor", + "RandomTreesEmbedding", + "RandomizedSearchCV", + "RegressorChain", + "Ridge", + "RidgeCV", + "RidgeClassifier", + "RidgeClassifierCV", + "RobustScaler", + "SGDClassifier", + "SGDRegressor", + "SVC", + "SVR", + "SelectFdr", + "SelectFpr", + "SelectFromModel", + "SelectFwe", + "SelectKBest", + "SelectPercentile", + "ShrunkCovariance", + "SimpleImputer", + "SkewedChi2Sampler", + "SparseCoder", + "SparsePCA", + "SparseRandomProjection", + "SpectralBiclustering", + "SpectralClustering", + "SpectralCoclustering", + "SpectralEmbedding", + "StackingClassifier", + "StackingRegressor", + "StandardScaler", + "TSNE", + "TfidfTransformer", + "TfidfVectorizer", + "TheilSenRegressor", + "TransformedTargetRegressor", + "TruncatedSVD", + "TweedieRegressor", + "VarianceThreshold", + "VotingClassifier", + "VotingRegressor", } From 667518e528f1f6a6d598447c1013b0bf54658725 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 27 Aug 2020 14:57:48 -0400 Subject: [PATCH 24/38] TST Uses double quotes for strings --- sklearn/utils/tests/test_typing.py | 42 +++++++++++++++--------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py index f38ea0b1f708a..2f13bd5a97632 100644 --- a/sklearn/utils/tests/test_typing.py +++ b/sklearn/utils/tests/test_typing.py @@ -36,21 +36,21 @@ def test_get_annotation_class_name(annotation, expected_class): @pytest.mark.parametrize("annotation, expected_str", [ - (None, 'None'), - (BaseEstimator, 'estimator instance'), - (np.random.RandomState, 'RandomState instance'), - (int, 'int'), + (None, "None"), + (BaseEstimator, "estimator instance"), + (np.random.RandomState, "RandomState instance"), + (int, "int"), (float, 'float'), - (list, 'list'), - (str, 'str'), - (List[int], 'list of int'), - (Optional[List[int]], 'list of int or None'), - (List[BaseEstimator], 'list of estimator instance'), - (Optional[BaseEstimator], 'estimator instance or None'), - (Union[int, float], 'int or float'), - (Literal['cat', 'dog'], '{\'cat\', \'dog\'}'), - (Literal['cat'], '\'cat\''), - (RandomState, 'int, RandomState instance or None') + (list, "list"), + (str, "str"), + (List[int], "list of int"), + (Optional[List[int]], "list of int or None"), + (List[BaseEstimator], "list of estimator instance"), + (Optional[BaseEstimator], "estimator instance or None"), + (Union[int, float], "int or float"), + (Literal['cat', 'dog'], "{'cat', 'dog'}"), + (Literal['cat'], "'cat'"), + (RandomState, "int, RandomState instance or None") ]) def test_format_docstring_annotation(annotation, expected_str): assert _format_docstring_annotation(annotation) == expected_str @@ -68,10 +68,10 @@ def __init__(self, def test_get_docstring_annotations(): annotations = get_docstring_annotations(TestObject.__init__) - assert annotations['estimator'] == 'estimator instance' - assert annotations['num'] == 'int, default=10' - assert annotations['float_num'] == 'float, default=' - assert annotations['union_num'] == 'int or float, default=' - assert annotations['pet'] == '{\'cat\', \'dog\'}, default=\'dog\'' - assert annotations['random_state'] == ('int, RandomState instance or None' - ', default=None') + assert annotations['estimator'] == "estimator instance" + assert annotations['num'] == "int, default=10" + assert annotations['float_num'] == "float, default=" + assert annotations['union_num'] == "int or float, default=" + assert annotations['pet'] == "{'cat', 'dog'}, default='dog'" + assert annotations['random_state'] == ("int, RandomState instance or None" + ", default=None") From 6ead295818a5bee67400c9565ec94054069a8d79 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 27 Aug 2020 22:37:00 -0400 Subject: [PATCH 25/38] MNT Update license formating for github to recognize --- COPYING | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/COPYING b/COPYING index b98af18710185..415d7dbe8a2d3 100644 --- a/COPYING +++ b/COPYING @@ -1,32 +1,29 @@ -New BSD License +BSD 3-Clause License -Copyright (c) 2007–2020 The scikit-learn developers. +Copyright (c) 2007-2020, The scikit-learn developers. All rights reserved. - Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - a. Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - b. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - c. Neither the name of the Scikit-learn Developers nor the names of - its contributors may be used to endorse or promote products - derived from this software without specific prior written - permission. +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH -DAMAGE. - +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. From c2b9a37ec0acb3774a1784389cabcc95f6da7f79 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 27 Aug 2020 22:38:08 -0400 Subject: [PATCH 26/38] MNT Rename license file --- COPYING => LICENSE.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename COPYING => LICENSE.txt (100%) diff --git a/COPYING b/LICENSE.txt similarity index 100% rename from COPYING rename to LICENSE.txt From b6a5200bcc08fa9d39d4a3791cdbb5aa4412baa5 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 27 Aug 2020 22:41:40 -0400 Subject: [PATCH 27/38] Revert "MNT Rename license file" This reverts commit c2b9a37ec0acb3774a1784389cabcc95f6da7f79. --- LICENSE.txt => COPYING | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename LICENSE.txt => COPYING (100%) diff --git a/LICENSE.txt b/COPYING similarity index 100% rename from LICENSE.txt rename to COPYING From f2e73b47dc10c270e74bf15460c3a89546f8f95f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Thu, 27 Aug 2020 22:42:20 -0400 Subject: [PATCH 28/38] MNT Less diffs --- COPYING | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/COPYING b/COPYING index 415d7dbe8a2d3..558c4c1245615 100644 --- a/COPYING +++ b/COPYING @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2007-2020, The scikit-learn developers. +Copyright (c) 2007-2020 The scikit-learn developers. All rights reserved. Redistribution and use in source and binary forms, with or without From c491c98c6c2a258f8719e641854ff92a2d2d3b7c Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 28 Aug 2020 15:02:44 -0400 Subject: [PATCH 29/38] TST Ignores SequentialFeatureSelector --- sklearn/tests/test_docstring_types.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sklearn/tests/test_docstring_types.py b/sklearn/tests/test_docstring_types.py index ee925aba4e322..d97f2f3e7e019 100644 --- a/sklearn/tests/test_docstring_types.py +++ b/sklearn/tests/test_docstring_types.py @@ -177,6 +177,7 @@ "SelectFwe", "SelectKBest", "SelectPercentile", + "SequentialFeatureSelector", "ShrunkCovariance", "SimpleImputer", "SkewedChi2Sampler", From 8d2aa0e2daa17aa7a2067004425be37e3f5da921 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 29 Aug 2020 13:22:11 -0400 Subject: [PATCH 30/38] CLN Move Literal into fixes --- sklearn/linear_model/_logistic.py | 2 +- sklearn/utils/fixes.py | 6 ++++++ sklearn/utils/tests/test_typing.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 019b53be0b91e..c5645d9231b91 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -14,7 +14,6 @@ import warnings from typing import Union from typing import Optional -from typing_extensions import Literal import numpy as np from scipy import optimize, sparse @@ -36,6 +35,7 @@ from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args +from ..utils.fixes import Literal from ..model_selection import check_cv from ..metrics import get_scorer diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index d715584665ab6..37bb989f7799a 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -27,6 +27,12 @@ # setuptools not installed parse_version = LooseVersion # type: ignore +try: + from typing import Literal # noqa +except ImportError: + # Python < 3.8 + from typing_extension import Literal # type: ignore # noqa + np_version = parse_version(np.__version__) sp_version = parse_version(scipy.__version__) diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py index 2f13bd5a97632..62ea1dfdbb1f3 100644 --- a/sklearn/utils/tests/test_typing.py +++ b/sklearn/utils/tests/test_typing.py @@ -4,12 +4,12 @@ from typing import Union from typing import Callable from typing import Optional -from typing_extensions import Literal import pytest import numpy as np from sklearn.base import BaseEstimator +from sklearn.utils.fixes import Literal from sklearn.utils._typing import RandomState from sklearn.utils._typing import _get_annotation_class_name from sklearn.utils._typing import _format_docstring_annotation From 2378769a1fdf834c0425505876c8f93857b886d0 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sat, 29 Aug 2020 14:50:43 -0400 Subject: [PATCH 31/38] FIX Fixes spelling of typing_extensions --- sklearn/utils/fixes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 37bb989f7799a..501e216047404 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -31,7 +31,7 @@ from typing import Literal # noqa except ImportError: # Python < 3.8 - from typing_extension import Literal # type: ignore # noqa + from typing_extensions import Literal # type: ignore # noqa np_version = parse_version(np.__version__) From 67c4574a251a40e497310d3db7148ab9d74aeebb Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 31 Aug 2020 18:53:20 -0400 Subject: [PATCH 32/38] MNT Soft dependency on typing_extensions --- build_tools/azure/install.sh | 3 +- build_tools/get_formatted_docstring_types.py | 2 - setup.cfg | 4 + sklearn/_build_utils/min_dependencies.py | 2 +- sklearn/linear_model/_logistic.py | 12 +- sklearn/utils/_typing.py | 102 +--------------- sklearn/utils/fixes.py | 6 - sklearn/utils/tests/test_typing.py | 115 +++++++++++++++++-- 8 files changed, 124 insertions(+), 122 deletions(-) diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 9fdb1a3cd3a31..873f1acf1a53e 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -82,7 +82,8 @@ fi python -m pip install $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ $(get_dep pytest $PYTEST_VERSION) \ - $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) + $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) \ + typing-extensions if [[ "$COVERAGE" == "true" ]]; then python -m pip install codecov pytest-cov diff --git a/build_tools/get_formatted_docstring_types.py b/build_tools/get_formatted_docstring_types.py index 03320a926fb33..a2ab096398e3e 100644 --- a/build_tools/get_formatted_docstring_types.py +++ b/build_tools/get_formatted_docstring_types.py @@ -3,10 +3,8 @@ import argparse import inspect - from sklearn.utils._typing import get_docstring_annotations - parser = argparse.ArgumentParser( description=("Generates typed docstring for a specific scikit-learn " "class or function")) diff --git a/setup.cfg b/setup.cfg index 1a09d8872e9b1..ccdc5783b5a5c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,3 +28,7 @@ artifact_indexes= [flake8] # Default flake8 3.5 ignored flags ignore=E121,E123,E126,E226,E24,E704,W503,W504 + +[mypy-sklearn.externals.*] +check_untyped_defs=False +ignore_errors=True diff --git a/sklearn/_build_utils/min_dependencies.py b/sklearn/_build_utils/min_dependencies.py index 03436ea5a0ba9..6f419ba51941b 100644 --- a/sklearn/_build_utils/min_dependencies.py +++ b/sklearn/_build_utils/min_dependencies.py @@ -40,7 +40,7 @@ 'sphinx-gallery': ('0.7.0', 'docs'), 'numpydoc': ('1.0.0', 'docs'), 'Pillow': ('7.1.2', 'docs'), - 'typing-extensions': ('3.7.4', 'build, install') + 'typing-extensions': ('3.7.4', 'tests') } diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index c5645d9231b91..61ac5f5c59db5 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -35,7 +35,7 @@ from ..utils.validation import _deprecate_positional_args from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args -from ..utils.fixes import Literal +from ..utils._typing import Literal from ..model_selection import check_cv from ..metrics import get_scorer @@ -1261,19 +1261,19 @@ class LogisticRegression(LinearClassifierMixin, @_deprecate_positional_args def __init__(self, - penalty: Literal['l1', 'l2', 'elasticnet', 'none'] = 'l2', + penalty: "Literal['l1', 'l2', 'elasticnet', 'none']" = 'l2', *, dual: bool = False, tol: float = 1e-4, C: float = 1.0, fit_intercept: bool = True, intercept_scaling: float = 1, - class_weight: Union[dict, Literal['balanced']] = None, + class_weight: Union[dict, "Literal['balanced']"] = None, random_state: RandomState = None, - solver: Literal['newton-cg', 'lbfgs', 'liblinear', 'sag', - 'saga'] = 'lbfgs', + solver: "Literal['newton-cg', 'lbfgs', 'liblinear', 'sag', " + "'saga']" = 'lbfgs', max_iter: int = 100, - multi_class: Literal['auto', 'ovr', 'multinomial'] = 'auto', + multi_class: "Literal['auto', 'ovr', 'multinomial']" = 'auto', verbose: int = 0, warm_start: bool = False, n_jobs: Optional[int] = None, diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 20e56b135f494..59f9324891e41 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -1,103 +1,11 @@ -import inspect -import numbers - +from typing import TYPE_CHECKING from typing import Union -from typing import Any import numpy as np - RandomState = Union[int, np.random.RandomState, None] - -def _get_annotation_class_name(annotation): - """Get class name for annnotation""" - if annotation is None: - return 'None' - elif annotation is Any: - return 'Any' - - if getattr(annotation, '__qualname__', None): - return annotation.__qualname__ - elif getattr(annotation, '_name', None): - # generic for >= 3.7 - return annotation._name - - origin = getattr(annotation, '__origin__', None) - if origin: - return _get_annotation_class_name(annotation.__origin__) - - # generic for < 3.7 (Literal) - return annotation.__class__.__qualname__.lstrip('_') - - -def _format_docstring_annotation(annotation): - """Convert annotation to docstring.""" - class_name = _get_annotation_class_name(annotation) - - if class_name == 'BaseEstimator': - return 'estimator instance' - elif class_name == 'NoneType': - return 'None' - elif class_name == 'RandomState': - return 'RandomState instance' - elif class_name == 'Union': - values = [_format_docstring_annotation(t) for t in annotation.__args__] - if len(values) == 2: - return ' or '.join(values) - first = ', '.join(values[:-1]) - return f'{first} or {values[-1]}' - elif class_name == 'Literal': - if hasattr(annotation, '__values__'): - # For Python == 3.6 support - args = annotation.__values__ - else: - args = annotation.__args__ - items = [repr(t) for t in args] - if len(items) == 1: - return items[0] - values = ', '.join(items) - return f'{{{values}}}' - elif class_name == 'List': - values = ', '.join(_format_docstring_annotation(t) - for t in annotation.__args__) - return f'list of {values}' - - return class_name - - -def get_docstring_annotations(obj): - """Get human readable docstring for types for a obj with annotations. - - Parameters - ---------- - obj: object - - Returns - ------- - output: dict - dictionary mapping from name to human-readable docstring. - """ - if not hasattr(obj, '__annotations__'): - return {} - - annotations = obj.__annotations__ - # get defaults - params = inspect.signature(obj).parameters - defaults = {p: v.default for p, v in params.items() - if v.default != inspect.Parameter.empty} - - output = {} - for name, annotation in annotations.items(): - anno = _format_docstring_annotation(annotation) - if name in defaults: - default = defaults[name] - if (isinstance(default, numbers.Real) and - not isinstance(default, numbers.Integral)): - # For floats the representation can vary, i.e: - # default=np.inf or default=1e-4 - anno += ", default=" - else: - anno += f", default={repr(default)}" - output[name] = anno - return output +if TYPE_CHECKING: + from typing_extensions import Literal # type: ignore # noqa +else: + Literal = None diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py index 501e216047404..d715584665ab6 100644 --- a/sklearn/utils/fixes.py +++ b/sklearn/utils/fixes.py @@ -27,12 +27,6 @@ # setuptools not installed parse_version = LooseVersion # type: ignore -try: - from typing import Literal # noqa -except ImportError: - # Python < 3.8 - from typing_extensions import Literal # type: ignore # noqa - np_version = parse_version(np.__version__) sp_version = parse_version(scipy.__version__) diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py index 62ea1dfdbb1f3..0ccde7ea6e742 100644 --- a/sklearn/utils/tests/test_typing.py +++ b/sklearn/utils/tests/test_typing.py @@ -1,3 +1,6 @@ +import inspect +import numbers +import typing from typing import Dict from typing import Any from typing import List @@ -9,11 +12,103 @@ import numpy as np from sklearn.base import BaseEstimator -from sklearn.utils.fixes import Literal from sklearn.utils._typing import RandomState -from sklearn.utils._typing import _get_annotation_class_name -from sklearn.utils._typing import _format_docstring_annotation -from sklearn.utils._typing import get_docstring_annotations +from sklearn.utils._typing import Literal + + +def _get_annotation_class_name(annotation): + """Get class name for annnotation""" + if annotation is None: + return 'None' + elif annotation is Any: + return 'Any' + + if getattr(annotation, '__qualname__', None): + return annotation.__qualname__ + elif getattr(annotation, '_name', None): + # generic for >= 3.7 + return annotation._name + + origin = getattr(annotation, '__origin__', None) + if origin: + return _get_annotation_class_name(annotation.__origin__) + + # generic for < 3.7 (Literal) + return annotation.__class__.__qualname__.lstrip('_') + + +def _format_docstring_annotation(annotation): + """Convert annotation to docstring.""" + class_name = _get_annotation_class_name(annotation) + + if class_name == 'BaseEstimator': + return 'estimator instance' + elif class_name == 'NoneType': + return 'None' + elif class_name == 'RandomState': + return 'RandomState instance' + elif class_name == 'Union': + values = [_format_docstring_annotation(t) for t in annotation.__args__] + if len(values) == 2: + return ' or '.join(values) + first = ', '.join(values[:-1]) + return f'{first} or {values[-1]}' + elif class_name == 'Literal': + if hasattr(annotation, '__values__'): + # For Python == 3.6 support + args = annotation.__values__ + else: + args = annotation.__args__ + items = [repr(t) for t in args] + if len(items) == 1: + return items[0] + values = ', '.join(items) + return f'{{{values}}}' + elif class_name == 'List': + values = ', '.join(_format_docstring_annotation(t) + for t in annotation.__args__) + return f'list of {values}' + + return class_name + + +def get_docstring_annotations(obj): + """Get human readable docstring for types for a obj with annotations. + + Parameters + ---------- + obj: object + Object to get annotations from + + Returns + ------- + output: dict + dictionary mapping from name to human-readable docstring. + """ + if not hasattr(obj, '__annotations__'): + return {} + + from typing_extensions import Literal + annotations = typing.get_type_hints(obj, {'Literal': Literal}) + # get defaults + params = inspect.signature(obj).parameters + defaults = {p: v.default for p, v in params.items() + if v.default != inspect.Parameter.empty} + + output = {} + for name, annotation in annotations.items(): + anno = _format_docstring_annotation(annotation) + if name in defaults: + default = defaults[name] + if (isinstance(default, numbers.Real) and + not isinstance(default, numbers.Integral)): + # For floats the representation can vary, i.e: + # default=np.inf or default=1e-4 + anno += ", default=" + else: + anno += f", default={repr(default)}" + output[name] = anno + return output @pytest.mark.parametrize("annotation, expected_class", [ @@ -27,7 +122,6 @@ (List[int], 'List'), (Union[int, float], 'Union'), (Dict, 'Dict'), - (Literal['a', 'b'], 'Literal'), (Callable, 'Callable'), (Callable[[str], str], 'Callable'), ]) @@ -48,8 +142,6 @@ def test_get_annotation_class_name(annotation, expected_class): (List[BaseEstimator], "list of estimator instance"), (Optional[BaseEstimator], "estimator instance or None"), (Union[int, float], "int or float"), - (Literal['cat', 'dog'], "{'cat', 'dog'}"), - (Literal['cat'], "'cat'"), (RandomState, "int, RandomState instance or None") ]) def test_format_docstring_annotation(annotation, expected_str): @@ -61,17 +153,22 @@ def __init__(self, estimator: BaseEstimator, num: int = 10, union_num: Union[int, float] = 1.4, float_num: float = 1e-4, - pet: Literal['cat', 'dog'] = 'dog', + pet: "Literal['dog']" = 'dog', + weather: "Literal['sunny', 'cloudy']" = 'sunny', random_state: RandomState = None): pass def test_get_docstring_annotations(): + # get_docstring_annotations needs typing_extensions for Literal + pytest.importorskip("typing_extensions") annotations = get_docstring_annotations(TestObject.__init__) + assert annotations['estimator'] == "estimator instance" assert annotations['num'] == "int, default=10" assert annotations['float_num'] == "float, default=" assert annotations['union_num'] == "int or float, default=" - assert annotations['pet'] == "{'cat', 'dog'}, default='dog'" + assert annotations['pet'] == "'dog', default='dog'" + assert annotations['weather'] == "{'sunny', 'cloudy'}, default='sunny'" assert annotations['random_state'] == ("int, RandomState instance or None" ", default=None") From f05e47c0414354e3bf01638c75755f32a6dad551 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Mon, 31 Aug 2020 18:57:28 -0400 Subject: [PATCH 33/38] MNT Soft dependency on typing_extensions --- build_tools/get_formatted_docstring_types.py | 2 +- sklearn/utils/_typing.py | 102 +++++++++++++++++++ sklearn/utils/tests/test_typing.py | 101 +----------------- 3 files changed, 106 insertions(+), 99 deletions(-) diff --git a/build_tools/get_formatted_docstring_types.py b/build_tools/get_formatted_docstring_types.py index a2ab096398e3e..204bc2871aaa4 100644 --- a/build_tools/get_formatted_docstring_types.py +++ b/build_tools/get_formatted_docstring_types.py @@ -3,7 +3,7 @@ import argparse import inspect -from sklearn.utils._typing import get_docstring_annotations +from sklearn.utils.tests.test_typing import get_docstring_annotations parser = argparse.ArgumentParser( description=("Generates typed docstring for a specific scikit-learn " diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index 59f9324891e41..f24ec8e767a67 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -1,5 +1,10 @@ +import inspect +import numbers +import typing + from typing import TYPE_CHECKING from typing import Union +from typing import Any import numpy as np @@ -9,3 +14,100 @@ from typing_extensions import Literal # type: ignore # noqa else: Literal = None + + +def _get_annotation_class_name(annotation): + """Get class name for annnotation""" + if annotation is None: + return 'None' + elif annotation is Any: + return 'Any' + + if getattr(annotation, '__qualname__', None): + return annotation.__qualname__ + elif getattr(annotation, '_name', None): + # generic for >= 3.7 + return annotation._name + + origin = getattr(annotation, '__origin__', None) + if origin: + return _get_annotation_class_name(annotation.__origin__) + + # generic for < 3.7 (Literal) + return annotation.__class__.__qualname__.lstrip('_') + + +def _format_docstring_annotation(annotation): + """Convert annotation to docstring.""" + class_name = _get_annotation_class_name(annotation) + + if class_name == 'BaseEstimator': + return 'estimator instance' + elif class_name == 'NoneType': + return 'None' + elif class_name == 'RandomState': + return 'RandomState instance' + elif class_name == 'Union': + values = [_format_docstring_annotation(t) for t in annotation.__args__] + if len(values) == 2: + return ' or '.join(values) + first = ', '.join(values[:-1]) + return f'{first} or {values[-1]}' + elif class_name == 'Literal': + if hasattr(annotation, '__values__'): + # For Python == 3.6 support + args = annotation.__values__ + else: + args = annotation.__args__ + items = [repr(t) for t in args] + if len(items) == 1: + return items[0] + values = ', '.join(items) + return f'{{{values}}}' + elif class_name == 'List': + values = ', '.join(_format_docstring_annotation(t) + for t in annotation.__args__) + return f'list of {values}' + + return class_name + + +def get_docstring_annotations(obj): + """Get human readable docstring for types for a obj with annotations. + + This function requires `typing_extensions` to be installed to run. + + Parameters + ---------- + obj: object + Object to get annotations from + + Returns + ------- + output: dict + dictionary mapping from name to human-readable docstring. + """ + if not hasattr(obj, '__annotations__'): + return {} + + from typing_extensions import Literal + annotations = typing.get_type_hints(obj, {'Literal': Literal}) + # get defaults + params = inspect.signature(obj).parameters + defaults = {p: v.default for p, v in params.items() + if v.default != inspect.Parameter.empty} + + output = {} + for name, annotation in annotations.items(): + anno = _format_docstring_annotation(annotation) + if name in defaults: + default = defaults[name] + if (isinstance(default, numbers.Real) and + not isinstance(default, numbers.Integral)): + # For floats the representation can vary, i.e: + # default=np.inf or default=1e-4 + anno += ", default=" + else: + anno += f", default={repr(default)}" + output[name] = anno + return output diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py index 0ccde7ea6e742..27de1c1a6458c 100644 --- a/sklearn/utils/tests/test_typing.py +++ b/sklearn/utils/tests/test_typing.py @@ -1,6 +1,3 @@ -import inspect -import numbers -import typing from typing import Dict from typing import Any from typing import List @@ -14,101 +11,9 @@ from sklearn.base import BaseEstimator from sklearn.utils._typing import RandomState from sklearn.utils._typing import Literal - - -def _get_annotation_class_name(annotation): - """Get class name for annnotation""" - if annotation is None: - return 'None' - elif annotation is Any: - return 'Any' - - if getattr(annotation, '__qualname__', None): - return annotation.__qualname__ - elif getattr(annotation, '_name', None): - # generic for >= 3.7 - return annotation._name - - origin = getattr(annotation, '__origin__', None) - if origin: - return _get_annotation_class_name(annotation.__origin__) - - # generic for < 3.7 (Literal) - return annotation.__class__.__qualname__.lstrip('_') - - -def _format_docstring_annotation(annotation): - """Convert annotation to docstring.""" - class_name = _get_annotation_class_name(annotation) - - if class_name == 'BaseEstimator': - return 'estimator instance' - elif class_name == 'NoneType': - return 'None' - elif class_name == 'RandomState': - return 'RandomState instance' - elif class_name == 'Union': - values = [_format_docstring_annotation(t) for t in annotation.__args__] - if len(values) == 2: - return ' or '.join(values) - first = ', '.join(values[:-1]) - return f'{first} or {values[-1]}' - elif class_name == 'Literal': - if hasattr(annotation, '__values__'): - # For Python == 3.6 support - args = annotation.__values__ - else: - args = annotation.__args__ - items = [repr(t) for t in args] - if len(items) == 1: - return items[0] - values = ', '.join(items) - return f'{{{values}}}' - elif class_name == 'List': - values = ', '.join(_format_docstring_annotation(t) - for t in annotation.__args__) - return f'list of {values}' - - return class_name - - -def get_docstring_annotations(obj): - """Get human readable docstring for types for a obj with annotations. - - Parameters - ---------- - obj: object - Object to get annotations from - - Returns - ------- - output: dict - dictionary mapping from name to human-readable docstring. - """ - if not hasattr(obj, '__annotations__'): - return {} - - from typing_extensions import Literal - annotations = typing.get_type_hints(obj, {'Literal': Literal}) - # get defaults - params = inspect.signature(obj).parameters - defaults = {p: v.default for p, v in params.items() - if v.default != inspect.Parameter.empty} - - output = {} - for name, annotation in annotations.items(): - anno = _format_docstring_annotation(annotation) - if name in defaults: - default = defaults[name] - if (isinstance(default, numbers.Real) and - not isinstance(default, numbers.Integral)): - # For floats the representation can vary, i.e: - # default=np.inf or default=1e-4 - anno += ", default=" - else: - anno += f", default={repr(default)}" - output[name] = anno - return output +from sklearn.utils._typing import _get_annotation_class_name +from sklearn.utils._typing import _format_docstring_annotation +from sklearn.utils._typing import get_docstring_annotations @pytest.mark.parametrize("annotation, expected_class", [ From e2334eca5476d8678b2f35548c70d325c621910b Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 Jan 2021 20:16:35 -0500 Subject: [PATCH 34/38] ENH Simplify handling of typing --- sklearn/linear_model/_logistic.py | 19 ++--- sklearn/utils/_typing.py | 111 ++++++++++++++++++++--------- sklearn/utils/tests/test_typing.py | 40 +++++------ 3 files changed, 104 insertions(+), 66 deletions(-) diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 0a35b2da09329..53e9a21fa0393 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -20,7 +20,6 @@ from scipy.special import expit, logsumexp from joblib import Parallel, effective_n_jobs -from ..utils._typing import RandomState from ._base import LinearClassifierMixin, SparseCoefMixin, BaseEstimator from ._sag import sag_solver from ..preprocessing import LabelEncoder, LabelBinarizer @@ -36,6 +35,7 @@ from ..utils.multiclass import check_classification_targets from ..utils.fixes import _joblib_parallel_args from ..utils._typing import Literal +from ..utils._typing import RandomStateType from ..utils.fixes import delayed from ..model_selection import check_cv from ..metrics import get_scorer @@ -1255,26 +1255,21 @@ class LogisticRegression(LinearClassifierMixin, >>> clf.score(X, y) 0.97... """ - classes_: np.ndarray - coef_: np.ndarray - intercept_: np.ndarray - n_iter_: np.ndarray - @_deprecate_positional_args def __init__(self, - penalty: "Literal['l1', 'l2', 'elasticnet', 'none']" = 'l2', + penalty: Literal["l1", "l2", "elasticnet", "none"] = "l2", *, dual: bool = False, tol: float = 1e-4, C: float = 1.0, fit_intercept: bool = True, intercept_scaling: float = 1, - class_weight: Union[dict, "Literal['balanced']"] = None, - random_state: RandomState = None, - solver: "Literal['newton-cg', 'lbfgs', 'liblinear', 'sag', " - "'saga']" = 'lbfgs', + class_weight: Union[dict, Literal["balanced"]] = None, + random_state: RandomStateType = None, + solver: Literal["newton-cg", "lbfgs", "liblinear", "sag", + "saga"] = "lbfgs", max_iter: int = 100, - multi_class: "Literal['auto', 'ovr', 'multinomial']" = 'auto', + multi_class: Literal["auto", "ovr", "multinomial"] = "auto", verbose: int = 0, warm_start: bool = False, n_jobs: Optional[int] = None, diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index f24ec8e767a67..a281d81ebfb70 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -1,58 +1,101 @@ import inspect import numbers import typing - -from typing import TYPE_CHECKING from typing import Union -from typing import Any +from typing import TypeVar import numpy as np -RandomState = Union[int, np.random.RandomState, None] -if TYPE_CHECKING: - from typing_extensions import Literal # type: ignore # noqa +try: + import typing_extensions # noqa + + TYPING_EXTENSION_INSTALLED = True +except ImportError: + TYPING_EXTENSION_INSTALLED = False + + +if typing.TYPE_CHECKING or TYPING_EXTENSION_INSTALLED: + from typing_extensions import Literal # noqa else: - Literal = None + class _SimpleLiteral: + def __getitem__(self, values): + return typing.Any + + Literal = _SimpleLiteral() + + +ArrayLike = TypeVar("ArrayLike") +NDArray = TypeVar("NDArray") +EstimatorType = TypeVar("EstimatorType") +JoblibMemory = TypeVar("JoblibMemory") +RandomStateType = Union[int, np.random.RandomState, None] +MemoryType = Union[str, JoblibMemory, None] -def _get_annotation_class_name(annotation): - """Get class name for annnotation""" - if annotation is None: - return 'None' - elif annotation is Any: - return 'Any' - if getattr(annotation, '__qualname__', None): +def get_annotation_class_name(annotation) -> str: + # Special cases + if annotation is None or annotation is type(None): # noqa + return "None" + + if getattr(annotation, "__qualname__", None): return annotation.__qualname__ - elif getattr(annotation, '_name', None): - # generic for >= 3.7 + elif getattr( + annotation, "_name", None + ): # Required for generic aliases on Python 3.7+ return annotation._name - origin = getattr(annotation, '__origin__', None) + origin = getattr(annotation, "__origin__", None) if origin: - return _get_annotation_class_name(annotation.__origin__) + if getattr(origin, "__qualname__", None): + # Required for Protocol subclasses + return origin.__qualname__ + elif getattr(origin, "_name", None): + # Required for Union on Python 3.7+ + return origin._name + else: + return origin.__class__.__qualname__.lstrip( + "_" + ) # Required for Union on Python < 3.7 - # generic for < 3.7 (Literal) - return annotation.__class__.__qualname__.lstrip('_') + annotation_cls = (annotation + if inspect.isclass(annotation) else annotation.__class__) + return annotation_cls.__qualname__.lstrip("_") -def _format_docstring_annotation(annotation): +def format_docstring_annotation(annotation): """Convert annotation to docstring.""" - class_name = _get_annotation_class_name(annotation) - - if class_name == 'BaseEstimator': - return 'estimator instance' - elif class_name == 'NoneType': - return 'None' - elif class_name == 'RandomState': - return 'RandomState instance' + + # handle some annotations directly + if annotation == np.random.RandomState: + return "RandomState instance" + + class_name = get_annotation_class_name(annotation) + + if class_name == "None": + return "None" + elif class_name == "TypeVar": + name = annotation.__name__ + if name == "EstimatorType": + return "estimator instance" + elif name == "ArrayLike": + return "array-like" + elif name == "NDArray": + return "ndarray" + elif name == "JoblibMemory": + return "object with the joblib.Memory interface" + else: + raise ValueError(f"Unrecognized TypeVar: {annotation}") + elif class_name == 'Union': - values = [_format_docstring_annotation(t) for t in annotation.__args__] + values = [format_docstring_annotation(t) + for t in annotation.__args__] if len(values) == 2: return ' or '.join(values) first = ', '.join(values[:-1]) return f'{first} or {values[-1]}' + elif class_name == 'Literal': if hasattr(annotation, '__values__'): # For Python == 3.6 support @@ -64,8 +107,9 @@ def _format_docstring_annotation(annotation): return items[0] values = ', '.join(items) return f'{{{values}}}' + elif class_name == 'List': - values = ', '.join(_format_docstring_annotation(t) + values = ', '.join(format_docstring_annotation(t) for t in annotation.__args__) return f'list of {values}' @@ -90,8 +134,7 @@ def get_docstring_annotations(obj): if not hasattr(obj, '__annotations__'): return {} - from typing_extensions import Literal - annotations = typing.get_type_hints(obj, {'Literal': Literal}) + annotations = typing.get_type_hints(obj) # get defaults params = inspect.signature(obj).parameters defaults = {p: v.default for p, v in params.items() @@ -99,7 +142,7 @@ def get_docstring_annotations(obj): output = {} for name, annotation in annotations.items(): - anno = _format_docstring_annotation(annotation) + anno = format_docstring_annotation(annotation) if name in defaults: default = defaults[name] if (isinstance(default, numbers.Real) and diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py index 27de1c1a6458c..3ae9a8482ae8e 100644 --- a/sklearn/utils/tests/test_typing.py +++ b/sklearn/utils/tests/test_typing.py @@ -1,5 +1,4 @@ from typing import Dict -from typing import Any from typing import List from typing import Union from typing import Callable @@ -8,22 +7,21 @@ import pytest import numpy as np -from sklearn.base import BaseEstimator -from sklearn.utils._typing import RandomState +from sklearn.utils._typing import RandomStateType +from sklearn.utils._typing import EstimatorType from sklearn.utils._typing import Literal -from sklearn.utils._typing import _get_annotation_class_name -from sklearn.utils._typing import _format_docstring_annotation +from sklearn.utils._typing import get_annotation_class_name +from sklearn.utils._typing import format_docstring_annotation from sklearn.utils._typing import get_docstring_annotations @pytest.mark.parametrize("annotation, expected_class", [ (None, 'None'), - (Any, 'Any'), (str, 'str'), (int, 'int'), (float, 'float'), (list, 'list'), - (BaseEstimator, 'BaseEstimator'), + (EstimatorType, 'TypeVar'), (List[int], 'List'), (Union[int, float], 'Union'), (Dict, 'Dict'), @@ -31,12 +29,13 @@ (Callable[[str], str], 'Callable'), ]) def test_get_annotation_class_name(annotation, expected_class): - assert _get_annotation_class_name(annotation) == expected_class + """Test annotation names are returned correct.""" + assert get_annotation_class_name(annotation) == expected_class @pytest.mark.parametrize("annotation, expected_str", [ (None, "None"), - (BaseEstimator, "estimator instance"), + (EstimatorType, "estimator instance"), (np.random.RandomState, "RandomState instance"), (int, "int"), (float, 'float'), @@ -44,30 +43,31 @@ def test_get_annotation_class_name(annotation, expected_class): (str, "str"), (List[int], "list of int"), (Optional[List[int]], "list of int or None"), - (List[BaseEstimator], "list of estimator instance"), - (Optional[BaseEstimator], "estimator instance or None"), + (List[EstimatorType], "list of estimator instance"), + (Optional[EstimatorType], "estimator instance or None"), (Union[int, float], "int or float"), - (RandomState, "int, RandomState instance or None") + (RandomStateType, "int, RandomState instance or None") ]) def test_format_docstring_annotation(annotation, expected_str): - assert _format_docstring_annotation(annotation) == expected_str + """Check format for auto generation annotations.""" + assert format_docstring_annotation(annotation) == expected_str -class TestObject: +class _TypingObject: def __init__(self, - estimator: BaseEstimator, + estimator: EstimatorType, num: int = 10, union_num: Union[int, float] = 1.4, float_num: float = 1e-4, - pet: "Literal['dog']" = 'dog', - weather: "Literal['sunny', 'cloudy']" = 'sunny', - random_state: RandomState = None): + pet: Literal['dog'] = 'dog', + weather: Literal['sunny', 'cloudy'] = 'sunny', + random_state: RandomStateType = None): pass def test_get_docstring_annotations(): - # get_docstring_annotations needs typing_extensions for Literal + """Check docstring for annotations.""" pytest.importorskip("typing_extensions") - annotations = get_docstring_annotations(TestObject.__init__) + annotations = get_docstring_annotations(_TypingObject.__init__) assert annotations['estimator'] == "estimator instance" assert annotations['num'] == "int, default=10" From 459c0bf1dfc190d3f41cb354f958a172c0abeff1 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 Jan 2021 20:18:01 -0500 Subject: [PATCH 35/38] ENH Better diffs --- build_tools/azure/install.cmd | 4 ++-- setup.cfg | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/build_tools/azure/install.cmd b/build_tools/azure/install.cmd index 5046bcd802300..caf28261dcc9f 100644 --- a/build_tools/azure/install.cmd +++ b/build_tools/azure/install.cmd @@ -15,7 +15,7 @@ IF "%PYTHON_ARCH%"=="64" ( call activate %VIRTUALENV% - pip install threadpoolctl typing-extensions + pip install threadpoolctl IF "%PYTEST_VERSION%"=="*" ( pip install pytest @@ -23,7 +23,7 @@ IF "%PYTHON_ARCH%"=="64" ( pip install pytest==%PYTEST_VERSION% ) ) else ( - pip install numpy scipy cython pytest wheel pillow joblib threadpoolctl typing-extensions + pip install numpy scipy cython pytest wheel pillow joblib threadpoolctl ) IF "%PYTEST_XDIST%" == "true" ( diff --git a/setup.cfg b/setup.cfg index 23eb66feca6e1..1ec80252f08f3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,9 +28,6 @@ artifact_indexes= [flake8] # Default flake8 3.5 ignored flags ignore=E121,E123,E126,E226,E24,E704,W503,W504 -[mypy-sklearn.externals.*] -check_untyped_defs=False -ignore_errors=True # It's fine not to put the import at the top of the file in the examples # folder. @@ -41,6 +38,10 @@ per-file-ignores = ignore_missing_imports = True allow_redefinition = True +[mypy-sklearn.externals.*] +check_untyped_defs=False +ignore_errors=True + [check-manifest] # ignore files missing in VCS ignore = From 39d3fea5269afdf40e8a5ee460b61c38091af584 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 Jan 2021 20:22:11 -0500 Subject: [PATCH 36/38] ENH Simplify to only hyper-parameters --- setup.cfg | 1 - sklearn/linear_model/_logistic.py | 2 +- sklearn/tests/test_docstring_types.py | 21 ++++++++------------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/setup.cfg b/setup.cfg index 1ec80252f08f3..f3e2a49f367ad 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,7 +28,6 @@ artifact_indexes= [flake8] # Default flake8 3.5 ignored flags ignore=E121,E123,E126,E226,E24,E704,W503,W504 - # It's fine not to put the import at the top of the file in the examples # folder. per-file-ignores = diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index 53e9a21fa0393..f85538db810a7 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -1100,7 +1100,7 @@ class LogisticRegression(LinearClassifierMixin, data. See :term:`Glossary ` for details. solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, \ - default='lbfgs' + default='lbfgs' Algorithm to use in the optimization problem. diff --git a/sklearn/tests/test_docstring_types.py b/sklearn/tests/test_docstring_types.py index d97f2f3e7e019..0e41a6da898fe 100644 --- a/sklearn/tests/test_docstring_types.py +++ b/sklearn/tests/test_docstring_types.py @@ -59,6 +59,8 @@ "GraphicalLasso", "GraphicalLassoCV", "GridSearchCV", + "HalvingGridSearchCV", + "HalvingRandomSearchCV", "HashingVectorizer", "HistGradientBoostingClassifier", "HistGradientBoostingRegressor", @@ -167,6 +169,7 @@ "RidgeClassifier", "RidgeClassifierCV", "RobustScaler", + "SelfTrainingClassifier", "SGDClassifier", "SGDRegressor", "SVC", @@ -188,6 +191,7 @@ "SpectralClustering", "SpectralCoclustering", "SpectralEmbedding", + "SplineTransformer", "StackingClassifier", "StackingRegressor", "StandardScaler", @@ -218,26 +222,17 @@ def test_estimators_typestring(name, Estimator): doc = docscrape.ClassDoc(Estimator) parameters = doc['Parameters'] parameter_annnotations = get_docstring_annotations(Estimator.__init__) - _check_annotations(parameters, parameter_annnotations) + assert len(parameters) == len(parameter_annnotations) - attributes = doc['Attributes'] - attribute_annotations = get_docstring_annotations(Estimator) - _check_annotations(attributes, attribute_annotations) - - -def _check_annotations(docstring_items, expected_annotations): - - assert len(docstring_items) == len(expected_annotations) - - for item in docstring_items: + for item in parameters: name, type_str = item.name, item.type # skip annotations with "shape of" for now, this can be added when # we support Annotated - if "of shape" in type_str: + if "shape of" in type_str: continue # whitespaces are collapsed to one whitespace type_str = ' '.join(item.type.split()) - assert type_str.startswith(expected_annotations[name]), ( + assert type_str.startswith(parameter_annnotations[name]), ( f"{name} has incorrectly formated docstring") From b43dd9c6c113f2b50fc3a332bac0506fa49d936f Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Sun, 24 Jan 2021 20:30:21 -0500 Subject: [PATCH 37/38] ENH Adds cvsplit --- sklearn/utils/_typing.py | 19 +++++++++++++++++++ sklearn/utils/tests/test_typing.py | 5 ++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py index a281d81ebfb70..c5fe8fdd46fbd 100644 --- a/sklearn/utils/_typing.py +++ b/sklearn/utils/_typing.py @@ -3,6 +3,7 @@ import typing from typing import Union from typing import TypeVar +from typing import Iterator import numpy as np @@ -26,6 +27,20 @@ def __getitem__(self, values): Literal = _SimpleLiteral() +if typing.TYPE_CHECKING or TYPING_EXTENSION_INSTALLED: + from typing_extensions import Protocol # noqa + + class CVSplitter(Protocol): + def get_n_splits(self): + """Get the number of splits.""" + + def split(self, X, y=None, groups=None): + """Split data""" + +else: + CVSplitter = TypeVar("CVSplitter") # typing: ignore + +CVType = Union[int, CVSplitter, Iterator, None] ArrayLike = TypeVar("ArrayLike") NDArray = TypeVar("NDArray") EstimatorType = TypeVar("EstimatorType") @@ -70,6 +85,10 @@ def format_docstring_annotation(annotation): # handle some annotations directly if annotation == np.random.RandomState: return "RandomState instance" + elif annotation == CVSplitter: + return "cross-validation generator" + elif annotation == Iterator: + return "iterable" class_name = get_annotation_class_name(annotation) diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py index 3ae9a8482ae8e..571a44de3a658 100644 --- a/sklearn/utils/tests/test_typing.py +++ b/sklearn/utils/tests/test_typing.py @@ -13,6 +13,7 @@ from sklearn.utils._typing import get_annotation_class_name from sklearn.utils._typing import format_docstring_annotation from sklearn.utils._typing import get_docstring_annotations +from sklearn.utils._typing import CVType @pytest.mark.parametrize("annotation, expected_class", [ @@ -26,6 +27,7 @@ (Union[int, float], 'Union'), (Dict, 'Dict'), (Callable, 'Callable'), + (CVType, 'Union'), (Callable[[str], str], 'Callable'), ]) def test_get_annotation_class_name(annotation, expected_class): @@ -46,7 +48,8 @@ def test_get_annotation_class_name(annotation, expected_class): (List[EstimatorType], "list of estimator instance"), (Optional[EstimatorType], "estimator instance or None"), (Union[int, float], "int or float"), - (RandomStateType, "int, RandomState instance or None") + (RandomStateType, "int, RandomState instance or None"), + (CVType, "int, cross-validation generator, iterable or None") ]) def test_format_docstring_annotation(annotation, expected_str): """Check format for auto generation annotations.""" From 09cad30e9185b4e6c2255e826e05ca93871c5a48 Mon Sep 17 00:00:00 2001 From: "Thomas J. Fan" Date: Fri, 11 Jun 2021 09:44:22 -0400 Subject: [PATCH 38/38] CLN Smaller diff --- build_tools/azure/install.sh | 3 +- build_tools/get_formatted_docstring_types.py | 29 --- sklearn/_min_dependencies.py | 1 - sklearn/linear_model/_logistic.py | 17 +- sklearn/tests/test_docstring_types.py | 238 ------------------- sklearn/utils/_typing.py | 175 -------------- sklearn/utils/tests/test_typing.py | 82 ------- 7 files changed, 11 insertions(+), 534 deletions(-) delete mode 100644 build_tools/get_formatted_docstring_types.py delete mode 100644 sklearn/tests/test_docstring_types.py delete mode 100644 sklearn/utils/_typing.py delete mode 100644 sklearn/utils/tests/test_typing.py diff --git a/build_tools/azure/install.sh b/build_tools/azure/install.sh index 18e5abcd47972..048ffe300ee2a 100755 --- a/build_tools/azure/install.sh +++ b/build_tools/azure/install.sh @@ -115,8 +115,7 @@ fi python -m pip install $(get_dep threadpoolctl $THREADPOOLCTL_VERSION) \ $(get_dep pytest $PYTEST_VERSION) \ - $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) \ - typing-extensions + $(get_dep pytest-xdist $PYTEST_XDIST_VERSION) if [[ "$COVERAGE" == "true" ]]; then python -m pip install codecov pytest-cov diff --git a/build_tools/get_formatted_docstring_types.py b/build_tools/get_formatted_docstring_types.py deleted file mode 100644 index 204bc2871aaa4..0000000000000 --- a/build_tools/get_formatted_docstring_types.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Helper script to get the docstring type of estimator or function.""" -from importlib import import_module -import argparse -import inspect - -from sklearn.utils.tests.test_typing import get_docstring_annotations - -parser = argparse.ArgumentParser( - description=("Generates typed docstring for a specific scikit-learn " - "class or function")) -parser.add_argument('object', help=("scikit-learn object, for example " - "linear_model.LogisticRegression")) - -args = parser.parse_args() -object_input = args.object -object_split = object_input.split(".") - -module = "sklearn." + ".".join(object_split[:-1]) -obj_str = object_split[-1] -obj = getattr(import_module(module), obj_str) - -print("Parameters") -print("----------") -if inspect.isclass(obj): - formatted_annotations = get_docstring_annotations(obj.__init__) -else: # function - formatted_annotations = get_docstring_annotations(obj) -for name, annotation in formatted_annotations.items(): - print(f"{name} : {annotation}") diff --git a/sklearn/_min_dependencies.py b/sklearn/_min_dependencies.py index 4f7b024aa1a62..6a6ff13c479d1 100644 --- a/sklearn/_min_dependencies.py +++ b/sklearn/_min_dependencies.py @@ -39,7 +39,6 @@ 'sphinx-gallery': ('0.7.0', 'docs'), 'numpydoc': ('1.0.0', 'docs'), 'Pillow': ('7.1.2', 'docs'), - 'typing-extensions': ('3.7.4', 'tests'), 'sphinx-prompt': ('1.3.0', 'docs'), } diff --git a/sklearn/linear_model/_logistic.py b/sklearn/linear_model/_logistic.py index cd452aba703f4..bc46f9bc2e9d3 100644 --- a/sklearn/linear_model/_logistic.py +++ b/sklearn/linear_model/_logistic.py @@ -9,6 +9,10 @@ # Lars Buitinck # Simon Wu # Arthur Mensch ` for details. @@ -1179,7 +1181,7 @@ class LogisticRegression(LinearClassifierMixin, .. versionadded:: 0.17 *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers. - n_jobs : int or None, default=None + n_jobs : int, default=None Number of CPU cores used when parallelizing over classes if multi_class='ovr'". This parameter is ignored when the ``solver`` is set to 'liblinear' regardless of whether 'multi_class' is specified or @@ -1187,7 +1189,7 @@ class LogisticRegression(LinearClassifierMixin, context. ``-1`` means using all processors. See :term:`Glossary ` for more details. - l1_ratio : float or None, default=None + l1_ratio : float, default=None The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent @@ -1197,7 +1199,7 @@ class LogisticRegression(LinearClassifierMixin, Attributes ---------- - classes_ : ndarray of shape (n_classes,) + classes_ : ndarray of shape (n_classes, ) A list of class labels known to the classifier. coef_ : ndarray of shape (1, n_features) or (n_classes, n_features) @@ -1295,7 +1297,7 @@ def __init__(self, fit_intercept: bool = True, intercept_scaling: float = 1, class_weight: Union[dict, Literal["balanced"]] = None, - random_state: RandomStateType = None, + random_state: Union[int, np.random.RandomState, None] = None, solver: Literal["newton-cg", "lbfgs", "liblinear", "sag", "saga"] = "lbfgs", max_iter: int = 100, @@ -1304,6 +1306,7 @@ def __init__(self, warm_start: bool = False, n_jobs: Optional[int] = None, l1_ratio: Optional[float] = None): + self.penalty = penalty self.dual = dual self.tol = tol diff --git a/sklearn/tests/test_docstring_types.py b/sklearn/tests/test_docstring_types.py deleted file mode 100644 index 0e41a6da898fe..0000000000000 --- a/sklearn/tests/test_docstring_types.py +++ /dev/null @@ -1,238 +0,0 @@ -import pytest - -from sklearn.utils import all_estimators -from sklearn.utils._typing import get_docstring_annotations - - -TYPING_IGNORED = { - "ARDRegression", - "AdaBoostClassifier", - "AdaBoostRegressor", - "AdditiveChi2Sampler", - "AffinityPropagation", - "AgglomerativeClustering", - "BaggingClassifier", - "BaggingRegressor", - "BayesianGaussianMixture", - "BayesianRidge", - "BernoulliNB", - "BernoulliRBM", - "Binarizer", - "Birch", - "CCA", - "CalibratedClassifierCV", - "CategoricalNB", - "ClassifierChain", - "ColumnTransformer", - "ComplementNB", - "CountVectorizer", - "DBSCAN", - "DecisionTreeClassifier", - "DecisionTreeRegressor", - "DictVectorizer", - "DictionaryLearning", - "DummyClassifier", - "DummyRegressor", - "ElasticNet", - "ElasticNetCV", - "EllipticEnvelope", - "EmpiricalCovariance", - "ExtraTreeClassifier", - "ExtraTreeRegressor", - "ExtraTreesClassifier", - "ExtraTreesRegressor", - "FactorAnalysis", - "FastICA", - "FeatureAgglomeration", - "FeatureHasher", - "FeatureUnion", - "FunctionTransformer", - "GammaRegressor", - "GaussianMixture", - "GaussianNB", - "GaussianProcessClassifier", - "GaussianProcessRegressor", - "GaussianRandomProjection", - "GenericUnivariateSelect", - "GradientBoostingClassifier", - "GradientBoostingRegressor", - "GraphicalLasso", - "GraphicalLassoCV", - "GridSearchCV", - "HalvingGridSearchCV", - "HalvingRandomSearchCV", - "HashingVectorizer", - "HistGradientBoostingClassifier", - "HistGradientBoostingRegressor", - "HuberRegressor", - "IncrementalPCA", - "IsolationForest", - "Isomap", - "IsotonicRegression", - "IterativeImputer", - "KBinsDiscretizer", - "KMeans", - "KNNImputer", - "KNeighborsClassifier", - "KNeighborsRegressor", - "KNeighborsTransformer", - "KernelCenterer", - "KernelDensity", - "KernelPCA", - "KernelRidge", - "LabelBinarizer", - "LabelEncoder", - "LabelPropagation", - "LabelSpreading", - "Lars", - "LarsCV", - "Lasso", - "LassoCV", - "LassoLars", - "LassoLarsCV", - "LassoLarsIC", - "LatentDirichletAllocation", - "LedoitWolf", - "LinearDiscriminantAnalysis", - "LinearRegression", - "LinearSVC", - "LinearSVR", - "LocalOutlierFactor", - "LocallyLinearEmbedding", - "LogisticRegressionCV", - "MDS", - "MLPClassifier", - "MLPRegressor", - "MaxAbsScaler", - "MeanShift", - "MinCovDet", - "MinMaxScaler", - "MiniBatchDictionaryLearning", - "MiniBatchKMeans", - "MiniBatchSparsePCA", - "MissingIndicator", - "MultiLabelBinarizer", - "MultiOutputClassifier", - "MultiOutputRegressor", - "MultiTaskElasticNet", - "MultiTaskElasticNetCV", - "MultiTaskLasso", - "MultiTaskLassoCV", - "MultinomialNB", - "NMF", - "NearestCentroid", - "NearestNeighbors", - "NeighborhoodComponentsAnalysis", - "Normalizer", - "NuSVC", - "NuSVR", - "Nystroem", - "OAS", - "OPTICS", - "OneClassSVM", - "OneHotEncoder", - "OneVsOneClassifier", - "OneVsRestClassifier", - "OrdinalEncoder", - "OrthogonalMatchingPursuit", - "OrthogonalMatchingPursuitCV", - "OutputCodeClassifier", - "PolynomialCountSketch", - "PCA", - "PLSCanonical", - "PLSRegression", - "PLSSVD", - "PassiveAggressiveClassifier", - "PassiveAggressiveRegressor", - "PatchExtractor", - "Perceptron", - "Pipeline", - "PoissonRegressor", - "PolynomialFeatures", - "PowerTransformer", - "QuadraticDiscriminantAnalysis", - "QuantileTransformer", - "RANSACRegressor", - "RBFSampler", - "RFE", - "RFECV", - "RadiusNeighborsClassifier", - "RadiusNeighborsRegressor", - "RadiusNeighborsTransformer", - "RandomForestClassifier", - "RandomForestRegressor", - "RandomTreesEmbedding", - "RandomizedSearchCV", - "RegressorChain", - "Ridge", - "RidgeCV", - "RidgeClassifier", - "RidgeClassifierCV", - "RobustScaler", - "SelfTrainingClassifier", - "SGDClassifier", - "SGDRegressor", - "SVC", - "SVR", - "SelectFdr", - "SelectFpr", - "SelectFromModel", - "SelectFwe", - "SelectKBest", - "SelectPercentile", - "SequentialFeatureSelector", - "ShrunkCovariance", - "SimpleImputer", - "SkewedChi2Sampler", - "SparseCoder", - "SparsePCA", - "SparseRandomProjection", - "SpectralBiclustering", - "SpectralClustering", - "SpectralCoclustering", - "SpectralEmbedding", - "SplineTransformer", - "StackingClassifier", - "StackingRegressor", - "StandardScaler", - "TSNE", - "TfidfTransformer", - "TfidfVectorizer", - "TheilSenRegressor", - "TransformedTargetRegressor", - "TruncatedSVD", - "TweedieRegressor", - "VarianceThreshold", - "VotingClassifier", - "VotingRegressor", -} - - -@pytest.mark.parametrize( - 'name, Estimator', [ - pytest.param( - name, Estimator, marks=pytest.mark.skipif( - name in TYPING_IGNORED, - reason="Estimator does not have annotations")) - for name, Estimator in all_estimators()]) -def test_estimators_typestring(name, Estimator): - # Check that docstring's type is formated correctly - docscrape = pytest.importorskip('numpydoc.docscrape') - - doc = docscrape.ClassDoc(Estimator) - parameters = doc['Parameters'] - parameter_annnotations = get_docstring_annotations(Estimator.__init__) - assert len(parameters) == len(parameter_annnotations) - - for item in parameters: - name, type_str = item.name, item.type - - # skip annotations with "shape of" for now, this can be added when - # we support Annotated - if "shape of" in type_str: - continue - - # whitespaces are collapsed to one whitespace - type_str = ' '.join(item.type.split()) - assert type_str.startswith(parameter_annnotations[name]), ( - f"{name} has incorrectly formated docstring") diff --git a/sklearn/utils/_typing.py b/sklearn/utils/_typing.py deleted file mode 100644 index c5fe8fdd46fbd..0000000000000 --- a/sklearn/utils/_typing.py +++ /dev/null @@ -1,175 +0,0 @@ -import inspect -import numbers -import typing -from typing import Union -from typing import TypeVar -from typing import Iterator - -import numpy as np - - -try: - import typing_extensions # noqa - - TYPING_EXTENSION_INSTALLED = True -except ImportError: - TYPING_EXTENSION_INSTALLED = False - - -if typing.TYPE_CHECKING or TYPING_EXTENSION_INSTALLED: - from typing_extensions import Literal # noqa -else: - - class _SimpleLiteral: - def __getitem__(self, values): - return typing.Any - - Literal = _SimpleLiteral() - - -if typing.TYPE_CHECKING or TYPING_EXTENSION_INSTALLED: - from typing_extensions import Protocol # noqa - - class CVSplitter(Protocol): - def get_n_splits(self): - """Get the number of splits.""" - - def split(self, X, y=None, groups=None): - """Split data""" - -else: - CVSplitter = TypeVar("CVSplitter") # typing: ignore - -CVType = Union[int, CVSplitter, Iterator, None] -ArrayLike = TypeVar("ArrayLike") -NDArray = TypeVar("NDArray") -EstimatorType = TypeVar("EstimatorType") -JoblibMemory = TypeVar("JoblibMemory") -RandomStateType = Union[int, np.random.RandomState, None] -MemoryType = Union[str, JoblibMemory, None] - - -def get_annotation_class_name(annotation) -> str: - # Special cases - if annotation is None or annotation is type(None): # noqa - return "None" - - if getattr(annotation, "__qualname__", None): - return annotation.__qualname__ - elif getattr( - annotation, "_name", None - ): # Required for generic aliases on Python 3.7+ - return annotation._name - - origin = getattr(annotation, "__origin__", None) - if origin: - if getattr(origin, "__qualname__", None): - # Required for Protocol subclasses - return origin.__qualname__ - elif getattr(origin, "_name", None): - # Required for Union on Python 3.7+ - return origin._name - else: - return origin.__class__.__qualname__.lstrip( - "_" - ) # Required for Union on Python < 3.7 - - annotation_cls = (annotation - if inspect.isclass(annotation) else annotation.__class__) - return annotation_cls.__qualname__.lstrip("_") - - -def format_docstring_annotation(annotation): - """Convert annotation to docstring.""" - - # handle some annotations directly - if annotation == np.random.RandomState: - return "RandomState instance" - elif annotation == CVSplitter: - return "cross-validation generator" - elif annotation == Iterator: - return "iterable" - - class_name = get_annotation_class_name(annotation) - - if class_name == "None": - return "None" - elif class_name == "TypeVar": - name = annotation.__name__ - if name == "EstimatorType": - return "estimator instance" - elif name == "ArrayLike": - return "array-like" - elif name == "NDArray": - return "ndarray" - elif name == "JoblibMemory": - return "object with the joblib.Memory interface" - else: - raise ValueError(f"Unrecognized TypeVar: {annotation}") - - elif class_name == 'Union': - values = [format_docstring_annotation(t) - for t in annotation.__args__] - if len(values) == 2: - return ' or '.join(values) - first = ', '.join(values[:-1]) - return f'{first} or {values[-1]}' - - elif class_name == 'Literal': - if hasattr(annotation, '__values__'): - # For Python == 3.6 support - args = annotation.__values__ - else: - args = annotation.__args__ - items = [repr(t) for t in args] - if len(items) == 1: - return items[0] - values = ', '.join(items) - return f'{{{values}}}' - - elif class_name == 'List': - values = ', '.join(format_docstring_annotation(t) - for t in annotation.__args__) - return f'list of {values}' - - return class_name - - -def get_docstring_annotations(obj): - """Get human readable docstring for types for a obj with annotations. - - This function requires `typing_extensions` to be installed to run. - - Parameters - ---------- - obj: object - Object to get annotations from - - Returns - ------- - output: dict - dictionary mapping from name to human-readable docstring. - """ - if not hasattr(obj, '__annotations__'): - return {} - - annotations = typing.get_type_hints(obj) - # get defaults - params = inspect.signature(obj).parameters - defaults = {p: v.default for p, v in params.items() - if v.default != inspect.Parameter.empty} - - output = {} - for name, annotation in annotations.items(): - anno = format_docstring_annotation(annotation) - if name in defaults: - default = defaults[name] - if (isinstance(default, numbers.Real) and - not isinstance(default, numbers.Integral)): - # For floats the representation can vary, i.e: - # default=np.inf or default=1e-4 - anno += ", default=" - else: - anno += f", default={repr(default)}" - output[name] = anno - return output diff --git a/sklearn/utils/tests/test_typing.py b/sklearn/utils/tests/test_typing.py deleted file mode 100644 index 571a44de3a658..0000000000000 --- a/sklearn/utils/tests/test_typing.py +++ /dev/null @@ -1,82 +0,0 @@ -from typing import Dict -from typing import List -from typing import Union -from typing import Callable -from typing import Optional - -import pytest -import numpy as np - -from sklearn.utils._typing import RandomStateType -from sklearn.utils._typing import EstimatorType -from sklearn.utils._typing import Literal -from sklearn.utils._typing import get_annotation_class_name -from sklearn.utils._typing import format_docstring_annotation -from sklearn.utils._typing import get_docstring_annotations -from sklearn.utils._typing import CVType - - -@pytest.mark.parametrize("annotation, expected_class", [ - (None, 'None'), - (str, 'str'), - (int, 'int'), - (float, 'float'), - (list, 'list'), - (EstimatorType, 'TypeVar'), - (List[int], 'List'), - (Union[int, float], 'Union'), - (Dict, 'Dict'), - (Callable, 'Callable'), - (CVType, 'Union'), - (Callable[[str], str], 'Callable'), -]) -def test_get_annotation_class_name(annotation, expected_class): - """Test annotation names are returned correct.""" - assert get_annotation_class_name(annotation) == expected_class - - -@pytest.mark.parametrize("annotation, expected_str", [ - (None, "None"), - (EstimatorType, "estimator instance"), - (np.random.RandomState, "RandomState instance"), - (int, "int"), - (float, 'float'), - (list, "list"), - (str, "str"), - (List[int], "list of int"), - (Optional[List[int]], "list of int or None"), - (List[EstimatorType], "list of estimator instance"), - (Optional[EstimatorType], "estimator instance or None"), - (Union[int, float], "int or float"), - (RandomStateType, "int, RandomState instance or None"), - (CVType, "int, cross-validation generator, iterable or None") -]) -def test_format_docstring_annotation(annotation, expected_str): - """Check format for auto generation annotations.""" - assert format_docstring_annotation(annotation) == expected_str - - -class _TypingObject: - def __init__(self, - estimator: EstimatorType, - num: int = 10, union_num: Union[int, float] = 1.4, - float_num: float = 1e-4, - pet: Literal['dog'] = 'dog', - weather: Literal['sunny', 'cloudy'] = 'sunny', - random_state: RandomStateType = None): - pass - - -def test_get_docstring_annotations(): - """Check docstring for annotations.""" - pytest.importorskip("typing_extensions") - annotations = get_docstring_annotations(_TypingObject.__init__) - - assert annotations['estimator'] == "estimator instance" - assert annotations['num'] == "int, default=10" - assert annotations['float_num'] == "float, default=" - assert annotations['union_num'] == "int or float, default=" - assert annotations['pet'] == "'dog', default='dog'" - assert annotations['weather'] == "{'sunny', 'cloudy'}, default='sunny'" - assert annotations['random_state'] == ("int, RandomState instance or None" - ", default=None")