From d8f4c29d7db66b90610093187b639f217e65c398 Mon Sep 17 00:00:00 2001 From: Frans Larsson Date: Wed, 19 Aug 2020 23:12:31 +0200 Subject: [PATCH 1/6] DOC Fix doc of defaults in grower.py --- .../_hist_gradient_boosting/grower.py | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index e0b54550d3082..feb35825a0f2e 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -1,7 +1,7 @@ """ This module contains the TreeGrower class. -TreeGrowee builds a regression tree fitting a Newton-Raphson step, based on +TreeGrower builds a regression tree fitting a Newton-Raphson step, based on the gradients and hessians of the training data. """ # Author: Nicolas Hug @@ -33,20 +33,20 @@ class TreeNode: ---------- depth : int The depth of the node, i.e. its distance from the root. - sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) + sample_indices : ndarray of unsigned int of shape (n_samples_at_node,) The indices of the samples at the node. sum_gradients : float The sum of the gradients of the samples at the node. sum_hessians : float The sum of the hessians of the samples at the node. - parent : TreeNode or None, optional (default=None) + parent : TreeNode, default=None The parent of the node. None for root. Attributes ---------- depth : int The depth of the node, i.e. its distance from the root. - sample_indices : ndarray of unsigned int, shape (n_samples_at_node,) + sample_indices : ndarray of unsigned int of shape (n_samples_at_node,) The indices of the samples at the node. sum_gradients : float The sum of the gradients of the samples at the node. @@ -134,45 +134,50 @@ class TreeGrower: Parameters ---------- - X_binned : ndarray of int, shape (n_samples, n_features) + X_binned : ndarray of int of shape (n_samples, n_features) The binned input samples. 
Must be Fortran-aligned. - gradients : ndarray, shape (n_samples,) + gradients : ndarray of shape (n_samples,) The gradients of each training sample. Those are the gradients of the loss w.r.t the predictions, evaluated at iteration ``i - 1``. - hessians : ndarray, shape (n_samples,) + hessians : ndarray of shape (n_samples,) The hessians of each training sample. Those are the hessians of the loss w.r.t the predictions, evaluated at iteration ``i - 1``. - max_leaf_nodes : int or None, optional (default=None) + max_leaf_nodes : int, default=None The maximum number of leaves for each tree. If None, there is no maximum limit. - max_depth : int or None, optional (default=None) + max_depth : int, default=None The maximum depth of each tree. The depth of a tree is the number of edges to go from the root to the deepest leaf. Depth isn't constrained by default. - min_samples_leaf : int, optional (default=20) + min_samples_leaf : int, default=20 The minimum number of samples per leaf. - min_gain_to_split : float, optional (default=0.) + min_gain_to_split : float, default=0. The minimum gain needed to split a node. Splits with lower gain will be ignored. - n_bins : int, optional (default=256) + n_bins : int, default=256 The total number of bins, including the bin for missing values. Used to define the shape of the histograms. - n_bins_non_missing_ : array of uint32 + n_bins_non_missing : ndarray, dtype=np.uint32, default=None For each feature, gives the number of bins actually used for non-missing values. For features with a lot of unique values, this is equal to ``n_bins - 1``. If it's an int, all features are considered to have the same number of bins. If None, all features are considered to have ``n_bins - 1`` bins. - has_missing_values : ndarray of bool or bool, optional (default=False) + has_missing_values : bool or ndarray of bool, default=False Whether each feature contains missing values (in the training data). If it's a bool, the same value is used for all features. 
- l2_regularization : float, optional (default=0) + monotonic_cst : array-like of int of shape (n_features), default=None + Indicates the monotonic constraint to enforce on each feature. -1, 1 + and 0 respectively correspond to a negative constraint, positive + constraint and no constraint. Read more in the :ref:`User Guide + <monotonic_cst_gbdt>`. + l2_regularization : float, default=0. The L2 regularization parameter. - min_hessian_to_split : float, optional (default=1e-3) + min_hessian_to_split : float, default=1e-3 The minimum sum of hessians needed in each node. Splits that result in at least one child having a sum of hessians less than ``min_hessian_to_split`` are discarded. - shrinkage : float, optional (default=1) + shrinkage : float, default=1. The shrinkage parameter to apply to the leaves values, also known as learning rate. """ @@ -508,7 +513,7 @@ def make_predictor(self, bin_thresholds=None): Parameters ---------- - bin_thresholds : array-like of floats, optional (default=None) + bin_thresholds : array-like of floats, default=None The actual thresholds values of each bin. Returns From baace64ad0adfc950716bfbd52dba586509782c6 Mon Sep 17 00:00:00 2001 From: Frans Larsson Date: Thu, 20 Aug 2020 20:39:03 +0200 Subject: [PATCH 2/6] Add dtype for bool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Juan Carlos Alfaro Jiménez --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index feb35825a0f2e..a2eb766bdd6cd 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -163,7 +163,7 @@ class TreeGrower: is equal to ``n_bins - 1``. If it's an int, all features are considered to have the same number of bins. If None, all features are considered to have ``n_bins - 1`` bins.
- has_missing_values : bool or ndarray of bool, default=False + has_missing_values : bool or ndarray, dtype=bool, default=False Whether each feature contains missing values (in the training data). If it's a bool, the same value is used for all features. monotonic_cst : array-like of int of shape (n_features), default=None From 705a622d6f387b1b91220679dd7cd99017f11555 Mon Sep 17 00:00:00 2001 From: Frans Larsson Date: Thu, 20 Aug 2020 20:42:05 +0200 Subject: [PATCH 3/6] Add dtype for unsigned int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Juan Carlos Alfaro Jiménez --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index a2eb766bdd6cd..5fe1fefb8d9e3 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -46,7 +46,7 @@ class TreeNode: ---------- depth : int The depth of the node, i.e. its distance from the root. - sample_indices : ndarray of unsigned int of shape (n_samples_at_node,) + sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint The indices of the samples at the node. sum_gradients : float The sum of the gradients of the samples at the node. 
From c18018c1c03c4b1703cbb32b3caef87f42c34242 Mon Sep 17 00:00:00 2001 From: Frans Larsson Date: Thu, 20 Aug 2020 20:43:10 +0200 Subject: [PATCH 4/6] Add dtype for unsigned int MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Juan Carlos Alfaro Jiménez --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 5fe1fefb8d9e3..12effc2ca3f7f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -33,7 +33,7 @@ class TreeNode: ---------- depth : int The depth of the node, i.e. its distance from the root. - sample_indices : ndarray of unsigned int of shape (n_samples_at_node,) + sample_indices : ndarray of shape (n_samples_at_node,), dtype=np.uint The indices of the samples at the node. sum_gradients : float The sum of the gradients of the samples at the node. From 2f0f1362eee1e3f1dc80660474d198fe72141ec4 Mon Sep 17 00:00:00 2001 From: Frans Larsson Date: Tue, 25 Aug 2020 21:44:59 +0200 Subject: [PATCH 5/6] Update dtype to uint8 --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 9ecc4acdbdbd0..1ee7e5f7eea6f 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -134,7 +134,7 @@ class TreeGrower: Parameters ---------- - X_binned : ndarray of int of shape (n_samples, n_features) + X_binned : ndarray of shape (n_samples, n_features), dtype=np.uint8 The binned input samples. Must be Fortran-aligned. gradients : ndarray of shape (n_samples,) The gradients of each training sample. 
Those are the gradients of the From c76d672ed758d45f7c74a7cb28a0ca9e35565bd6 Mon Sep 17 00:00:00 2001 From: Frans Larsson Date: Tue, 25 Aug 2020 21:48:15 +0200 Subject: [PATCH 6/6] Minor update to docstring Co-authored-by: Guillaume Lemaitre --- sklearn/ensemble/_hist_gradient_boosting/grower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sklearn/ensemble/_hist_gradient_boosting/grower.py b/sklearn/ensemble/_hist_gradient_boosting/grower.py index 1ee7e5f7eea6f..58b0c3020e548 100644 --- a/sklearn/ensemble/_hist_gradient_boosting/grower.py +++ b/sklearn/ensemble/_hist_gradient_boosting/grower.py @@ -166,7 +166,7 @@ class TreeGrower: has_missing_values : bool or ndarray, dtype=bool, default=False Whether each feature contains missing values (in the training data). If it's a bool, the same value is used for all features. - monotonic_cst : array-like of int of shape (n_features), default=None + monotonic_cst : array-like of shape (n_features,), dtype=int, default=None Indicates the monotonic constraint to enforce on each feature. -1, 1 and 0 respectively correspond to a negative constraint, positive constraint and no constraint. Read more in the :ref:`User Guide