DOC update and improve the `sample_weight` entry in the glossary #30564

ogrisel · 2024-12-31T17:00:27Z

Link to practical usage examples.
Removed some FIXMEs and TODOs.
Fixed the description of the min_samples in DBSCAN and contrasted it with the min_sample_leaf / min_weight_fraction_in_leaf parameters pair.
Refine the description of the interplay with the class_weight parameter.
Reference ongoing work and the tracking meta-issue.

github-actions · 2024-12-31T17:01:41Z

❌ Linting issues

This PR is introducing linting issues. Here's a summary of the issues. Note that you can avoid having linting issues by enabling pre-commit hooks. Instructions to enable them can be found here.

You can see the details of the linting issues under the lint job here

`ruff format`

ruff detected issues. Please run ruff format locally and push the changes. Here you can see the detected issues. Note that the installed ruff version is ruff=0.5.1.


--- build_tools/circle/list_versions.py
+++ build_tools/circle/list_versions.py
@@ -71,9 +71,7 @@
     "Web-based documentation is available for versions listed below:\n",
 ]
 
-ROOT_URL = (
-    "https://api.github.com/repos/scikit-learn/scikit-learn.github.io/contents/"  # noqa
-)
+ROOT_URL = "https://api.github.com/repos/scikit-learn/scikit-learn.github.io/contents/"  # noqa
 RAW_FMT = "https://raw.githubusercontent.com/scikit-learn/scikit-learn.github.io/master/%s/index.html"  # noqa
 VERSION_RE = re.compile(r"scikit-learn ([\w\.\-]+) documentation</title>")
 NAMED_DIRS = ["dev", "stable"]

--- examples/applications/plot_species_distribution_modeling.py
+++ examples/applications/plot_species_distribution_modeling.py
@@ -109,7 +109,7 @@
 
 
 def plot_species_distribution(
-    species=("bradypus_variegatus_0", "microryzomys_minutus_0")
+    species=("bradypus_variegatus_0", "microryzomys_minutus_0"),
 ):
     """
     Plot the species distribution.

--- examples/ensemble/plot_bias_variance.py
+++ examples/ensemble/plot_bias_variance.py
@@ -177,8 +177,8 @@
 
     plt.subplot(2, n_estimators, n_estimators + n + 1)
     plt.plot(X_test, y_error, "r", label="$error(x)$")
-    plt.plot(X_test, y_bias, "b", label="$bias^2(x)$"),
-    plt.plot(X_test, y_var, "g", label="$variance(x)$"),
+    (plt.plot(X_test, y_bias, "b", label="$bias^2(x)$"),)
+    (plt.plot(X_test, y_var, "g", label="$variance(x)$"),)
     plt.plot(X_test, y_noise, "c", label="$noise(x)$")
 
     plt.xlim([-5, 5])

--- examples/linear_model/plot_tweedie_regression_insurance_claims.py
+++ examples/linear_model/plot_tweedie_regression_insurance_claims.py
@@ -604,8 +604,9 @@
             "predicted, frequency*severity model": np.sum(
                 exposure * glm_freq.predict(X) * glm_sev.predict(X)
             ),
-            "predicted, tweedie, power=%.2f"
-            % glm_pure_premium.power: np.sum(exposure * glm_pure_premium.predict(X)),
+            "predicted, tweedie, power=%.2f" % glm_pure_premium.power: np.sum(
+                exposure * glm_pure_premium.predict(X)
+            ),
         }
     )
 

--- examples/manifold/plot_lle_digits.py
+++ examples/manifold/plot_lle_digits.py
@@ -10,7 +10,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 # %%
 # Load digits dataset
 # -------------------

--- examples/manifold/plot_manifold_sphere.py
+++ examples/manifold/plot_manifold_sphere.py
@@ -50,7 +50,7 @@
 t = random_state.rand(n_samples) * np.pi
 
 # Sever the poles from the sphere.
-indices = (t < (np.pi - (np.pi / 8))) & (t > ((np.pi / 8)))
+indices = (t < (np.pi - (np.pi / 8))) & (t > (np.pi / 8))
 colors = p[indices]
 x, y, z = (
     np.sin(t[indices]) * np.cos(p[indices]),

--- sklearn/_loss/tests/test_loss.py
+++ sklearn/_loss/tests/test_loss.py
@@ -215,7 +215,8 @@
 
 
 @pytest.mark.parametrize(
-    "loss, y_pred_success, y_pred_fail", Y_COMMON_PARAMS + Y_PRED_PARAMS  # type: ignore
+    "loss, y_pred_success, y_pred_fail",
+    Y_COMMON_PARAMS + Y_PRED_PARAMS,  # type: ignore
 )
 def test_loss_boundary_y_pred(loss, y_pred_success, y_pred_fail):
     """Test boundaries of y_pred for loss functions."""
@@ -501,12 +502,14 @@
         sample_weight=sample_weight,
         loss_out=out_l1,
     )
-    loss.closs.loss(
-        y_true=y_true,
-        raw_prediction=raw_prediction,
-        sample_weight=sample_weight,
-        loss_out=out_l2,
-    ),
+    (
+        loss.closs.loss(
+            y_true=y_true,
+            raw_prediction=raw_prediction,
+            sample_weight=sample_weight,
+            loss_out=out_l2,
+        ),
+    )
     assert_allclose(out_l1, out_l2)
     loss.gradient(
         y_true=y_true,

--- sklearn/cluster/_feature_agglomeration.py
+++ sklearn/cluster/_feature_agglomeration.py
@@ -6,7 +6,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import numpy as np
 from scipy.sparse import issparse
 

--- sklearn/cross_decomposition/tests/test_pls.py
+++ sklearn/cross_decomposition/tests/test_pls.py
@@ -404,12 +404,12 @@
 
     X_orig = X.copy()
     with pytest.raises(AssertionError):
-        pls.transform(X, Y, copy=False),
+        (pls.transform(X, Y, copy=False),)
         assert_array_almost_equal(X, X_orig)
 
     X_orig = X.copy()
     with pytest.raises(AssertionError):
-        pls.predict(X, copy=False),
+        (pls.predict(X, copy=False),)
         assert_array_almost_equal(X, X_orig)
 
     # Make sure copy=True gives same transform and predictions as predict=False

--- sklearn/ensemble/_bagging.py
+++ sklearn/ensemble/_bagging.py
@@ -3,7 +3,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import itertools
 import numbers
 from abc import ABCMeta, abstractmethod

--- sklearn/ensemble/_forest.py
+++ sklearn/ensemble/_forest.py
@@ -35,7 +35,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import threading
 from abc import ABCMeta, abstractmethod
 from numbers import Integral, Real

--- sklearn/ensemble/tests/test_forest.py
+++ sklearn/ensemble/tests/test_forest.py
@@ -168,11 +168,12 @@
     reg = ForestRegressor(n_estimators=5, criterion=criterion, random_state=1)
     reg.fit(X_reg, y_reg)
     score = reg.score(X_reg, y_reg)
-    assert (
-        score > 0.93
-    ), "Failed with max_features=None, criterion %s and score = %f" % (
-        criterion,
-        score,
+    assert score > 0.93, (
+        "Failed with max_features=None, criterion %s and score = %f"
+        % (
+            criterion,
+            score,
+        )
     )
 
     reg = ForestRegressor(

--- sklearn/experimental/enable_hist_gradient_boosting.py
+++ sklearn/experimental/enable_hist_gradient_boosting.py
@@ -13,7 +13,6 @@
 # Don't remove this file, we don't want to break users code just because the
 # feature isn't experimental anymore.
 
-
 import warnings
 
 warnings.warn(

--- sklearn/feature_selection/_univariate_selection.py
+++ sklearn/feature_selection/_univariate_selection.py
@@ -3,7 +3,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import warnings
 from numbers import Integral, Real
 

--- sklearn/gaussian_process/tests/test_gpc.py
+++ sklearn/gaussian_process/tests/test_gpc.py
@@ -147,8 +147,9 @@
     # Define a dummy optimizer that simply tests 10 random hyperparameters
     def optimizer(obj_func, initial_theta, bounds):
         rng = np.random.RandomState(global_random_seed)
-        theta_opt, func_min = initial_theta, obj_func(
-            initial_theta, eval_gradient=False
+        theta_opt, func_min = (
+            initial_theta,
+            obj_func(initial_theta, eval_gradient=False),
         )
         for _ in range(10):
             theta = np.atleast_1d(

--- sklearn/gaussian_process/tests/test_gpr.py
+++ sklearn/gaussian_process/tests/test_gpr.py
@@ -394,8 +394,9 @@
     # Define a dummy optimizer that simply tests 50 random hyperparameters
     def optimizer(obj_func, initial_theta, bounds):
         rng = np.random.RandomState(0)
-        theta_opt, func_min = initial_theta, obj_func(
-            initial_theta, eval_gradient=False
+        theta_opt, func_min = (
+            initial_theta,
+            obj_func(initial_theta, eval_gradient=False),
         )
         for _ in range(50):
             theta = np.atleast_1d(

--- sklearn/linear_model/_linear_loss.py
+++ sklearn/linear_model/_linear_loss.py
@@ -537,9 +537,9 @@
                 # The L2 penalty enters the Hessian on the diagonal only. To add those
                 # terms, we use a flattened view of the array.
                 order = "C" if hess.flags.c_contiguous else "F"
-                hess.reshape(-1, order=order)[
-                    : (n_features * n_dof) : (n_dof + 1)
-                ] += l2_reg_strength
+                hess.reshape(-1, order=order)[: (n_features * n_dof) : (n_dof + 1)] += (
+                    l2_reg_strength
+                )
 
             if self.fit_intercept:
                 # With intercept included as added column to X, the hessian becomes

--- sklearn/linear_model/_ridge.py
+++ sklearn/linear_model/_ridge.py
@@ -5,7 +5,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import numbers
 import warnings
 from abc import ABCMeta, abstractmethod

--- sklearn/linear_model/_theil_sen.py
+++ sklearn/linear_model/_theil_sen.py
@@ -5,7 +5,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import warnings
 from itertools import combinations
 from numbers import Integral, Real

--- sklearn/manifold/_spectral_embedding.py
+++ sklearn/manifold/_spectral_embedding.py
@@ -3,7 +3,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import warnings
 from numbers import Integral, Real
 

--- sklearn/metrics/_classification.py
+++ sklearn/metrics/_classification.py
@@ -10,7 +10,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import warnings
 from numbers import Integral, Real
 

--- sklearn/metrics/_ranking.py
+++ sklearn/metrics/_ranking.py
@@ -10,7 +10,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import warnings
 from functools import partial
 from numbers import Integral, Real

--- sklearn/metrics/cluster/_supervised.py
+++ sklearn/metrics/cluster/_supervised.py
@@ -7,7 +7,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import warnings
 from math import log
 from numbers import Real

--- sklearn/metrics/cluster/_unsupervised.py
+++ sklearn/metrics/cluster/_unsupervised.py
@@ -3,7 +3,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import functools
 from numbers import Integral
 

--- sklearn/metrics/tests/test_common.py
+++ sklearn/metrics/tests/test_common.py
@@ -976,7 +976,8 @@
 @pytest.mark.parametrize("metric", CLASSIFICATION_METRICS.values())
 @pytest.mark.parametrize(
     "y_true, y_score",
-    invalids_nan_inf +
+    invalids_nan_inf
+    +
     # Add an additional case for classification only
     # non-regression test for:
     # https://github.com/scikit-learn/scikit-learn/issues/6809
@@ -2060,7 +2061,6 @@
 
 
 def check_array_api_metric_pairwise(metric, array_namespace, device, dtype_name):
-
     X_np = np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]], dtype=dtype_name)
     Y_np = np.array([[0.2, 0.3, 0.4], [0.5, 0.6, 0.7]], dtype=dtype_name)
 

--- sklearn/model_selection/_validation.py
+++ sklearn/model_selection/_validation.py
@@ -6,7 +6,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 import numbers
 import time
 import warnings

--- sklearn/multioutput.py
+++ sklearn/multioutput.py
@@ -8,7 +8,6 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause
 
-
 from abc import ABCMeta, abstractmethod
 from numbers import Integral
 

--- sklearn/neighbors/tests/test_neighbors.py
+++ sklearn/neighbors/tests/test_neighbors.py
@@ -655,10 +655,12 @@
             assert_allclose(np.concatenate(list(ind)), np.concatenate(list(ind1)))
 
         for i in range(len(results) - 1):
-            assert_allclose(
-                np.concatenate(list(results[i][0])),
-                np.concatenate(list(results[i + 1][0])),
-            ),
+            (
+                assert_allclose(
+                    np.concatenate(list(results[i][0])),
+                    np.concatenate(list(results[i + 1][0])),
+                ),
+            )
             assert_allclose(
                 np.concatenate(list(results[i][1])),
                 np.concatenate(list(results[i + 1][1])),

--- sklearn/tests/test_common.py
+++ sklearn/tests/test_common.py
@@ -298,7 +298,6 @@
     "transformer", GET_FEATURES_OUT_ESTIMATORS, ids=_get_check_estimator_ids
 )
 def test_transformers_get_feature_names_out(transformer):
-
     with ignore_warnings(category=(FutureWarning)):
         check_transformer_get_feature_names_out(
             transformer.__class__.__name__, transformer

--- sklearn/tests/test_metaestimators.py
+++ sklearn/tests/test_metaestimators.py
@@ -157,11 +157,12 @@
             if method in delegator_data.skip_methods:
                 continue
             assert hasattr(delegate, method)
-            assert hasattr(
-                delegator, method
-            ), "%s does not have method %r when its delegate does" % (
-                delegator_data.name,
-                method,
+            assert hasattr(delegator, method), (
+                "%s does not have method %r when its delegate does"
+                % (
+                    delegator_data.name,
+                    method,
+                )
             )
             # delegation before fit raises a NotFittedError
             if method == "score":
@@ -191,11 +192,12 @@
             delegate = SubEstimator(hidden_method=method)
             delegator = delegator_data.construct(delegate)
             assert not hasattr(delegate, method)
-            assert not hasattr(
-                delegator, method
-            ), "%s has method %r when its delegate does not" % (
-                delegator_data.name,
-                method,
+            assert not hasattr(delegator, method), (
+                "%s has method %r when its delegate does not"
+                % (
+                    delegator_data.name,
+                    method,
+                )
             )
 
 

--- sklearn/utils/_metadata_requests.py
+++ sklearn/utils/_metadata_requests.py
@@ -1098,8 +1098,9 @@
             method_mapping = MethodMapping()
             for method in METHODS:
                 method_mapping.add(caller=method, callee=method)
-            yield "$self_request", RouterMappingPair(
-                mapping=method_mapping, router=self._self_request
+            yield (
+                "$self_request",
+                RouterMappingPair(mapping=method_mapping, router=self._self_request),
             )
         for name, route_mapping in self._route_mappings.items():
             yield (name, route_mapping)

--- sklearn/utils/estimator_checks.py
+++ sklearn/utils/estimator_checks.py
@@ -5287,9 +5287,7 @@
                 'Only binary classification is supported. The type of the target '
                 f'is {{y_type}}.'
         )
-    """.format(
-        name=name
-    )
+    """.format(name=name)
     err_msg = textwrap.dedent(err_msg)
 
     with raises(

--- sklearn/utils/tests/test_multiclass.py
+++ sklearn/utils/tests/test_multiclass.py
@@ -416,12 +416,13 @@
 def test_type_of_target():
     for group, group_examples in EXAMPLES.items():
         for example in group_examples:
-            assert (
-                type_of_target(example) == group
-            ), "type_of_target(%r) should be %r, got %r" % (
-                example,
-                group,
-                type_of_target(example),
+            assert type_of_target(example) == group, (
+                "type_of_target(%r) should be %r, got %r"
+                % (
+                    example,
+                    group,
+                    type_of_target(example),
+                )
             )
 
     for example in NON_ARRAY_LIKE_EXAMPLES:

--- sklearn/utils/tests/test_seq_dataset.py
+++ sklearn/utils/tests/test_seq_dataset.py
@@ -154,30 +154,34 @@
 
 def test_buffer_dtype_mismatch_error():
     with pytest.raises(ValueError, match="Buffer dtype mismatch"):
-        ArrayDataset64(X32, y32, sample_weight32, seed=42),
+        (ArrayDataset64(X32, y32, sample_weight32, seed=42),)
 
     with pytest.raises(ValueError, match="Buffer dtype mismatch"):
-        ArrayDataset32(X64, y64, sample_weight64, seed=42),
+        (ArrayDataset32(X64, y64, sample_weight64, seed=42),)
 
     for csr_container in CSR_CONTAINERS:
         X_csr32 = csr_container(X32)
         X_csr64 = csr_container(X64)
         with pytest.raises(ValueError, match="Buffer dtype mismatch"):
-            CSRDataset64(
-                X_csr32.data,
-                X_csr32.indptr,
-                X_csr32.indices,
-                y32,
-                sample_weight32,
-                seed=42,
-            ),
+            (
+                CSRDataset64(
+                    X_csr32.data,
+                    X_csr32.indptr,
+                    X_csr32.indices,
+                    y32,
+                    sample_weight32,
+                    seed=42,
+                ),
+            )
 
         with pytest.raises(ValueError, match="Buffer dtype mismatch"):
-            CSRDataset32(
-                X_csr64.data,
-                X_csr64.indptr,
-                X_csr64.indices,
-                y64,
-                sample_weight64,
-                seed=42,
-            ),
+            (
+                CSRDataset32(
+                    X_csr64.data,
+                    X_csr64.indptr,
+                    X_csr64.indices,
+                    y64,
+                    sample_weight64,
+                    seed=42,
+                ),
+            )

--- sklearn/utils/tests/test_tags.py
+++ sklearn/utils/tests/test_tags.py
@@ -554,7 +554,6 @@
     assert _to_new_tags(_to_old_tags(new_tags), estimator=estimator) == new_tags
 
     class MyClass:
-
         def fit(self, X, y=None):
             return self  # pragma: no cover
 

--- sklearn/utils/validation.py
+++ sklearn/utils/validation.py
@@ -1547,8 +1547,7 @@
         # hasattr(estimator, "fit") makes it so that we don't fail for an estimator
         # that does not have a `fit` method during collection of checks. The right
         # checks will fail later.
-        hasattr(estimator, "fit")
-        and parameter in signature(estimator.fit).parameters
+        hasattr(estimator, "fit") and parameter in signature(estimator.fit).parameters
     )
 
 

36 files would be reformatted, 886 files already formatted

_{Generated for commit: 55e5e0a. Link to the linter CI: here}

ogrisel · 2025-01-02T15:21:43Z

For information, for the examples of sample weight usage, we could be more specifice, for instance:

Survival Analysis: accelerated failure time modeling via a reduction to weighted linear regression
- AFT:
  - paper: https://www.sciencedirect.com/science/article/pii/S0047259X83710286?via%3Dihub
  - code: https://github.com/sebp/scikit-survival/
    blob/7c8bdb95fc5df4b24e19342d8021501cba3b0cf9/sksurv/linear_model/aft.py#L184-L185
- Hazardous' SurvivalBoost:
  - https://hal.science/hal-04617672
  - code: https://github.com/soda-inria/hazardous/blob/main/hazardous/_survival_boost.py#L444
Causal Inference: double Machine Learning
- paper: https://arxiv.org/abs/1712.04912
- code: https://github.com/py-why/EconML/blob/d995bf32f7867512d98461b544dba3e66b30a702/econml/dml/dml.py#L175-L181
Unfairness mitigation: exponentiated gradient reduction
- paper: http://proceedings.mlr.press/v80/agarwal18a.html
- code: https://github.com/fairlearn/fairlearn/blob/main/fairlearn/reductions/_exponentiated_gradient/_lagrangian.py#L222

But linking to the papers does not necessarily make it explicit that this can be implemented via scikit-learn's sample_weight and the code of those libraries is not always easy to understand without reading the papers first and furthermore I am not sure how to link to a source code snippet while making it easy to check that the link stays relevant over time.

virchan

Just wanted to offer a few alternative word choices for consideration.

doc/glossary.rst

lorentzenchr · 2025-01-03T17:25:05Z

Could you wait for my review before merging? Even if it takes 2 weeks?

lorentzenchr

Just a high-level review: This PR makes sample weights the longest entry in the glossary. I would rather prefer a concise entry.

ogrisel · 2025-01-08T09:04:31Z

Just a high-level review: This PR makes sample weights the longest entry in the glossary. I would rather prefer a concise entry.

I could move some of the details to a dedicated section of the user guide (maybe part of the estimator API) and keep a short version in the glossary with a link to the user guide for more details if people prefer.

lorentzenchr · 2025-01-10T17:43:53Z

FYI: In supervised learning, one of the main reasons for using weights, e.g. in insurance frequency models (Poisson loss), is to account for the different size/time/exposure/... of each observation. The assumptions are usually:

$Y=\frac{C}{w}$ is the ratio of interest, C is an amount (counts, EUR, kilogram, ...).
$Var[Y] \sim \frac{1}{w}$

In the end, the weights enter the loss. The idea is that the "correct" weights improve the efficiency of the estimation of the expected loss** (=empirical mean of per observation losses). However, to my knowledge, there is no direct connection between $Var[Y_i]$ and $Var[loss_i]$ , see Section 5.2.2 in https://arxiv.org/abs/2202.12780.

**expected loss = statistical risk

doc/glossary.rst

GaelVaroquaux

I would also mention reweighting for transfer learning, and define in one sentence what it means.

Also, I find that this paragraph is very useful, but either put it in a fold or in a section of the docs

StefanieSenger

Thanks @ogrisel. I have added a few comments.

doc/glossary.rst

Co-authored-by: Gael Varoquaux <gael.varoquaux@normalesup.org> Co-authored-by: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com>

ogrisel · 2025-02-24T13:55:53Z

Thanks for the reviews @StefanieSenger @GaelVaroquaux @lorentzenchr. I addressed all the specific feedback. Personally, I am fine with keeping all this info centralized into a largish glossary entry, but I can also split it into a dedicated section in the user guide (where?) and cross-reference if people prefer.

betatim

I think this is a good entry, even though it is quite long.

It would be nicer to have a short glossary entry and a dedicated user guide section. But I think we'd need to put in more work to make the user guide section. For example expanding on the third party paragraph from this PR with some links to examples of third-party estimators (I'm wondering if the text as it is, is too brief for people who aren't experts to understand it and too long for those who are experts/third-party implementers).

The final paragraph on KMeans and sampling is another one that I think we could expand in the user guide. At least I am not sure I fully understand what it is trying to tell me without an example.

However, maybe we should merge this now already and make an issue to improve the user-guide and reduce the glossary entry. Mostly because making the user guide entry would be quite a bit of additional work

adrinjalali · 2025-04-30T09:06:31Z

doc/glossary.rst

+        floats, so that sample weights are usually equivalent up to a constant
+        positive scaling factor.


equivalent to what? I don't understand this sentence.

Invariance to the scale of weights is actually not guaranteed for all estimators or hyperparameter choices. So let's be more generic.

Suggested change

floats, so that sample weights are usually equivalent up to a constant

positive scaling factor.

floats to express fractional relative importance of data points with respect

to one another others.

Do you mean to say this?

Suggested change

floats, so that sample weights are usually equivalent up to a constant

positive scaling factor.

floats, since for many estimators, only the relative values of weights matter,

not their absolute scale.

I did not want to explicitly state that because we are not yet sure to which extent this is true or not: we do haven't started testing this aspect systematically, but we know that this is not the case for many important ones such as LogisticRegression and Ridge. Both those estimators have a regularization parameter whose effect depends on the scale of the weights.

doc/glossary.rst

adrinjalali · 2025-04-30T09:10:23Z

doc/glossary.rst

+        Third-party libraries can also use `sample_weight`-compatible
+        estimators as building blocks to reduce a specific statistical task
+        into a weighted regression or classification task. For instance sample
+        weights can be constructed to adjust a time-to-event model for
+        censoring in a predictive survival analysis setting. In causal
+        inference, it is possible to reduce a conditional average treatment
+        effect estimation task to a weighted regression task under some
+        assumptions. Sample weights can also be used to mitigate
+        fairness-related harms based on a given quantitative definition of
+        fairness.


This paragraph doesn't feel like it belongs here.

Personally, I find this paragraph very important: it grounds an abstract concept into practical application cases. Have those use cases in mind is helpful both for users and contributors.

I agree with @adrinjalali, that this paragraph doesn't operate in the same scope as the rest of this glossary entry. A glossary is meant to define terms clearly and serve as a quick lookup.
In the user guide, this highlighting a specific use case would be great in a toggle or visually marked in a different style. Here however, it breaks the format and usability of the glossary.

Point taken: this means we will need a dedicated section in the doc to speak about the semantics and the practical use of the sample weights in more details.

doc/glossary.rst

DOC update and improve the sample_weight entry in the glossary

7771446

github-actions bot added the Documentation label Dec 31, 2024

ogrisel mentioned this pull request Dec 31, 2024

Fix sample weight passing in KBinsDiscretizer #29907

Merged

8 tasks

ogrisel mentioned this pull request Jan 2, 2025

List of estimators with known incorrect handling of sample_weight #16298

Open

54 tasks

ogrisel changed the title ~~DOC update and improve the sample_weight entry in the glossary~~ DOC update and improve the sample_weight entry in the glossary Jan 2, 2025

virchan reviewed Jan 3, 2025

View reviewed changes

doc/glossary.rst Outdated Show resolved Hide resolved

doc/glossary.rst Outdated Show resolved Hide resolved

doc/glossary.rst Outdated Show resolved Hide resolved

Improve phrasing

876261d

ogrisel commented Jan 3, 2025

View reviewed changes

doc/glossary.rst Outdated Show resolved Hide resolved

Typo

a93a221

ogrisel commented Jan 3, 2025

View reviewed changes

doc/glossary.rst Show resolved Hide resolved

lorentzenchr reviewed Jan 6, 2025

View reviewed changes

ogrisel mentioned this pull request Jan 10, 2025

SLEP006: default routing #26179

Open

GaelVaroquaux reviewed Jan 15, 2025

View reviewed changes

doc/glossary.rst Outdated Show resolved Hide resolved

GaelVaroquaux reviewed Jan 15, 2025

View reviewed changes

StefanieSenger reviewed Jan 16, 2025

View reviewed changes

doc/glossary.rst Outdated Show resolved Hide resolved

doc/glossary.rst Outdated Show resolved Hide resolved

doc/glossary.rst Outdated Show resolved Hide resolved

ogrisel and others added 4 commits February 24, 2025 14:32

Apply suggestions from code review

865eed7

Co-authored-by: Gael Varoquaux <gael.varoquaux@normalesup.org> Co-authored-by: Stefanie Senger <91849487+StefanieSenger@users.noreply.github.com>

Reorder paragraphs and reformat

1ece29e

Simpler phrasing

ef31eb9

Cross-reference metadata-routing

5e2d5f6

betatim approved these changes Feb 26, 2025

View reviewed changes

ogrisel added the Waiting for Second Reviewer First reviewer is done, need a second one! label Mar 11, 2025

ogrisel mentioned this pull request Mar 27, 2025

TargetEncoder should respect sample_weights #28881

Open

adrinjalali reviewed Apr 30, 2025

View reviewed changes

ogrisel mentioned this pull request Jun 16, 2025

FIX Draw indices using sample_weight in Bagging #31414

Merged

ogrisel commented Jun 18, 2025

View reviewed changes

doc/glossary.rst Outdated Show resolved Hide resolved

Apply suggestions from code review

d592044

ogrisel commented Jun 18, 2025

View reviewed changes

doc/glossary.rst Outdated Show resolved Hide resolved

Apply suggestions from code review

55e5e0a

		floats, so that sample weights are usually equivalent up to a constant
		positive scaling factor.

Uh oh!

DOC update and improve the sample_weight entry in the glossary #30564

Are you sure you want to change the base?

DOC update and improve the sample_weight entry in the glossary #30564

Uh oh!

Conversation

ogrisel commented Dec 31, 2024

Uh oh!

github-actions bot commented Dec 31, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

❌ Linting issues

ruff format

Uh oh!

ogrisel commented Jan 2, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

virchan left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

lorentzenchr commented Jan 3, 2025

Uh oh!

lorentzenchr left a comment

Choose a reason for hiding this comment

Uh oh!

ogrisel commented Jan 8, 2025

Uh oh!

lorentzenchr commented Jan 10, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

GaelVaroquaux left a comment • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

StefanieSenger left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

ogrisel commented Feb 24, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

betatim left a comment

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

DOC update and improve the `sample_weight` entry in the glossary #30564

DOC update and improve the `sample_weight` entry in the glossary #30564

github-actions bot commented Dec 31, 2024 •

edited

Loading

`ruff format`

ogrisel commented Jan 2, 2025 •

edited

Loading

lorentzenchr commented Jan 10, 2025 •

edited

Loading

GaelVaroquaux left a comment •

edited

Loading

ogrisel commented Feb 24, 2025 •

edited

Loading