From cbdec3ac62f18187fc5c9b6bf2a839a32d6c315d Mon Sep 17 00:00:00 2001
From: loganthomas
Date: Tue, 10 Jun 2025 19:41:56 -0500
Subject: [PATCH 1/9] doc: update CrossEntropyLoss with note and example of incorrect target specification

---
 torch/nn/modules/loss.py | 42 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 6fa0d53c8a448..4388d7a25f295 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1314,6 +1314,48 @@ class probabilities only when a single class label per minibatch item is too res
         >>> target = torch.randn(3, 5).softmax(dim=1)
         >>> output = loss(input, target)
         >>> output.backward()
+
+    .. note::
+        When providing `target` as class probabilities, it is expected that soft labels are passed
+        (i.e., `target` is a probability distribution over the possible classes for a given data sample
+        where each individual probability is between `[0,1]` and the distribution sums to `1`).
+        Hence the use of `.softmax()` on `target` in the above class probabilities example.
+
+        PyTorch does not validate whether the values provided in `target` lie in the range `[0,1]`
+        or whether they sum to `1`. No warning will be raised and it is the user's responsibility
+        to ensure that `target` is a valid probability distribution. Providing arbitrary values
+        may yield misleading loss values and unstable gradients during training.
+
+    Examples:
+
+        >>> # Example of target with incorrectly specified class probabilities
+        >>> loss = nn.CrossEntropyLoss()
+        >>> input = torch.randn(3, 5, requires_grad=True)
+        >>> target = torch.randn(3, 5)
+        >>> # Provided target class probabilities are not in range [0,1]
+        >>> target
+        tensor([[-0.6846,  1.1029, -0.5028,  0.7858, -1.5158],
+                [ 0.6152, -0.3215,  0.4336, -0.3655,  0.5295],
+                [ 1.0120,  1.5263, -1.6144, -0.2656, -0.7357]])
+        >>> # Provided target class probabilities do not sum to 1
+        >>> target.sum(axis=1)
+        tensor([-0.8146,  0.8914, -0.0774])
+        >>> loss(input, target).item()
+        -0.10109150409698486
+
+        >>> # Example of target with correctly specified class probabilities
+        >>> # Use .softmax() to ensure true probability distribution
+        >>> target_new = target.softmax(dim=1)
+        >>> # New target class probabilities all in range [0,1]
+        >>> target_new
+        tensor([[0.0772, 0.4610, 0.0925, 0.3357, 0.0336],
+                [0.2842, 0.1114, 0.2370, 0.1066, 0.2609],
+                [0.3127, 0.5230, 0.0226, 0.0872, 0.0545]])
+        >>> # New target class probabilities sum to 1
+        >>> target_new.sum(axis=1)
+        tensor([1.0000, 1.0000, 1.0000])
+        >>> loss(input, target).item()
+        -0.10109150409698486
     """
 
     __constants__ = ["ignore_index", "reduction", "label_smoothing"]

From 3be8f91014560842ed474f19e23329e1dcc33b40 Mon Sep 17 00:00:00 2001
From: loganthomas
Date: Tue, 10 Jun 2025 19:48:25 -0500
Subject: [PATCH 2/9] doc: add short note to Target docstring

---
 torch/nn/modules/loss.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 4388d7a25f295..415cb0129a3ef 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1287,7 +1287,9 @@ class probabilities only when a single class label per minibatch item is too res
       :math:`K \geq 1` in the case of K-dimensional loss where each value should be between :math:`[0, C)`. The
       target data type is required to be long when using class indices. If containing class probabilities, the
       target must be the same shape as the input, and each value should be between :math:`[0, 1]`. This means the target
-      data type is required to be float when using class probabilities.
+      data type is required to be float when using class probabilities. Note that PyTorch does not strictly enforce
+      probability constraints on the class probabilities and that it is the user's responsibility to ensure
+      `target` is a valid probability distribution (see the Examples section below for more details).
 
     - Output: If reduction is 'none', shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with
       :math:`K \geq 1` in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar.

From 3da15ba7f9ffedad2ecdae2a453575343bad813f Mon Sep 17 00:00:00 2001
From: loganthomas
Date: Tue, 10 Jun 2025 19:52:57 -0500
Subject: [PATCH 3/9] doc: add comment on no error message and fix new loss

---
 torch/nn/modules/loss.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 415cb0129a3ef..2e6fd29f003cf 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1342,6 +1342,7 @@ class probabilities only when a single class label per minibatch item is too res
         >>> # Provided target class probabilities do not sum to 1
         >>> target.sum(axis=1)
         tensor([-0.8146,  0.8914, -0.0774])
+        >>> # No error message and possibly misleading loss value
         >>> loss(input, target).item()
         -0.10109150409698486
 
@@ -1356,8 +1357,8 @@ class probabilities only when a single class label per minibatch item is too res
         >>> # New target class probabilities sum to 1
         >>> target_new.sum(axis=1)
         tensor([1.0000, 1.0000, 1.0000])
-        >>> loss(input, target).item()
-        -0.10109150409698486
+        >>> loss(input, target_new).item()
+        2.048427104949951
     """
 
     __constants__ = ["ignore_index", "reduction", "label_smoothing"]

From d0ab49e960c552eb14d0c2d364c22e513f413560 Mon Sep 17 00:00:00 2001
From: loganthomas
Date: Wed, 11 Jun 2025 08:27:28 -0500
Subject: [PATCH 4/9] doc: add seed for reproducibility in docstring examples

---
 torch/nn/modules/loss.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 2e6fd29f003cf..0d349a20f2f66 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1332,33 +1332,34 @@ class probabilities only when a single class label per minibatch item is too res
 
         >>> # Example of target with incorrectly specified class probabilities
         >>> loss = nn.CrossEntropyLoss()
+        >>> torch.manual_seed(283)
         >>> input = torch.randn(3, 5, requires_grad=True)
         >>> target = torch.randn(3, 5)
         >>> # Provided target class probabilities are not in range [0,1]
         >>> target
-        tensor([[-0.6846,  1.1029, -0.5028,  0.7858, -1.5158],
-                [ 0.6152, -0.3215,  0.4336, -0.3655,  0.5295],
-                [ 1.0120,  1.5263, -1.6144, -0.2656, -0.7357]])
+        tensor([[ 0.7105,  0.4446,  2.0297,  0.2671, -0.6075],
+                [-1.0496, -0.2753, -0.3586,  0.9270,  1.0027],
+                [ 0.7551,  0.1003,  1.3468, -0.3581, -0.9569]])
         >>> # Provided target class probabilities do not sum to 1
         >>> target.sum(axis=1)
-        tensor([-0.8146,  0.8914, -0.0774])
+        tensor([2.8444, 0.2462, 0.8873])
         >>> # No error message and possibly misleading loss value
         >>> loss(input, target).item()
-        -0.10109150409698486
-
+        4.6379876136779785
+        >>>
         >>> # Example of target with correctly specified class probabilities
         >>> # Use .softmax() to ensure true probability distribution
         >>> target_new = target.softmax(dim=1)
         >>> # New target class probabilities all in range [0,1]
         >>> target_new
-        tensor([[0.0772, 0.4610, 0.0925, 0.3357, 0.0336],
-                [0.2842, 0.1114, 0.2370, 0.1066, 0.2609],
-                [0.3127, 0.5230, 0.0226, 0.0872, 0.0545]])
+        tensor([[0.1559, 0.1195, 0.5830, 0.1000, 0.0417],
+                [0.0496, 0.1075, 0.0990, 0.3579, 0.3860],
+                [0.2607, 0.1355, 0.4711, 0.0856, 0.0471]])
         >>> # New target class probabilities sum to 1
         >>> target_new.sum(axis=1)
         tensor([1.0000, 1.0000, 1.0000])
         >>> loss(input, target_new).item()
-        2.048427104949951
+        2.55349063873291
     """
 
     __constants__ = ["ignore_index", "reduction", "label_smoothing"]

From 501ff0eba5425ffd485be5fea94fbe025c6698e7 Mon Sep 17 00:00:00 2001
From: loganthomas
Date: Wed, 11 Jun 2025 23:21:57 -0500
Subject: [PATCH 5/9] doc: update wording and use func for softmax

---
 torch/nn/modules/loss.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 0d349a20f2f66..0b8ab5dd18667 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1289,7 +1289,7 @@ class probabilities only when a single class label per minibatch item is too res
       target must be the same shape as the input, and each value should be between :math:`[0, 1]`. This means the target
       data type is required to be float when using class probabilities. Note that PyTorch does not strictly enforce
       probability constraints on the class probabilities and that it is the user's responsibility to ensure
-      `target` is a valid probability distribution (see the Examples section below for more details).
+      `target` contains valid probability distributions (see the Examples section below for more details).
 
     - Output: If reduction is 'none', shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with
       :math:`K \geq 1` in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar.
@@ -1319,14 +1319,15 @@ class probabilities only when a single class label per minibatch item is too res
 
     .. note::
         When providing `target` as class probabilities, it is expected that soft labels are passed
-        (i.e., `target` is a probability distribution over the possible classes for a given data sample
+        (i.e., `target` contains probability distributions over the possible classes for a given data sample
         where each individual probability is between `[0,1]` and the distribution sums to `1`).
-        Hence the use of `.softmax()` on `target` in the above class probabilities example.
+        Hence the use of :func:`softmax()` on ``target`` in the above class probabilities example.
 
         PyTorch does not validate whether the values provided in `target` lie in the range `[0,1]`
-        or whether they sum to `1`. No warning will be raised and it is the user's responsibility
-        to ensure that `target` is a valid probability distribution. Providing arbitrary values
-        may yield misleading loss values and unstable gradients during training.
+        or whether the distribution of each data sample sums to `1`.
+        No warning will be raised and it is the user's responsibility
+        to ensure that `target` contains valid probability distributions.
+        Providing arbitrary values may yield misleading loss values and unstable gradients during training.
 
     Examples:
 

From a958e4db1e6a9566c52886e8edc98507b4b7d3fa Mon Sep 17 00:00:00 2001
From: loganthomas
Date: Wed, 11 Jun 2025 23:24:40 -0500
Subject: [PATCH 6/9] nit: better wording

---
 torch/nn/modules/loss.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 0b8ab5dd18667..6e9de6e7f1d87 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1319,7 +1319,7 @@ class probabilities only when a single class label per minibatch item is too res
 
     .. note::
         When providing `target` as class probabilities, it is expected that soft labels are passed
-        (i.e., `target` contains probability distributions over the possible classes for a given data sample
+        (i.e., `target` contains probability distributions over the possible classes for each data sample
        where each individual probability is between `[0,1]` and the distribution sums to `1`).
         Hence the use of :func:`softmax()` on ``target`` in the above class probabilities example.
 

From 9fa9dcbafe1219c4f1310283244bd4ba6f9fd126 Mon Sep 17 00:00:00 2001
From: loganthomas
Date: Fri, 13 Jun 2025 07:17:28 -0500
Subject: [PATCH 7/9] doc: better wording

---
 torch/nn/modules/loss.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 6e9de6e7f1d87..52fd9efc9f540 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1318,10 +1318,10 @@ class probabilities only when a single class label per minibatch item is too res
         >>> output.backward()
 
     .. note::
-        When providing `target` as class probabilities, it is expected that soft labels are passed
-        (i.e., `target` contains probability distributions over the possible classes for each data sample
-        where each individual probability is between `[0,1]` and the distribution sums to `1`).
-        Hence the use of :func:`softmax()` on ``target`` in the above class probabilities example.
+        When `target` contains class probabilities, it should consist of soft labels—that is,
+        each `target` entry should represent a probability distribution over the possible classes for a given data sample,
+        with individual probabilities between `[0,1]` and the total distribution summing to 1.
+        This is why the :func:`softmax()` function is applied to the ``target`` in the class probabilities example above.
 
         PyTorch does not validate whether the values provided in `target` lie in the range `[0,1]`
         or whether the distribution of each data sample sums to `1`.

From f3038b323da3df70776b28653395aba2712e86b1 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu
Date: Tue, 17 Jun 2025 13:12:20 -0700
Subject: [PATCH 8/9] Apply suggestions from code review

Co-authored-by: mikaylagawarecki
---
 torch/nn/modules/loss.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index 52fd9efc9f540..d1f375b304dfa 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1289,7 +1289,7 @@ class probabilities only when a single class label per minibatch item is too res
       target must be the same shape as the input, and each value should be between :math:`[0, 1]`. This means the target
       data type is required to be float when using class probabilities. Note that PyTorch does not strictly enforce
       probability constraints on the class probabilities and that it is the user's responsibility to ensure
-      `target` contains valid probability distributions (see the Examples section below for more details).
+      ``target`` contains valid probability distributions (see the Examples section below for more details).
 
     - Output: If reduction is 'none', shape :math:`()`, :math:`(N)` or :math:`(N, d_1, d_2, ..., d_K)` with
       :math:`K \geq 1` in the case of K-dimensional loss, depending on the shape of the input. Otherwise, scalar.
@@ -1318,7 +1318,7 @@ class probabilities only when a single class label per minibatch item is too res
         >>> output.backward()
 
     .. note::
-        When `target` contains class probabilities, it should consist of soft labels—that is,
+        When ``target`` contains class probabilities, it should consist of soft labels—that is,
         each `target` entry should represent a probability distribution over the possible classes for a given data sample,
         with individual probabilities between `[0,1]` and the total distribution summing to 1.
         This is why the :func:`softmax()` function is applied to the ``target`` in the class probabilities example above.

From 8bf57b51a07532d0f264be05aa7c7d04db710aae Mon Sep 17 00:00:00 2001
From: mikaylagawarecki
Date: Tue, 17 Jun 2025 16:14:11 -0400
Subject: [PATCH 9/9] Fix more backticks

---
 torch/nn/modules/loss.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/torch/nn/modules/loss.py b/torch/nn/modules/loss.py
index d1f375b304dfa..bdbaaed651192 100644
--- a/torch/nn/modules/loss.py
+++ b/torch/nn/modules/loss.py
@@ -1319,14 +1319,14 @@ class probabilities only when a single class label per minibatch item is too res
 
     .. note::
         When ``target`` contains class probabilities, it should consist of soft labels—that is,
-        each `target` entry should represent a probability distribution over the possible classes for a given data sample,
-        with individual probabilities between `[0,1]` and the total distribution summing to 1.
+        each ``target`` entry should represent a probability distribution over the possible classes for a given data sample,
+        with individual probabilities between ``[0,1]`` and the total distribution summing to 1.
         This is why the :func:`softmax()` function is applied to the ``target`` in the class probabilities example above.
 
-        PyTorch does not validate whether the values provided in `target` lie in the range `[0,1]`
-        or whether the distribution of each data sample sums to `1`.
+        PyTorch does not validate whether the values provided in ``target`` lie in the range ``[0,1]``
+        or whether the distribution of each data sample sums to ``1``.
         No warning will be raised and it is the user's responsibility
-        to ensure that `target` contains valid probability distributions.
+        to ensure that ``target`` contains valid probability distributions.
         Providing arbitrary values may yield misleading loss values and unstable gradients during training.
 
     Examples:
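
The behavior the series documents can be reproduced outside the docstring. The following is a minimal standalone sketch, not part of the patches above: the ``check_probability_targets`` helper is a hypothetical user-side check rather than a torch API, and the printed loss values depend on the random seed::

    import torch
    import torch.nn as nn

    def check_probability_targets(target: torch.Tensor, dim: int = 1) -> None:
        """Raise if `target` does not hold valid probability distributions along `dim`."""
        if target.min() < 0 or target.max() > 1:
            raise ValueError("target values must lie in [0, 1]")
        row_sums = target.sum(dim=dim)
        if not torch.allclose(row_sums, torch.ones_like(row_sums)):
            raise ValueError("each target distribution must sum to 1")

    loss = nn.CrossEntropyLoss()
    input = torch.randn(3, 5, requires_grad=True)

    # Arbitrary reals are accepted silently even though they are not probabilities.
    bad_target = torch.randn(3, 5)
    print(loss(input, bad_target).item())  # no error or warning; value is misleading

    # softmax turns each row into a valid distribution over the 5 classes.
    good_target = bad_target.softmax(dim=1)
    check_probability_targets(good_target)  # passes
    print(loss(input, good_target).item())  # a meaningful cross-entropy value

Because ``CrossEntropyLoss`` raises neither an error nor a warning in the first call, a validation step like this has to live in user code, which is exactly the caveat the new note adds.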