diff --git a/doc/modules/neural_networks_supervised.rst b/doc/modules/neural_networks_supervised.rst
index 388f32e7c6925..100b4105be93d 100644
--- a/doc/modules/neural_networks_supervised.rst
+++ b/doc/modules/neural_networks_supervised.rst
@@ -146,7 +146,8 @@ See the examples below and the docstring of
 .. topic:: Examples:
 
  * :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`
- * :ref:`sphx_glr_auto_examples_neural_networks_plot_mnist_filters.py`
+ * See :ref:`sphx_glr_auto_examples_neural_networks_plot_mnist_filters.py` for
+   a visual representation of trained weights.
 
 Regression
 ==========
diff --git a/sklearn/neural_network/_multilayer_perceptron.py b/sklearn/neural_network/_multilayer_perceptron.py
index 5175247204fb8..cc419b57f2410 100644
--- a/sklearn/neural_network/_multilayer_perceptron.py
+++ b/sklearn/neural_network/_multilayer_perceptron.py
@@ -800,6 +800,9 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron):
         - 'adam' refers to a stochastic gradient-based optimizer proposed
           by Kingma, Diederik, and Jimmy Ba
 
+        For a comparison between the Adam optimizer and SGD, see
+        :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`.
+
         Note: The default solver 'adam' works pretty well on relatively
         large datasets (with thousands of training samples or more) in terms of
         both training time and validation score.
@@ -810,6 +813,9 @@ class MLPClassifier(ClassifierMixin, BaseMultilayerPerceptron):
         Strength of the L2 regularization term. The L2 regularization term
         is divided by the sample size when added to the loss.
 
+        For example usage and a visualization of the effect of varying
+        regularization, see
+        :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_alpha.py`.
+
     batch_size : int, default='auto'
         Size of minibatches for stochastic optimizers.
         If the solver is 'lbfgs', the classifier will not use minibatch.
@@ -1293,6 +1299,9 @@ class MLPRegressor(RegressorMixin, BaseMultilayerPerceptron):
         - 'adam' refers to a stochastic gradient-based optimizer proposed
           by Kingma, Diederik, and Jimmy Ba
 
+        For a comparison between the Adam optimizer and SGD, see
+        :ref:`sphx_glr_auto_examples_neural_networks_plot_mlp_training_curves.py`.
+
         Note: The default solver 'adam' works pretty well on relatively
         large datasets (with thousands of training samples or more) in terms of
         both training time and validation score.
diff --git a/sklearn/neural_network/_rbm.py b/sklearn/neural_network/_rbm.py
index ec819790c5f73..e3814f45d3633 100644
--- a/sklearn/neural_network/_rbm.py
+++ b/sklearn/neural_network/_rbm.py
@@ -127,6 +127,9 @@ class BernoulliRBM(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstima
     >>> model = BernoulliRBM(n_components=2)
     >>> model.fit(X)
     BernoulliRBM(n_components=2)
+
+    For a more detailed usage example, see
+    :ref:`sphx_glr_auto_examples_neural_networks_plot_rbm_logistic_classification.py`.
     """
 
     _parameter_constraints: dict = {
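
Not part of the patch: a minimal sketch of what the newly cross-referenced
examples cover, i.e. comparing the 'adam' and 'sgd' solvers and varying the
L2 penalty `alpha` on MLPClassifier. The dataset and hyperparameter values
below are illustrative assumptions, not taken from the referenced examples.

    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.neural_network import MLPClassifier

    # Illustrative synthetic data; the referenced examples use real datasets.
    X, y = make_classification(n_samples=1000, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Compare the two stochastic solvers at two regularization strengths.
    for solver in ("adam", "sgd"):
        for alpha in (1e-4, 1e-1):
            clf = MLPClassifier(
                solver=solver, alpha=alpha, max_iter=300, random_state=0
            ).fit(X_train, y_train)
            print(f"{solver} alpha={alpha}: {clf.score(X_test, y_test):.3f}")

Similarly, a sketch of the pipeline pattern behind the referenced
plot_rbm_logistic_classification example: BernoulliRBM as a feature extractor
in front of LogisticRegression. Inputs to BernoulliRBM should lie in [0, 1];
the random data here stands in for the example's digit images.

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.neural_network import BernoulliRBM
    from sklearn.pipeline import Pipeline

    rng = np.random.RandomState(0)
    X = rng.uniform(size=(200, 16))  # features already scaled to [0, 1]
    y = rng.randint(0, 2, size=200)

    # The RBM learns binary latent features; logistic regression classifies them.
    model = Pipeline([
        ("rbm", BernoulliRBM(n_components=32, learning_rate=0.05, random_state=0)),
        ("logreg", LogisticRegression(max_iter=1000)),
    ]).fit(X, y)
    print(model.score(X, y))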