diff --git a/sklearn/decomposition/_dict_learning.py b/sklearn/decomposition/_dict_learning.py
index 5ba8b628316ab..7fc0915f2ea8e 100644
--- a/sklearn/decomposition/_dict_learning.py
+++ b/sklearn/decomposition/_dict_learning.py
@@ -1692,7 +1692,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator):
     >>> from sklearn.datasets import make_sparse_coded_signal
     >>> from sklearn.decomposition import DictionaryLearning
     >>> X, dictionary, code = make_sparse_coded_signal(
-    ...     n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,
+    ...     n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10,
     ...     random_state=42,
     ... )
     >>> dict_learner = DictionaryLearning(
@@ -1704,7 +1704,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator):
     We can check the level of sparsity of `X_transformed`:
 
     >>> np.mean(X_transformed == 0)
-    0.41...
+    0.52...
 
     We can compare the average squared euclidean norm of the reconstruction
     error of the sparse coded signal relative to the squared euclidean norm of
@@ -1712,7 +1712,7 @@ class DictionaryLearning(_BaseSparseCoding, BaseEstimator):
 
     >>> X_hat = X_transformed @ dict_learner.components_
     >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))
-    0.07...
+    0.05...
     """
 
     _parameter_constraints: dict = {
@@ -2062,16 +2062,16 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator):
     >>> from sklearn.datasets import make_sparse_coded_signal
     >>> from sklearn.decomposition import MiniBatchDictionaryLearning
     >>> X, dictionary, code = make_sparse_coded_signal(
-    ...     n_samples=100, n_components=15, n_features=20, n_nonzero_coefs=10,
+    ...     n_samples=30, n_components=15, n_features=20, n_nonzero_coefs=10,
     ...     random_state=42)
     >>> dict_learner = MiniBatchDictionaryLearning(
     ...     n_components=15, batch_size=3, transform_algorithm='lasso_lars',
-    ...     transform_alpha=0.1, random_state=42)
+    ...     transform_alpha=0.1, max_iter=20, random_state=42)
     >>> X_transformed = dict_learner.fit_transform(X)
 
     We can check the level of sparsity of `X_transformed`:
 
-    >>> np.mean(X_transformed == 0) < 0.5
+    >>> np.mean(X_transformed == 0) > 0.5
     True
 
     We can compare the average squared euclidean norm of the reconstruction
@@ -2080,7 +2080,7 @@ class MiniBatchDictionaryLearning(_BaseSparseCoding, BaseEstimator):
 
     >>> X_hat = X_transformed @ dict_learner.components_
     >>> np.mean(np.sum((X_hat - X) ** 2, axis=1) / np.sum(X ** 2, axis=1))
-    0.057...
+    0.052...
     """
 
     _parameter_constraints: dict = {
diff --git a/sklearn/decomposition/tests/test_dict_learning.py b/sklearn/decomposition/tests/test_dict_learning.py
index 0986bc6b3feed..b98d75eccbee3 100644
--- a/sklearn/decomposition/tests/test_dict_learning.py
+++ b/sklearn/decomposition/tests/test_dict_learning.py
@@ -43,7 +43,7 @@ def test_sparse_encode_shapes_omp():
     for n_components, n_samples in itertools.product([1, 5], [1, 9]):
         X_ = rng.randn(n_samples, n_features)
         dictionary = rng.randn(n_components, n_features)
-        for algorithm, n_jobs in itertools.product(algorithms, [1, 3]):
+        for algorithm, n_jobs in itertools.product(algorithms, [1, 2]):
             code = sparse_encode(X_, dictionary, algorithm=algorithm, n_jobs=n_jobs)
             assert code.shape == (n_samples, n_components)
 
diff --git a/sklearn/decomposition/tests/test_sparse_pca.py b/sklearn/decomposition/tests/test_sparse_pca.py
index 4abbbd515aeb9..848d5d9d7ee34 100644
--- a/sklearn/decomposition/tests/test_sparse_pca.py
+++ b/sklearn/decomposition/tests/test_sparse_pca.py
@@ -120,12 +120,12 @@ def test_initialization():
 def test_mini_batch_correct_shapes():
     rng = np.random.RandomState(0)
     X = rng.randn(12, 10)
-    pca = MiniBatchSparsePCA(n_components=8, random_state=rng)
+    pca = MiniBatchSparsePCA(n_components=8, max_iter=1, random_state=rng)
     U = pca.fit_transform(X)
     assert pca.components_.shape == (8, 10)
     assert U.shape == (12, 8)
     # test overcomplete decomposition
-    pca = MiniBatchSparsePCA(n_components=13, random_state=rng)
+    pca = MiniBatchSparsePCA(n_components=13, max_iter=1, random_state=rng)
     U = pca.fit_transform(X)
     assert pca.components_.shape == (13, 10)
     assert U.shape == (12, 13)
diff --git a/sklearn/ensemble/tests/test_gradient_boosting.py b/sklearn/ensemble/tests/test_gradient_boosting.py
index 768063286e46d..f721767b96aa7 100644
--- a/sklearn/ensemble/tests/test_gradient_boosting.py
+++ b/sklearn/ensemble/tests/test_gradient_boosting.py
@@ -99,10 +99,15 @@ def test_classification_toy(loss, global_random_seed):
 def test_classification_synthetic(loss, global_random_seed):
     # Test GradientBoostingClassifier on synthetic dataset used by
     # Hastie et al. in ESLII - Figure 10.9
-    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=global_random_seed)
+    # Note that Figure 10.9 reuses the dataset generated for figure 10.2
+    # and should have 2_000 train data points and 10_000 test data points.
+    # Here we intentionally use a smaller variant to make the test run faster,
+    # but the conclusions are still the same, despite the smaller datasets.
+    X, y = datasets.make_hastie_10_2(n_samples=2000, random_state=global_random_seed)
 
-    X_train, X_test = X[:2000], X[2000:]
-    y_train, y_test = y[:2000], y[2000:]
+    split_idx = 500
+    X_train, X_test = X[:split_idx], X[split_idx:]
+    y_train, y_test = y[:split_idx], y[split_idx:]
 
     # Increasing the number of trees should decrease the test error
     common_params = {
@@ -111,13 +116,13 @@ def test_classification_synthetic(loss, global_random_seed):
         "loss": loss,
         "random_state": global_random_seed,
     }
-    gbrt_100_stumps = GradientBoostingClassifier(n_estimators=100, **common_params)
-    gbrt_100_stumps.fit(X_train, y_train)
+    gbrt_10_stumps = GradientBoostingClassifier(n_estimators=10, **common_params)
+    gbrt_10_stumps.fit(X_train, y_train)
 
-    gbrt_200_stumps = GradientBoostingClassifier(n_estimators=200, **common_params)
-    gbrt_200_stumps.fit(X_train, y_train)
+    gbrt_50_stumps = GradientBoostingClassifier(n_estimators=50, **common_params)
+    gbrt_50_stumps.fit(X_train, y_train)
 
-    assert gbrt_100_stumps.score(X_test, y_test) < gbrt_200_stumps.score(X_test, y_test)
+    assert gbrt_10_stumps.score(X_test, y_test) < gbrt_50_stumps.score(X_test, y_test)
 
     # Decision stumps are better suited for this dataset with a large number of
     # estimators.
diff --git a/sklearn/neighbors/tests/test_neighbors.py b/sklearn/neighbors/tests/test_neighbors.py
index 49c0b97730bd2..5a08774a7912b 100644
--- a/sklearn/neighbors/tests/test_neighbors.py
+++ b/sklearn/neighbors/tests/test_neighbors.py
@@ -74,7 +74,6 @@
 )  # type: ignore
 
 P = (1, 2, 3, 4, np.inf)
-JOBLIB_BACKENDS = list(joblib.parallel.BACKENDS.keys())
 
 # Filter deprecation warnings.
 neighbors.kneighbors_graph = ignore_warnings(neighbors.kneighbors_graph)
@@ -2044,10 +2043,10 @@ def test_same_radius_neighbors_parallel(algorithm):
     assert_allclose(graph, graph_parallel)
 
 
-@pytest.mark.parametrize("backend", JOBLIB_BACKENDS)
+@pytest.mark.parametrize("backend", ["threading", "loky"])
 @pytest.mark.parametrize("algorithm", ALGORITHMS)
 def test_knn_forcing_backend(backend, algorithm):
-    # Non-regression test which ensure the knn methods are properly working
+    # Non-regression test which ensures the knn methods are properly working
     # even when forcing the global joblib backend.
     with joblib.parallel_backend(backend):
         X, y = datasets.make_classification(
@@ -2056,12 +2055,12 @@ def test_knn_forcing_backend(backend, algorithm):
         X_train, X_test, y_train, y_test = train_test_split(X, y)
 
         clf = neighbors.KNeighborsClassifier(
-            n_neighbors=3, algorithm=algorithm, n_jobs=3
+            n_neighbors=3, algorithm=algorithm, n_jobs=2
         )
         clf.fit(X_train, y_train)
         clf.predict(X_test)
         clf.kneighbors(X_test)
-        clf.kneighbors_graph(X_test, mode="distance").toarray()
+        clf.kneighbors_graph(X_test, mode="distance")
 
 
 def test_dtype_convert():
diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py
index 31c4ce714a7f6..6d1719089cd39 100644
--- a/sklearn/tree/tests/test_tree.py
+++ b/sklearn/tree/tests/test_tree.py
@@ -804,10 +804,10 @@ def test_min_weight_fraction_leaf_with_min_samples_leaf_on_sparse_input(
     )
 
 
-def test_min_impurity_decrease():
+def test_min_impurity_decrease(global_random_seed):
     # test if min_impurity_decrease ensure that a split is made only if
     # if the impurity decrease is at least that value
-    X, y = datasets.make_classification(n_samples=10000, random_state=42)
+    X, y = datasets.make_classification(n_samples=100, random_state=global_random_seed)
 
     # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
     # by setting max_leaf_nodes