diff --git a/sklearn/cluster/tests/test_k_means.py b/sklearn/cluster/tests/test_k_means.py
index 4e71cd1f35645..cbac3974c1e11 100644
--- a/sklearn/cluster/tests/test_k_means.py
+++ b/sklearn/cluster/tests/test_k_means.py
@@ -339,19 +339,6 @@ def test_fortran_aligned_data(Estimator):
 )
 def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
     # check that fit.predict gives same result as fit_predict
-    # There's a very small chance of failure with elkan on unstructured dataset
-    # because predict method uses fast euclidean distances computation which
-    # may cause small numerical instabilities.
-    # NB: This test is largely redundant with respect to test_predict and
-    #     test_predict_equal_labels.  This test has the added effect of
-    #     testing idempotence of the fittng procesdure which appears to
-    #     be where it fails on some MacOS setups.
-    if sys.platform == "darwin":
-        pytest.xfail(
-            "Known failures on MacOS, See "
-            "https://github.com/scikit-learn/scikit-learn/issues/12644"
-        )
-
     rng = np.random.RandomState(seed)
 
     X = make_blobs(n_samples=1000, n_features=10, centers=10, random_state=rng)[
@@ -365,12 +352,7 @@ def test_k_means_fit_predict(algo, dtype, constructor, seed, max_iter, tol):
 
     labels_1 = kmeans.fit(X).predict(X)
     labels_2 = kmeans.fit_predict(X)
-
-    # Due to randomness in the order in which chunks of data are processed when
-    # using more than one thread, the absolute values of the labels can be
-    # different between the 2 strategies but they should correspond to the same
-    # clustering.
-    assert v_measure_score(labels_1, labels_2) == pytest.approx(1, abs=1e-15)
+    assert_array_equal(labels_1, labels_2)
 
 
 def test_minibatch_kmeans_verbose():
@@ -633,21 +615,10 @@ def test_score_max_iter(Estimator):
 def test_predict(Estimator, algorithm, init, dtype, array_constr):
     # Check the predict method and the equivalence between fit.predict and
     # fit_predict.
-
-    # There's a very small chance of failure with elkan on unstructured dataset
-    # because predict method uses fast euclidean distances computation which
-    # may cause small numerical instabilities.
-    if sys.platform == "darwin":
-        pytest.xfail(
-            "Known failures on MacOS, See "
-            "https://github.com/scikit-learn/scikit-learn/issues/12644"
-        )
-
     X, _ = make_blobs(n_samples=500, n_features=10, centers=10, random_state=0)
     X = array_constr(X)
 
-    # With n_init = 1
-    km = Estimator(n_clusters=10, init=init, n_init=1, random_state=0)
+    km = Estimator(n_clusters=10, init=init, n_init=10, random_state=0)
     if algorithm is not None:
         km.set_params(algorithm=algorithm)
     km.fit(X)
@@ -665,31 +636,6 @@ def test_predict(Estimator, algorithm, init, dtype, array_constr):
     pred = km.predict(km.cluster_centers_)
     assert_array_equal(pred, np.arange(10))
 
-    # With n_init > 1
-    # Due to randomness in the order in which chunks of data are processed when
-    # using more than one thread, there might be different rounding errors for
-    # the computation of the inertia between 2 runs. This might result in a
-    # different ranking of 2 inits, hence a different labeling, even if they
-    # give the same clustering. We only check the labels up to a permutation.
-
-    km = Estimator(n_clusters=10, init=init, n_init=10, random_state=0)
-    if algorithm is not None:
-        km.set_params(algorithm=algorithm)
-    km.fit(X)
-    labels = km.labels_
-
-    # re-predict labels for training set using predict
-    pred = km.predict(X)
-    assert_allclose(v_measure_score(pred, labels), 1)
-
-    # re-predict labels for training set using fit_predict
-    pred = km.fit_predict(X)
-    assert_allclose(v_measure_score(pred, labels), 1)
-
-    # predict centroid labels
-    pred = km.predict(km.cluster_centers_)
-    assert_allclose(v_measure_score(pred, np.arange(10)), 1)
-
 
 @pytest.mark.parametrize("Estimator", [KMeans, MiniBatchKMeans])
 def test_dense_sparse(Estimator):
@@ -745,7 +691,7 @@ def test_integer_input(Estimator, array_constr, dtype, init):
     assert km.cluster_centers_.dtype == np.float64
 
     expected_labels = [0, 1, 1, 0, 0, 1]
-    assert_allclose(v_measure_score(km.labels_, expected_labels), 1)
+    assert_array_equal(km.labels_, expected_labels)
 
     # Same with partial_fit (#14314)
     if Estimator is MiniBatchKMeans: