[MPS] Move max_pool2d to Metal for stride != 1 (#157876)

kurtamohler · pytorchmergebot · commit 86eb65f7f060 · 2025-08-08T16:40:10.000Z
This PR updates `max_pool2d` to use a Metal kernel instead of the old MPS graph impl. However, when the `stride` argument is 1 in all dimensions, the old implementation gives significantly better performance, so we fall back to it in that case. Below is a performance comparison of `max_pool2d` before and after this PR, obtained from this script: https://github.com/kurtamohler/pytorch-perf-test-scripts/blob/2f02f2bf7ad8e1b80d8eb728612b179d48fe92d7/max_pool_mps/perf.py <details><summary>Click to expand</summary> case | before PR | after PR | speedup | | case info -- | -- | -- | -- | -- | -- 0 | 0.014264 | 0.004473 | 3.188911245 | | (3, 2, 2), {'kernel_size': 2, 'return_indices': True} 1 | 0.010752 | 0.00421 | 2.55391924 | | (3, 2, 2), {'kernel_size': 2, 'return_indices': False} 2 | 0.020777 | 0.006123 | 3.393271272 | | (3, 10, 10), {'kernel_size': 5, 'return_indices': True} 3 | 0.011065 | 0.005759 | 1.921340511 | | (3, 10, 10), {'kernel_size': 5, 'return_indices': False} 4 | 0.01452 | 0.007829 | 1.854642994 | | (3, 100, 100), {'kernel_size': 5, 'return_indices': True} 5 | 0.009258 | 0.007075 | 1.308551237 | | (3, 100, 100), {'kernel_size': 5, 'return_indices': False} 6 | 0.188137 | 0.168688 | 1.115295694 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': True} 7 | 0.161362 | 0.154746 | 1.042753932 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': False} 8 | 0.182883 | 0.16945 | 1.079274122 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': True} 9 | 0.156875 | 0.163346 | 0.9603847049 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': False} 10 | 0.193433 | 0.167396 | 1.155541351 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': True} 11 | 0.158967 | 0.151246 | 1.051049284 | | (3, 1000, 1000), {'kernel_size': 
5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': False} 12 | 0.931071 | 0.932883 | 0.9980576342 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': True} 13 | 0.324496 | 0.3252 | 0.9978351784 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': False} 14 | 0.944071 | 0.936246 | 1.008357846 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': True} 15 | 0.322171 | 0.314854 | 1.023239343 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': False} 16 | 0.894158 | 0.886408 | 1.008743152 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': True} 17 | 0.309338 | 0.304146 | 1.017070749 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': False} 18 | 0.606 | 0.260546 | 2.325884873 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': True} 19 | 0.30445 | 0.231054 | 1.317657344 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': False} 20 | 0.474708 | 0.261925 | 1.812381407 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': True} 21 | 0.23175 | 0.231883 | 0.9994264349 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': False} 22 | 0.434475 | 0.266246 | 1.631855502 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': True} 23 | 0.236942 | 0.231792 | 1.022218196 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': False} 24 | 0.202396 | 0.174888 | 1.157289237 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': True} 25 | 0.160679 | 0.158246 | 1.015374796 
| | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': False} 26 | 0.200354 | 0.184133 | 1.088093932 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': True} 27 | 0.160779 | 0.160679 | 1.000622359 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': False} 28 | 0.199175 | 0.178625 | 1.115045486 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': True} 29 | 0.159458 | 0.160883 | 0.9911426316 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': False} 30 | 0.199021 | 0.165329 | 1.203787599 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': True} 31 | 0.156337 | 0.158213 | 0.9881425673 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': False} 32 | 0.180146 | 0.174483 | 1.032455884 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': True} 33 | 0.156988 | 0.158167 | 0.9925458534 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': False} 34 | 0.182133 | 0.176521 | 1.031792251 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': True} 35 | 0.169042 | 0.156483 | 1.080257919 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': False} 36 | 1.767821 | 1.766254 | 1.000887188 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': True} 37 | 1.059346 | 1.058775 | 1.000539302 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': False} 38 | 1.85755 | 1.859429 | 0.9989894747 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 
'return_indices': True} 39 | 1.100417 | 1.097683 | 1.002490701 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': False} 40 | 1.843167 | 1.847558 | 0.9976233493 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': True} 41 | 1.090142 | 1.093163 | 0.9972364597 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': False} 42 | 0.480867 | 0.251733 | 1.910226311 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': True} 43 | 0.319246 | 0.236479 | 1.349997251 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': False} 44 | 0.49315 | 0.256408 | 1.923301925 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': True} 45 | 0.316746 | 0.227854 | 1.390127011 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': False} 46 | 0.4912 | 0.257762 | 1.905633879 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': True} 47 | 0.324771 | 0.229371 | 1.41592006 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': False} 48 | 0.152904 | 0.095079 | 1.608178462 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': True} 49 | 0.102963 | 0.089217 | 1.154073775 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': False} 50 | 0.155158 | 0.095429 | 1.625899884 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': True} 51 | 0.104338 | 0.089979 | 1.15958168 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': False} 52 | 0.153121 | 0.096429 | 1.587914424 | | (3, 1000, 1000), {'kernel_size': 5, 
'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': True} 53 | 0.103642 | 0.090254 | 1.148336916 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': False} 54 | 0.191071 | 0.165125 | 1.157129447 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': True} 55 | 0.153971 | 0.149021 | 1.033216795 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': False} 56 | 0.193192 | 0.166892 | 1.157586942 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': True} 57 | 0.156617 | 0.15215 | 1.029359185 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': False} 58 | 0.178033 | 0.167308 | 1.06410333 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': True} 59 | 0.157425 | 0.164404 | 0.9575496947 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': False} 60 | 1.757638 | 1.750896 | 1.0038506 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': True} 61 | 1.048471 | 1.047967 | 1.000480931 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': False} 62 | 1.790708 | 1.789767 | 1.000525767 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': True} 63 | 1.054575 | 1.054796 | 0.9997904808 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': False} 64 | 1.785837 | 1.784192 | 1.000921986 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': True} 65 | 1.054713 | 1.054492 | 1.00020958 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': False} 66 | 0.478267 | 0.261017 | 
1.832321266 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': True} 67 | 0.32005 | 0.226654 | 1.412064204 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': False} 68 | 0.484008 | 0.254721 | 1.900149575 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': True} 69 | 0.321 | 0.218842 | 1.466811672 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': False} 70 | 0.482087 | 0.248771 | 1.937874591 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': True} 71 | 0.316558 | 0.230533 | 1.373156988 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': False} 72 | 0.137842 | 0.085088 | 1.619993419 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': True} 73 | 0.100671 | 0.0769 | 1.309115735 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': False} 74 | 0.148321 | 0.086967 | 1.705485989 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': True} 75 | 0.101392 | 0.075454 | 1.343759112 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': False} 76 | 0.150208 | 0.083742 | 1.793699697 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': True} 77 | 0.099587 | 0.075825 | 1.313379492 | | (3, 1000, 1000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': False} 78 | 0.622546 | 0.602729 | 1.03287879 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': True} 79 | 0.531696 | 0.5067 | 1.049330965 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 0, 'return_indices': 
False} 80 | 0.626646 | 0.617038 | 1.015571164 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': True} 81 | 0.530354 | 0.525367 | 1.009492412 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 1, 'return_indices': False} 82 | 0.633933 | 0.577775 | 1.097197006 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': True} 83 | 0.533067 | 0.526954 | 1.011600633 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': None, 'padding': 2, 'return_indices': False} 84 | 3.372867 | 3.386412 | 0.9960001914 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': True} 85 | 1.155975 | 1.156604 | 0.9994561665 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 0, 'return_indices': False} 86 | 3.401921 | 3.39755 | 1.001286515 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': True} 87 | 1.202829 | 1.192538 | 1.008629494 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 1, 'return_indices': False} 88 | 3.23675 | 3.220238 | 1.005127571 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': True} 89 | 1.077067 | 1.085613 | 0.9921279498 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 1, 'padding': 2, 'return_indices': False} 90 | 1.572925 | 0.925625 | 1.699311276 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': True} 91 | 0.791204 | 0.793454 | 0.9971642969 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 0, 'return_indices': False} 92 | 1.572742 | 0.922729 | 1.704446268 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': True} 93 | 0.784292 | 0.788871 | 0.9941955022 | | (3, 2000, 2000), {'kernel_size': 5, 
'dilation': 1, 'stride': 2, 'padding': 1, 'return_indices': False} 94 | 1.526546 | 0.925708 | 1.649057802 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': True} 95 | 0.769321 | 0.787675 | 0.9766985114 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 2, 'padding': 2, 'return_indices': False} 96 | 0.736033 | 0.612808 | 1.201082558 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': True} 97 | 0.574625 | 0.530925 | 1.082309177 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 0, 'return_indices': False} 98 | 0.722021 | 0.614488 | 1.174996094 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': True} 99 | 0.563171 | 0.533721 | 1.055178642 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 1, 'return_indices': False} 100 | 0.735725 | 0.613992 | 1.198264798 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': True} 101 | 0.583487 | 0.532513 | 1.095723485 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 1, 'stride': 4, 'padding': 2, 'return_indices': False} 102 | 0.656383 | 0.575313 | 1.140914598 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': True} 103 | 0.559796 | 0.509079 | 1.099625009 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 0, 'return_indices': False} 104 | 0.662046 | 0.572362 | 1.156691045 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': True} 105 | 0.552633 | 0.508671 | 1.086425214 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 1, 'return_indices': False} 106 | 0.634108 | 0.574629 | 1.103508525 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': True} 107 | 0.534013 | 
0.510996 | 1.045043405 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': None, 'padding': 2, 'return_indices': False} 108 | 7.056642 | 7.066717 | 0.9985743026 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': True} 109 | 4.144275 | 4.142658 | 1.000390329 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 0, 'return_indices': False} 110 | 7.172683 | 7.189867 | 0.9976099697 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': True} 111 | 4.162538 | 4.158875 | 1.000880767 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 1, 'return_indices': False} 112 | 7.194233 | 7.181837 | 1.001726021 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': True} 113 | 4.294083 | 4.196062 | 1.023360236 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 1, 'padding': 2, 'return_indices': False} 114 | 1.875692 | 0.891071 | 2.104986022 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': True} 115 | 1.097479 | 0.781175 | 1.404907991 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 0, 'return_indices': False} 116 | 1.8883 | 0.89015 | 2.121327866 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': True} 117 | 1.101329 | 0.778542 | 1.414604479 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 1, 'return_indices': False} 118 | 1.872833 | 0.893654 | 2.095702587 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': True} 119 | 1.096712 | 0.784579 | 1.397835017 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 2, 'padding': 2, 'return_indices': False} 120 | 0.513029 | 0.374417 | 1.370207549 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 
'padding': 0, 'return_indices': True} 121 | 0.349546 | 0.305763 | 1.143192603 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 0, 'return_indices': False} 122 | 0.518929 | 0.377487 | 1.374693698 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': True} 123 | 0.364662 | 0.3145 | 1.159497615 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 1, 'return_indices': False} 124 | 0.521275 | 0.375242 | 1.389170189 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': True} 125 | 0.367488 | 0.308354 | 1.191773092 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 2, 'stride': 4, 'padding': 2, 'return_indices': False} 126 | 0.652342 | 0.569308 | 1.145850752 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': True} 127 | 0.555696 | 0.506892 | 1.096280865 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 0, 'return_indices': False} 128 | 0.654333 | 0.570367 | 1.147213987 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': True} 129 | 0.548925 | 0.505825 | 1.085207335 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 1, 'return_indices': False} 130 | 0.655908 | 0.571904 | 1.146884792 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': True} 131 | 0.560808 | 0.508238 | 1.103435792 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': None, 'padding': 2, 'return_indices': False} 132 | 6.949462 | 6.949112 | 1.000050366 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': True} 133 | 4.072913 | 4.065013 | 1.001943413 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 0, 'return_indices': False} 134 | 7.200896 | 7.197792 | 1.000431243 
| | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': True} 135 | 4.291367 | 4.218538 | 1.017264038 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 1, 'return_indices': False} 136 | 7.1823 | 7.306933 | 0.9829431856 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': True} 137 | 4.151175 | 4.149592 | 1.000381483 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 1, 'padding': 2, 'return_indices': False} 138 | 1.781279 | 0.884288 | 2.014365229 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': True} 139 | 1.050804 | 0.774362 | 1.356993241 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 0, 'return_indices': False} 140 | 1.860758 | 0.884637 | 2.103414169 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': True} 141 | 1.099908 | 0.775887 | 1.417613647 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 1, 'return_indices': False} 142 | 1.857387 | 0.885738 | 2.096993693 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': True} 143 | 1.105279 | 0.77365 | 1.428655077 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 2, 'padding': 2, 'return_indices': False} 144 | 0.489408 | 0.269583 | 1.815426047 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': True} 145 | 0.322525 | 0.236979 | 1.360985573 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 0, 'return_indices': False} 146 | 0.515475 | 0.265813 | 1.93923924 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 'return_indices': True} 147 | 0.315525 | 0.228146 | 1.382995976 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 1, 
'return_indices': False} 148 | 0.503438 | 0.277204 | 1.816128194 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': True} 149 | 0.335421 | 0.228275 | 1.469372467 | | (3, 2000, 2000), {'kernel_size': 5, 'dilation': 4, 'stride': 4, 'padding': 2, 'return_indices': False} 150 | 5.72495 | 4.909554 | 1.166083518 | | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': None, 'return_indices': True} 151 | 4.45215 | 4.251333 | 1.047236243 | | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': None, 'return_indices': False} 152 | 29.953021 | 29.879879 | 1.002447868 | | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': True} 153 | 9.854683 | 9.839517 | 1.001541336 | | (10, 10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': False} 154 | 6.178033 | 5.697375 | 1.084364817 | | (10, 10, 1000, 1000), {'kernel_size': 100, 'padding': 50, 'return_indices': True} 155 | 6.280317 | 5.712525 | 1.099394226 | | (10, 10, 1000, 1000), {'kernel_size': 100, 'padding': 50, 'return_indices': False} 156 | 10.256062 | 11.336527 | 0.9046917103 | | (10, 10, 1000, 1000), {'kernel_size': 250, 'padding': 50, 'return_indices': True} 157 | 9.469546 | 11.33705 | 0.8352742556 | | (10, 10, 1000, 1000), {'kernel_size': 250, 'padding': 50, 'return_indices': False} 158 | 0.119087 | 0.0797 | 1.494190715 | | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': True} 159 | 0.098713 | 0.047173 | 2.092574142 | | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': False} 160 | 0.960812 | 0.675762 | 1.421820108 | | (10, 10, 300, 300), {'kernel_size': 2, 'return_indices': True} 161 | 0.536546 | 0.485958 | 1.104099531 | | (10, 10, 300, 300), {'kernel_size': 2, 'return_indices': False} 162 | 2.555225 | 1.791567 | 1.426251432 | | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': True} 163 | 1.419087 | 1.305137 | 1.087308842 | | (10, 10, 500, 500), {'kernel_size': 2, 
'return_indices': False} 164 | 5.182008 | 3.48085 | 1.488719135 | | (10, 10, 700, 700), {'kernel_size': 2, 'return_indices': True} 165 | 2.831779 | 2.498537 | 1.133374851 | | (10, 10, 700, 700), {'kernel_size': 2, 'return_indices': False} 166 | 8.546038 | 5.7783 | 1.478988284 | | (10, 10, 900, 900), {'kernel_size': 2, 'return_indices': True} 167 | 4.731004 | 4.161975 | 1.136720908 | | (10, 10, 900, 900), {'kernel_size': 2, 'return_indices': False} 168 | 0.084754 | 0.07435 | 1.139932751 | | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': True} 169 | 0.057933 | 0.043096 | 1.344277891 | | (10, 10, 100, 100), {'kernel_size': 2, 'return_indices': False} 170 | 2.568592 | 1.802117 | 1.425319222 | | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': True} 171 | 1.433054 | 1.307342 | 1.096158465 | | (10, 10, 500, 500), {'kernel_size': 2, 'return_indices': False} 172 | 10.3213 | 7.111604 | 1.451332217 | | (10, 10, 1000, 1000), {'kernel_size': 2, 'return_indices': True} 173 | 5.680525 | 5.168129 | 1.099145358 | | (10, 10, 1000, 1000), {'kernel_size': 2, 'return_indices': False} 174 | 1.02255 | 1.01375 | 1.008680641 | | (10, 1000, 1000), {'kernel_size': 2, 'padding': 1, 'stride': 1, 'return_indices': False} 175 | 3.074233 | 3.094383 | 0.993488201 | | (10, 1000, 1000), {'kernel_size': 2, 'padding': 1, 'stride': 1, 'return_indices': True} 176 | 1.016812 | 1.030575 | 0.9866453194 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': False} 177 | 3.053658 | 3.089504 | 0.9883974903 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': 1, 'return_indices': True} 178 | 1.025863 | 1.032088 | 0.9939685376 | | (10, 1000, 1000), {'kernel_size': 8, 'padding': 1, 'stride': 1, 'return_indices': False} 179 | 3.798942 | 3.799213 | 0.9999286694 | | (10, 1000, 1000), {'kernel_size': 8, 'padding': 1, 'stride': 1, 'return_indices': True} 180 | 4.492979 | 4.493421 | 0.999901634 | | (10, 1000, 1000), {'kernel_size': 16, 'padding': 1, 
'stride': 1, 'return_indices': False} 181 | 51.543363 | 51.266204 | 1.005406271 | | (10, 1000, 1000), {'kernel_size': 16, 'padding': 1, 'stride': 1, 'return_indices': True} 182 | 1.018008 | 1.001587 | 1.016394981 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 1), 'return_indices': False} 183 | 3.035404 | 3.003113 | 1.010752509 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 1), 'return_indices': True} 184 | 0.610421 | 0.56 | 1.0900375 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 4), 'return_indices': False} 185 | 1.138983 | 0.757296 | 1.504012962 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (1, 4), 'return_indices': True} 186 | 0.641558 | 0.557808 | 1.150141267 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (4, 1), 'return_indices': False} 187 | 1.181475 | 0.754725 | 1.565437742 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 0, 'stride': (4, 1), 'return_indices': True} 188 | 1.03045 | 1.026904 | 1.003453098 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 1), 'return_indices': False} 189 | 3.041421 | 3.0263 | 1.00499653 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 1), 'return_indices': True} 190 | 0.609929 | 0.572304 | 1.065743032 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 4), 'return_indices': False} 191 | 1.146875 | 0.756446 | 1.516135983 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (1, 4), 'return_indices': True} 192 | 0.645187 | 0.561708 | 1.148616363 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (4, 1), 'return_indices': False} 193 | 1.181721 | 0.758054 | 1.558887625 | | (10, 1000, 1000), {'kernel_size': 4, 'padding': 1, 'stride': (4, 1), 'return_indices': True} 194 | 0.927654 | 0.925946 | 1.0018446 | | (10, 1000, 1000), {'kernel_size': 1, 'return_indices': False} 195 | 2.749983 | 2.740354 | 1.00351378 | | (10, 1000, 1000), {'kernel_size': 1, 
'return_indices': True} </details> Pull Request resolved: #157876 Approved by: https://github.com/malfet
diff --git a/aten/src/ATen/native/mps/kernels/Pooling.metal b/aten/src/ATen/native/mps/kernels/Pooling.metal
@@ -88,6 +88,53 @@ void max_pool_3d_input_iter(
   }
 }
 
+// Computes one output element of 2D max pooling. Scans the input over the
+// [start, end) ranges returned by get_input_iter_bounds<0> and <1>, stepping by
+// `dilation`, then writes the max to `*output` and, when `return_indices` is
+// set, its flattened position (i0 * input_sizes[1] + i1) to `*indices`.
+template <typename T, bool return_indices>
+void max_pool_2d_input_iter(
+    constant T* input,
+    device T* output,
+    device int64_t* indices,
+    constant int32_t* input_sizes,
+    constant int32_t* input_strides,
+    thread int32_t (&pooling_dim_indices)[3],
+    constant int32_t* kernel_size,
+    constant int32_t* stride,
+    constant int32_t* padding,
+    constant int32_t* dilation) {
+  auto bounds0 = get_input_iter_bounds<0>(
+      input_sizes, pooling_dim_indices, kernel_size, stride, padding, dilation);
+  auto bounds1 = get_input_iter_bounds<1>(
+      input_sizes, pooling_dim_indices, kernel_size, stride, padding, dilation);
+  auto d0 = dilation[0];
+  auto d1 = dilation[1];
+  auto size1 = input_sizes[1];
+  // Seed the running max with the first element of the window.
+  T max_value = input
+      [input_strides[0] * bounds0.start + input_strides[1] * bounds1.start];
+  auto max_index = bounds0.start * size1 + bounds1.start;
+
+  for (auto i0 = bounds0.start; i0 < bounds0.end; i0 += d0) {
+    auto offset0 = input_strides[0] * i0;
+    for (auto i1 = bounds1.start; i1 < bounds1.end; i1 += d1) {
+      auto input_value = input[offset0 + input_strides[1] * i1];
+      // `x != x` is true only for NaN (never for integer T), so a NaN anywhere
+      // in the window is selected instead of only when it is the seed element.
+      bool is_greater = (input_value > max_value) || (input_value != input_value);
+      max_value = is_greater ? input_value : max_value;
+      if (return_indices) {
+        max_index = is_greater ? i0 * size1 + i1 : max_index;
+      }
+    }
+  }
+  *output = max_value;
+  if (return_indices) {
+    *indices = max_index;
+  }
+}
+
 struct PoolOffsets {
   int32_t output;
   int32_t indices;
@@ -212,7 +259,7 @@ kernel void max_pool(
   PoolOffsets offsets = find_pool_offsets(
       output_sizes,
       output_strides,
-      indices_strides,
+      return_indices ? indices_strides : nullptr,
       input_strides,
       pooling_dim_indices,
       dims,
@@ -224,18 +271,47 @@ kernel void max_pool(
   indices += offsets.indices;
   input += offsets.input_leading;
 
-  max_pool_3d_input_iter<T>(
-      input,
-      output,
-      indices,
-      input_sizes + leading_dims,
-      input_strides + leading_dims,
-      pooling_dim_indices,
-      kernel_size,
-      stride,
-      padding,
-      dilation,
-      return_indices);
+  switch (pooling_dims) {
+    case 2:
+      if (return_indices) {
+        return max_pool_2d_input_iter<T, /*return_indices=*/true>(
+            input,
+            output,
+            indices,
+            input_sizes + leading_dims,
+            input_strides + leading_dims,
+            pooling_dim_indices,
+            kernel_size,
+            stride,
+            padding,
+            dilation);
+      } else {
+        return max_pool_2d_input_iter<T, /*return_indices=*/false>(
+            input,
+            output,
+            indices,
+            input_sizes + leading_dims,
+            input_strides + leading_dims,
+            pooling_dim_indices,
+            kernel_size,
+            stride,
+            padding,
+            dilation);
+      }
+    case 3:
+      return max_pool_3d_input_iter<T>(
+          input,
+          output,
+          indices,
+          input_sizes + leading_dims,
+          input_strides + leading_dims,
+          pooling_dim_indices,
+          kernel_size,
+          stride,
+          padding,
+          dilation,
+          return_indices);
+  }
 }
 
 // Finds the element in the grad input which corresponds to the index into the
diff --git a/aten/src/ATen/native/mps/operations/Pooling.mm b/aten/src/ATen/native/mps/operations/Pooling.mm
@@ -297,13 +297,13 @@ static PoolSizes process_pool_sizes(const Tensor& input,
               pooling_dims,
               " ints");
 
-  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == 3,
+  TORCH_CHECK(stride.empty() || stride.size() == 1 || stride.size() == pooling_dims,
               op_name,
               ": stride must either be omitted, a single int, or a tuple of ",
               pooling_dims,
               " ints");
 
-  TORCH_CHECK(padding.size() == 1 || padding.size() == 3,
+  TORCH_CHECK(padding.size() == 1 || padding.size() == pooling_dims,
               op_name,
               ": padding must either be a single int, or a tuple of ",
               pooling_dims,
@@ -333,6 +333,22 @@ static PoolSizes process_pool_sizes(const Tensor& input,
                 ": pad should be at most half of effective kernel size");
   }
 
+  if (pooling_dims == 2) {
+    const auto memory_format = input.suggest_memory_format();
+    bool valid_dims = input.size(1) != 0 && input.size(2) != 0;
+    if (memory_format == at::MemoryFormat::ChannelsLast) {
+      // Expect tensor in NHWC format and allow 0-dim only for N.
+      TORCH_CHECK((dims == 4 && valid_dims && input.size(3) != 0),
+                  "Expected 4D (batch mode) tensor expected for input with channels_last layout"
+                  " with optional 0 dim batch size for input, but got: ",
+                  input.sizes());
+    } else {
+      TORCH_CHECK((dims == 3 && input.size(0) != 0 && valid_dims) || (dims == 4 && valid_dims && input.size(3) != 0),
+                  "Expected 3D or 4D (batch mode) tensor with optional 0 dim batch size for input, but got:",
+                  input.sizes());
+    }
+  }
+
   for (const auto dim : c10::irange(static_cast<int>(leading_dims == 2), dims)) {
     TORCH_CHECK(input.size(dim) > 0, op_name, ": Expected input's non-batch dimensions to have positive length");
   }
@@ -786,31 +802,54 @@ static void avg_pool_backward_out_mps_template(const Tensor& grad_input,
 
 } // namespace mps
 
+// TODO: The MPS graph impl can sometimes give significantly better performance
+// than the Metal impl when the stride is 1 in all dimensions; look into a
+// specialized Metal path so the graph impl can be dropped. Returns true when
+// max_pool2d should take the graph path. An omitted stride falls back to
+// kernel_size; a single-element stride counts for both dims; empty -> false.
+static bool use_graph_for_max_pool2d(IntArrayRef kernel_size, IntArrayRef stride_) {
+  IntArrayRef stride = stride_.empty() ? kernel_size : stride_;
+  return !stride.empty() && (stride[0] == 1) && (stride.size() == 1 || stride[1] == 1);
+}
+
 Tensor mps_max_pool2d(const Tensor& input,
                       IntArrayRef kernel_size,
                       IntArrayRef stride,
                       IntArrayRef padding,
                       IntArrayRef dilation,
                       bool ceil_mode) {
   Tensor output = at::empty({0}, input.options(), MemoryFormat::Contiguous);
-  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
-    MPSGraph* mpsGraph = cachedGraph.graph();
-    return [mpsGraph maxPooling2DWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil];
-  };
-  mps::pool2d_template(input,
-                       output,
-                       std::nullopt,
-                       std::nullopt,
-                       kernel_size,
-                       stride,
-                       padding,
-                       dilation,
-                       ceil_mode,
-                       false,
-                       std::nullopt,
-                       pooling_op_block,
-                       "max_pool2d");
-
+  bool use_graph = use_graph_for_max_pool2d(kernel_size, stride); // true only when every stride is 1
+  if (use_graph) { // MPS graph impl, dispatched through pool2d_template
+    mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
+      MPSGraph* mpsGraph = cachedGraph.graph();
+      return [mpsGraph maxPooling2DWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil];
+    };
+    mps::pool2d_template(input,
+                         output,
+                         std::nullopt,
+                         std::nullopt,
+                         kernel_size,
+                         stride,
+                         padding,
+                         dilation,
+                         ceil_mode,
+                         false,
+                         std::nullopt,
+                         pooling_op_block,
+                         "max_pool2d");
+  } else { // any other stride: Metal impl
+    mps::max_pool_with_indices_out_mps_template(output,
+                                                std::nullopt, // indices argument left empty
+                                                input,
+                                                kernel_size,
+                                                stride,
+                                                padding,
+                                                dilation,
+                                                ceil_mode,
+                                                /*pooling_dims=*/2,
+                                                "max_pool2d");
+  }
   return output;
 }
 
@@ -855,32 +894,45 @@ Tensor mps_max_pool2d_backward(const Tensor& grad_output,
  bool ceil_mode,
  const Tensor& output,
  const Tensor& indices) {
-  auto indices_memory_format = indices.suggest_memory_format();
-
-  mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
-    MPSGraph* mpsGraph = cachedGraph.graph();
-    NSArray<MPSGraphTensor*>* poolOutputs = [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:cachedGraph.inputTensor
-                                                                                     descriptor:desc
-                                                                                           name:nil];
-    cachedGraph.indicesTensor = mps::castMPSTensor(mpsGraph, poolOutputs[1], ScalarType::Long);
-    return poolOutputs[0];
-  };
-  mps::pool2d_template(input,
-                       output,
-                       indices,
-                       std::nullopt,
-                       kernel_size,
-                       stride,
-                       padding,
-                       dilation,
-                       ceil_mode,
-                       false,
-                       std::nullopt,
-                       pooling_op_block,
-                       "max_pool2d_indices");
+  bool use_graph = use_graph_for_max_pool2d(kernel_size, stride);
+  if (use_graph) {
+    auto indices_memory_format = indices.suggest_memory_format();
+
+    mps::PoolingOpBlock pooling_op_block = ^PoolingOpFn(cachedGraph, desc) {
+      MPSGraph* mpsGraph = cachedGraph.graph();
+      NSArray<MPSGraphTensor*>* poolOutputs =
+          [mpsGraph maxPooling2DReturnIndicesWithSourceTensor:cachedGraph.inputTensor descriptor:desc name:nil];
+      cachedGraph.indicesTensor = mps::castMPSTensor(mpsGraph, poolOutputs[1], ScalarType::Long);
+      return poolOutputs[0];
+    };
+    mps::pool2d_template(input,
+                         output,
+                         indices,
+                         std::nullopt,
+                         kernel_size,
+                         stride,
+                         padding,
+                         dilation,
+                         ceil_mode,
+                         false,
+                         std::nullopt,
+                         pooling_op_block,
+                         "max_pool2d_indices");
+    if (indices_memory_format == MemoryFormat::ChannelsLast) {
+      const_cast<Tensor&>(indices) = indices.to(MemoryFormat::ChannelsLast);
+    }
 
-  if (indices_memory_format == MemoryFormat::ChannelsLast) {
-    const_cast<Tensor&>(indices) = indices.to(MemoryFormat::ChannelsLast);
+  } else {
+    mps::max_pool_with_indices_out_mps_template(output,
+                                                indices,
+                                                input,
+                                                kernel_size,
+                                                stride,
+                                                padding,
+                                                dilation,
+                                                ceil_mode,
+                                                /*pooling_dims=*/2,
+                                                "max_pool2d");
   }
 }