diff --git a/numpy/fft/_pocketfft.py b/numpy/fft/_pocketfft.py
index a8ed59e46e86..1ef37b3c01a3 100644
--- a/numpy/fft/_pocketfft.py
+++ b/numpy/fft/_pocketfft.py
@@ -34,8 +34,8 @@
 import warnings
 
 from numpy.lib.array_utils import normalize_axis_index
-from numpy._core import asarray, zeros, swapaxes, conjugate, take, sqrt
-from . import _pocketfft_internal as pfi
+from numpy._core import asarray, empty, zeros, swapaxes, conjugate, take, sqrt
+from . import _pocketfft_umath as pfu
 from numpy._core import overrides
 
 
@@ -47,33 +47,36 @@
 # divided. This replaces the original, more intuitive 'fct` parameter to avoid
 # divisions by zero (or alternatively additional checks) in the case of
 # zero-length axes during its computation.
-def _raw_fft(a, n, axis, is_real, is_forward, inv_norm):
+def _raw_fft(a, n, axis, is_real, is_forward, inv_norm, out=None):
     axis = normalize_axis_index(axis, a.ndim)
     if n is None:
         n = a.shape[axis]
 
     fct = 1/inv_norm
 
-    if a.shape[axis] != n:
-        s = list(a.shape)
-        index = [slice(None)]*len(s)
-        if s[axis] > n:
-            index[axis] = slice(0, n)
-            a = a[tuple(index)]
+    n_out = n
+    if is_real:
+        if is_forward:
+            ufunc = pfu.rfft_n_even if n % 2 == 0 else pfu.rfft_n_odd
+            n_out = n // 2 + 1
         else:
-            index[axis] = slice(0, s[axis])
-            s[axis] = n
-            z = zeros(s, a.dtype.char)
-            z[tuple(index)] = a
-            a = z
-
-    if axis == a.ndim-1:
-        r = pfi.execute(a, is_real, is_forward, fct)
+            ufunc = pfu.irfft
     else:
-        a = swapaxes(a, axis, -1)
-        r = pfi.execute(a, is_real, is_forward, fct)
-        r = swapaxes(r, axis, -1)
-    return r
+        ufunc = pfu.fft if is_forward else pfu.ifft
+
+    if out is None:
+        out = empty(a.shape[:axis] + (n_out,) + a.shape[axis+1:],
+                    dtype=complex if is_forward or not is_real else float)
+    elif ((shape := getattr(out, "shape", None)) is not None
+          and (len(shape) != a.ndim or shape[axis] != n_out)):
+        raise ValueError("output array has wrong shape.")
+    # Note: for backward compatibility, we want to accept longdouble as well,
+    # even though it is at reduced precision. To tell the promotor that we
+    # want to do that, we set the signature (to the only the ufunc has).
+    # Then, the default casting='same_kind' will take care of the rest.
+    # TODO: create separate float, double, and longdouble loops.
+    return ufunc(a, fct, axes=[(axis,), (), (axis,)], out=out,
+                 signature=ufunc.types[0])
 
 
 def _get_forward_norm(n, norm):
@@ -116,12 +119,12 @@ def _swap_direction(norm):
                          '"ortho" or "forward".') from None
 
 
-def _fft_dispatcher(a, n=None, axis=None, norm=None):
-    return (a,)
+def _fft_dispatcher(a, n=None, axis=None, norm=None, out=None):
+    return (a, out)
 
 
 @array_function_dispatch(_fft_dispatcher)
-def fft(a, n=None, axis=-1, norm=None):
+def fft(a, n=None, axis=-1, norm=None, out=None):
     """
     Compute the one-dimensional discrete Fourier Transform.
 
@@ -151,6 +154,11 @@ def fft(a, n=None, axis=-1, norm=None):
         .. versionadded:: 1.20.0
 
             The "backward", "forward" values were added.
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype.
+
+        .. versionadded:: 2.0.0
 
     Returns
     -------
@@ -213,12 +221,12 @@ def fft(a, n=None, axis=-1, norm=None):
     if n is None:
         n = a.shape[axis]
     inv_norm = _get_forward_norm(n, norm)
-    output = _raw_fft(a, n, axis, False, True, inv_norm)
+    output = _raw_fft(a, n, axis, False, True, inv_norm, out)
     return output
 
 
 @array_function_dispatch(_fft_dispatcher)
-def ifft(a, n=None, axis=-1, norm=None):
+def ifft(a, n=None, axis=-1, norm=None, out=None):
     """
     Compute the one-dimensional inverse discrete Fourier Transform.
 
@@ -264,6 +272,12 @@ def ifft(a, n=None, axis=-1, norm=None):
 
             The "backward", "forward" values were added.
 
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype.
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : complex ndarray
@@ -314,12 +328,12 @@ def ifft(a, n=None, axis=-1, norm=None):
     if n is None:
         n = a.shape[axis]
     inv_norm = _get_backward_norm(n, norm)
-    output = _raw_fft(a, n, axis, False, False, inv_norm)
+    output = _raw_fft(a, n, axis, False, False, inv_norm, out=out)
     return output
 
 
 @array_function_dispatch(_fft_dispatcher)
-def rfft(a, n=None, axis=-1, norm=None):
+def rfft(a, n=None, axis=-1, norm=None, out=None):
     """
     Compute the one-dimensional discrete Fourier Transform for real input.
 
@@ -350,6 +364,12 @@ def rfft(a, n=None, axis=-1, norm=None):
 
             The "backward", "forward" values were added.
 
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype.
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : complex ndarray
@@ -407,12 +427,12 @@ def rfft(a, n=None, axis=-1, norm=None):
     if n is None:
         n = a.shape[axis]
     inv_norm = _get_forward_norm(n, norm)
-    output = _raw_fft(a, n, axis, True, True, inv_norm)
+    output = _raw_fft(a, n, axis, True, True, inv_norm, out=out)
     return output
 
 
 @array_function_dispatch(_fft_dispatcher)
-def irfft(a, n=None, axis=-1, norm=None):
+def irfft(a, n=None, axis=-1, norm=None, out=None):
     """
     Computes the inverse of `rfft`.
 
@@ -452,6 +472,12 @@ def irfft(a, n=None, axis=-1, norm=None):
 
             The "backward", "forward" values were added.
 
+    out : ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype.
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : ndarray
@@ -511,12 +537,12 @@ def irfft(a, n=None, axis=-1, norm=None):
     if n is None:
         n = (a.shape[axis] - 1) * 2
     inv_norm = _get_backward_norm(n, norm)
-    output = _raw_fft(a, n, axis, True, False, inv_norm)
+    output = _raw_fft(a, n, axis, True, False, inv_norm, out=out)
     return output
 
 
 @array_function_dispatch(_fft_dispatcher)
-def hfft(a, n=None, axis=-1, norm=None):
+def hfft(a, n=None, axis=-1, norm=None, out=None):
     """
     Compute the FFT of a signal that has Hermitian symmetry, i.e., a real
     spectrum.
@@ -546,6 +572,12 @@ def hfft(a, n=None, axis=-1, norm=None):
 
             The "backward", "forward" values were added.
 
+    out : ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype.
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : ndarray
@@ -609,12 +641,12 @@ def hfft(a, n=None, axis=-1, norm=None):
     if n is None:
         n = (a.shape[axis] - 1) * 2
     new_norm = _swap_direction(norm)
-    output = irfft(conjugate(a), n, axis, norm=new_norm)
+    output = irfft(conjugate(a), n, axis, norm=new_norm, out=None)
     return output
 
 
 @array_function_dispatch(_fft_dispatcher)
-def ihfft(a, n=None, axis=-1, norm=None):
+def ihfft(a, n=None, axis=-1, norm=None, out=None):
     """
     Compute the inverse FFT of a signal that has Hermitian symmetry.
 
@@ -642,6 +674,12 @@ def ihfft(a, n=None, axis=-1, norm=None):
 
             The "backward", "forward" values were added.
 
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype.
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : complex ndarray
@@ -676,8 +714,8 @@ def ihfft(a, n=None, axis=-1, norm=None):
     if n is None:
         n = a.shape[axis]
     new_norm = _swap_direction(norm)
-    output = conjugate(rfft(a, n, axis, norm=new_norm))
-    return output
+    out = rfft(a, n, axis, norm=new_norm, out=out)
+    return conjugate(out, out=out)
 
 
 def _cook_nd_args(a, s=None, axes=None, invreal=0):
@@ -717,22 +755,22 @@ def _cook_nd_args(a, s=None, axes=None, invreal=0):
     return s, axes
 
 
-def _raw_fftnd(a, s=None, axes=None, function=fft, norm=None):
+def _raw_fftnd(a, s=None, axes=None, function=fft, norm=None, out=None):
     a = asarray(a)
     s, axes = _cook_nd_args(a, s, axes)
     itl = list(range(len(axes)))
     itl.reverse()
     for ii in itl:
-        a = function(a, n=s[ii], axis=axes[ii], norm=norm)
+        a = function(a, n=s[ii], axis=axes[ii], norm=norm, out=out)
     return a
 
 
-def _fftn_dispatcher(a, s=None, axes=None, norm=None):
-    return (a,)
+def _fftn_dispatcher(a, s=None, axes=None, norm=None, out=None):
+    return (a, out)
 
 
 @array_function_dispatch(_fftn_dispatcher)
-def fftn(a, s=None, axes=None, norm=None):
+def fftn(a, s=None, axes=None, norm=None, out=None):
     """
     Compute the N-dimensional discrete Fourier Transform.
 
@@ -791,6 +829,13 @@ def fftn(a, s=None, axes=None, norm=None):
 
             The "backward", "forward" values were added.
 
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype for all axes (and hence is
+        imcompatible with passing in all but the trivial ``s``).
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : complex ndarray
@@ -854,11 +899,11 @@ def fftn(a, s=None, axes=None, norm=None):
     >>> plt.show()
 
     """
-    return _raw_fftnd(a, s, axes, fft, norm)
+    return _raw_fftnd(a, s, axes, fft, norm, out=out)
 
 
 @array_function_dispatch(_fftn_dispatcher)
-def ifftn(a, s=None, axes=None, norm=None):
+def ifftn(a, s=None, axes=None, norm=None, out=None):
     """
     Compute the N-dimensional inverse discrete Fourier Transform.
 
@@ -926,6 +971,13 @@ def ifftn(a, s=None, axes=None, norm=None):
 
             The "backward", "forward" values were added.
 
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype for all axes (and hence is
+        imcompatible with passing in all but the trivial ``s``).
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : complex ndarray
@@ -980,11 +1032,11 @@ def ifftn(a, s=None, axes=None, norm=None):
     >>> plt.show()
 
     """
-    return _raw_fftnd(a, s, axes, ifft, norm)
+    return _raw_fftnd(a, s, axes, ifft, norm, out=out)
 
 
 @array_function_dispatch(_fftn_dispatcher)
-def fft2(a, s=None, axes=(-2, -1), norm=None):
+def fft2(a, s=None, axes=(-2, -1), norm=None, out=None):
     """
     Compute the 2-dimensional discrete Fourier Transform.
 
@@ -1044,6 +1096,13 @@ def fft2(a, s=None, axes=(-2, -1), norm=None):
 
             The "backward", "forward" values were added.
 
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype for all axes (and hence only the
+        last axis can have ``s`` not equal to the shape at that axis).
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : complex ndarray
@@ -1099,11 +1158,11 @@ def fft2(a, s=None, axes=(-2, -1), norm=None):
               0.  +0.j        ,   0.  +0.j        ]])
 
     """
-    return _raw_fftnd(a, s, axes, fft, norm)
+    return _raw_fftnd(a, s, axes, fft, norm, out=out)
 
 
 @array_function_dispatch(_fftn_dispatcher)
-def ifft2(a, s=None, axes=(-2, -1), norm=None):
+def ifft2(a, s=None, axes=(-2, -1), norm=None, out=None):
     """
     Compute the 2-dimensional inverse discrete Fourier Transform.
 
@@ -1170,6 +1229,13 @@ def ifft2(a, s=None, axes=(-2, -1), norm=None):
 
             The "backward", "forward" values were added.
 
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype for all axes (and hence is
+        imcompatible with passing in all but the trivial ``s``).
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : complex ndarray
@@ -1215,11 +1281,11 @@ def ifft2(a, s=None, axes=(-2, -1), norm=None):
            [0.+0.j,  1.+0.j,  0.+0.j,  0.+0.j]])
 
     """
-    return _raw_fftnd(a, s, axes, ifft, norm)
+    return _raw_fftnd(a, s, axes, ifft, norm, out=None)
 
 
 @array_function_dispatch(_fftn_dispatcher)
-def rfftn(a, s=None, axes=None, norm=None):
+def rfftn(a, s=None, axes=None, norm=None, out=None):
     """
     Compute the N-dimensional discrete Fourier Transform for real input.
 
@@ -1279,6 +1345,13 @@ def rfftn(a, s=None, axes=None, norm=None):
 
             The "backward", "forward" values were added.
 
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype for all axes (and hence is
+        imcompatible with passing in all but the trivial ``s``).
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : complex ndarray
@@ -1333,14 +1406,14 @@ def rfftn(a, s=None, axes=None, norm=None):
     """
     a = asarray(a)
     s, axes = _cook_nd_args(a, s, axes)
-    a = rfft(a, s[-1], axes[-1], norm)
+    a = rfft(a, s[-1], axes[-1], norm, out=out)
     for ii in range(len(axes)-1):
-        a = fft(a, s[ii], axes[ii], norm)
+        a = fft(a, s[ii], axes[ii], norm, out=out)
     return a
 
 
 @array_function_dispatch(_fftn_dispatcher)
-def rfft2(a, s=None, axes=(-2, -1), norm=None):
+def rfft2(a, s=None, axes=(-2, -1), norm=None, out=None):
     """
     Compute the 2-dimensional FFT of a real array.
 
@@ -1385,6 +1458,13 @@ def rfft2(a, s=None, axes=(-2, -1), norm=None):
 
             The "backward", "forward" values were added.
 
+    out : complex ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype for the last inverse transform.
+        imcompatible with passing in all but the trivial ``s``).
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : ndarray
@@ -1410,11 +1490,11 @@ def rfft2(a, s=None, axes=(-2, -1), norm=None):
            [-12.5 -4.0614962j ,   0.  +0.j        ,   0.  +0.j        ],
            [-12.5-17.20477401j,   0.  +0.j        ,   0.  +0.j        ]])
     """
-    return rfftn(a, s, axes, norm)
+    return rfftn(a, s, axes, norm, out=out)
 
 
 @array_function_dispatch(_fftn_dispatcher)
-def irfftn(a, s=None, axes=None, norm=None):
+def irfftn(a, s=None, axes=None, norm=None, out=None):
     """
     Computes the inverse of `rfftn`.
 
@@ -1483,6 +1563,12 @@ def irfftn(a, s=None, axes=None, norm=None):
 
             The "backward", "forward" values were added.
 
+    out : ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype for the last transformation.
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : ndarray
@@ -1543,12 +1629,12 @@ def irfftn(a, s=None, axes=None, norm=None):
     s, axes = _cook_nd_args(a, s, axes, invreal=1)
     for ii in range(len(axes)-1):
         a = ifft(a, s[ii], axes[ii], norm)
-    a = irfft(a, s[-1], axes[-1], norm)
+    a = irfft(a, s[-1], axes[-1], norm, out=out)
     return a
 
 
 @array_function_dispatch(_fftn_dispatcher)
-def irfft2(a, s=None, axes=(-2, -1), norm=None):
+def irfft2(a, s=None, axes=(-2, -1), norm=None, out=None):
     """
     Computes the inverse of `rfft2`.
 
@@ -1594,6 +1680,12 @@ def irfft2(a, s=None, axes=(-2, -1), norm=None):
 
             The "backward", "forward" values were added.
 
+    out : ndarray, optional
+        If provided, the result will be placed in this array. It should be
+        of the appropriate shape and dtype for the last transformation.
+
+        .. versionadded:: 2.0.0
+
     Returns
     -------
     out : ndarray
@@ -1623,4 +1715,4 @@ def irfft2(a, s=None, axes=(-2, -1), norm=None):
            [3., 3., 3., 3., 3.],
            [4., 4., 4., 4., 4.]])
     """
-    return irfftn(a, s, axes, norm)
+    return irfftn(a, s, axes, norm, out=None)
diff --git a/numpy/fft/_pocketfft.pyi b/numpy/fft/_pocketfft.pyi
index 2bd8b0ba34af..7f088572efe8 100644
--- a/numpy/fft/_pocketfft.pyi
+++ b/numpy/fft/_pocketfft.pyi
@@ -13,6 +13,7 @@ def fft(
     n: None | int = ...,
     axis: int = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def ifft(
@@ -20,6 +21,7 @@ def ifft(
     n: None | int = ...,
     axis: int = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def rfft(
@@ -27,6 +29,7 @@ def rfft(
     n: None | int = ...,
     axis: int = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def irfft(
@@ -34,6 +37,7 @@ def irfft(
     n: None | int = ...,
     axis: int = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[float64] = ...,
 ) -> NDArray[float64]: ...
 
 # Input array must be compatible with `np.conjugate`
@@ -42,6 +46,7 @@ def hfft(
     n: None | int = ...,
     axis: int = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[float64] = ...,
 ) -> NDArray[float64]: ...
 
 def ihfft(
@@ -49,6 +54,7 @@ def ihfft(
     n: None | int = ...,
     axis: int = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def fftn(
@@ -56,6 +62,7 @@ def fftn(
     s: None | Sequence[int] = ...,
     axes: None | Sequence[int] = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def ifftn(
@@ -63,6 +70,7 @@ def ifftn(
     s: None | Sequence[int] = ...,
     axes: None | Sequence[int] = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def rfftn(
@@ -70,6 +78,7 @@ def rfftn(
     s: None | Sequence[int] = ...,
     axes: None | Sequence[int] = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def irfftn(
@@ -77,6 +86,7 @@ def irfftn(
     s: None | Sequence[int] = ...,
     axes: None | Sequence[int] = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[float64] = ...,
 ) -> NDArray[float64]: ...
 
 def fft2(
@@ -84,6 +94,7 @@ def fft2(
     s: None | Sequence[int] = ...,
     axes: None | Sequence[int] = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def ifft2(
@@ -91,6 +102,7 @@ def ifft2(
     s: None | Sequence[int] = ...,
     axes: None | Sequence[int] = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def rfft2(
@@ -98,6 +110,7 @@ def rfft2(
     s: None | Sequence[int] = ...,
     axes: None | Sequence[int] = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[complex128] = ...,
 ) -> NDArray[complex128]: ...
 
 def irfft2(
@@ -105,4 +118,5 @@ def irfft2(
     s: None | Sequence[int] = ...,
     axes: None | Sequence[int] = ...,
     norm: _NormKind = ...,
+    out: None | NDArray[float64] = ...,
 ) -> NDArray[float64]: ...
diff --git a/numpy/fft/_pocketfft_umath.c b/numpy/fft/_pocketfft_umath.c
new file mode 100644
index 000000000000..f33c973a0a98
--- /dev/null
+++ b/numpy/fft/_pocketfft_umath.c
@@ -0,0 +1,358 @@
+/*
+ * This file is part of pocketfft.
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
+ */
+
+/*
+ *  Main implementation file.
+ *
+ *  Copyright (C) 2004-2018 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#define PY_SSIZE_T_CLEAN
+#include <assert.h>
+#include <Python.h>
+
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+
+#include "npy_config.h"
+
+#include "pocketfft/pocketfft.h"
+
+/*
+ * Copy all nin elements of input to the first nin of the output,
+ * and any set any remaining nout-nin output elements to 0
+ * (if nout < nin, copy only nout).
+ */
+
+static inline void
+copy_data(char* in, npy_intp step_in, npy_intp nin,
+          char* out, npy_intp step_out, npy_intp nout, npy_intp elsize)
+{
+    npy_intp ncopy = nin <= nout? nin : nout;
+    if (step_in == elsize && step_out == elsize) {
+        memcpy(out, in, ncopy*elsize);
+    }
+    else {
+        char *ip = in, *op = out;
+        for (npy_intp i = 0; i < ncopy; i++, ip += step_in, op += step_out) {
+            memcpy(op, ip, elsize);
+        }
+    }
+    if (nin < nout) {
+        char *op = out + nin*elsize;
+        if (step_out == elsize) {
+            memset(op, 0, (nout-nin)*elsize);
+        }
+        else {
+            for (npy_intp i = nin; i < nout; i++, op += step_out) {
+                memset(op, 0, elsize);
+            }
+        }
+    }
+}
+
+
+/*
+ * Loops calling the pocketfft code.
+ *
+ * Unfortunately, the gufunc machinery does not (yet?) allow forcing contiguous
+ * inner loop data, so we create a contiguous output buffer if needed
+ * (input gets copied to output before processing, so can be non-contiguous).
+ */
+static void
+fft_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    char *ip = args[0], *fp = args[1], *op = args[2];
+    npy_intp n_outer = dimensions[0];
+    npy_intp si = steps[0], sf = steps[1], so = steps[2];
+    npy_intp nin = dimensions[1], nout = dimensions[2];
+    npy_intp step_in = steps[3], step_out = steps[4];
+    int (*cfft_function)(cfft_plan, double *, double) = func;
+    npy_intp npts = nout;
+    cfft_plan plan;
+    char *buff = NULL;
+    int no_mem = 1;
+
+    if (nout == 0) {
+        return;  /* no output to set */
+    }
+
+    plan = make_cfft_plan(npts);
+    if (plan == NULL) {
+        goto fail;
+    }
+    if (step_out != sizeof(npy_cdouble)) {
+        buff = malloc(npts * sizeof(npy_cdouble));
+        if (buff == NULL) {
+            goto fail;
+        }
+    }
+
+    for (npy_intp i = 0; i < n_outer; i++, ip += si, fp += sf, op += so) {
+        double fct = *(double *)fp;
+        char *op_or_buff = buff == NULL ? op : buff;
+        if (ip != op_or_buff) {  /* no copy needed if in-place already */
+            copy_data(ip, step_in, nin,
+                      op_or_buff, sizeof(npy_cdouble), npts, sizeof(npy_cdouble));
+        }
+        if ((no_mem = cfft_function(plan, (double *)op_or_buff, fct)) != 0) {
+            break;
+        }
+        if (op_or_buff == buff) {
+            copy_data(op_or_buff, sizeof(npy_cdouble), npts,
+                      op, step_out, npts, sizeof(npy_cdouble));
+        }
+    }
+  fail:
+    free(buff);
+    destroy_cfft_plan(plan);  /* uses free so can be passed NULL */
+    if (no_mem) {
+        /* TODO: Requires use of new ufunc API to indicate error return */
+        NPY_ALLOW_C_API_DEF
+        NPY_ALLOW_C_API;
+        PyErr_NoMemory();
+        NPY_DISABLE_C_API;
+    }
+    return;
+}
+
+
+
+static void
+rfft_impl(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func,
+    npy_intp npts)
+{
+    char *ip = args[0], *fp = args[1], *op = args[2];
+    npy_intp n_outer = dimensions[0];
+    npy_intp si = steps[0], sf = steps[1], so = steps[2];
+    npy_intp nin = dimensions[1], nout = dimensions[2];
+    npy_intp step_in = steps[3], step_out = steps[4];
+    rfft_plan plan;
+    char *buff = NULL;
+    int no_mem = 1;
+
+    if (nout == 0) {
+        return;
+    }
+
+    plan = make_rfft_plan(npts);
+    if (plan == NULL) {
+        goto fail;
+    }
+    if (step_out != sizeof(npy_cdouble)){
+        buff = malloc(nout * sizeof(npy_cdouble));
+        if (buff == NULL) {
+            goto fail;
+        }
+    }
+
+    for (npy_intp i = 0; i < n_outer; i++, ip += si, fp += sf, op += so) {
+        double fct = *(double *)fp;
+        char *op_or_buff = buff == NULL ? op : buff;
+        double *op_double = (double *)op_or_buff;
+        /* Copy dat to buffer in starting at position 1, as expected for FFTpack order */
+        copy_data(ip, step_in, nin,
+                  (char *)&op_double[1], sizeof(npy_double), nout*2 - 1, sizeof(npy_double));
+        if ((no_mem = rfft_forward(plan, &op_double[1], fct)) != 0) {
+            break;
+        }
+        op_double[0] = op_double[1];
+        op_double[1] = 0.;
+        if (op_or_buff == buff) {
+            copy_data(op_or_buff, sizeof(npy_cdouble), nout,
+                      op, step_out, nout, sizeof(npy_cdouble));
+        }
+    }
+  fail:
+    free(buff);
+    destroy_rfft_plan(plan);
+    if (no_mem) {
+        /* TODO: Requires use of new ufunc API to indicate error return */
+        NPY_ALLOW_C_API_DEF
+        NPY_ALLOW_C_API;
+        PyErr_NoMemory();
+        NPY_DISABLE_C_API;
+    }
+    return;
+}
+
+/*
+ * For the forward real, we cannot know what the requested number of points is
+ * just based on the number of points in the complex output array (e.g., 10
+ * and 11 real input points both lead to 6 complex output points), so we
+ * define versions for both even and odd number of points.
+ */
+static void
+rfft_n_even_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    npy_intp npts = 2 * dimensions[2] - 2;
+    rfft_impl(args, dimensions, steps, func, npts);
+}
+
+static void
+rfft_n_odd_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    npy_intp npts = 2 * dimensions[2] - 1;
+    rfft_impl(args, dimensions, steps, func, npts);
+}
+
+
+static void
+irfft_loop(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    char *ip = args[0], *fp = args[1], *op = args[2];
+    npy_intp n_outer = dimensions[0];
+    npy_intp si = steps[0], sf = steps[1], so = steps[2];
+    npy_intp nin = dimensions[1], nout = dimensions[2];
+    npy_intp step_in = steps[3], step_out = steps[4];
+    npy_intp npts = nout;
+    rfft_plan plan;
+    char *buff = NULL;
+    int no_mem = 1;
+
+    if (nout == 0) {
+        return;
+    }
+
+    plan = make_rfft_plan(npts);
+    if (plan == NULL) {
+        goto fail;
+    }
+    if (step_out != sizeof(npy_double)) {
+        buff = malloc(npts * sizeof(npy_double));
+        if (buff == NULL) {
+            goto fail;
+        }
+    }
+
+    for (npy_intp i = 0; i < n_outer; i++, ip += si, fp += sf, op += so) {
+        double fct = *(double *)fp;
+        char *op_or_buff = buff == NULL ? op : buff;
+        double *op_double = (double *)op_or_buff;
+        /* Copy complex input to buffer in FFTpack order */
+        op_double[0] = ((double *)ip)[0];
+        if (nin > 1) {
+            copy_data(ip + step_in, step_in, nin - 2,
+                      (char *)&op_double[1], sizeof(npy_cdouble), npts / 2 - 1,
+                      sizeof(npy_cdouble));
+            npy_intp ncopied = (npts / 2 - 1  < nin - 2 ? npts / 2 - 1 : nin - 2);
+            double *last = (double *)(ip + (ncopied + 1) * step_in);
+            op_double[ncopied * 2 + 1] = last[0];
+            if (npts % 2 == 1) {  /* For odd n, we still imag real of the last point */
+                op_double[ncopied * 2 + 2] = last[1];
+            }
+        }
+        if ((no_mem = rfft_backward(plan, op_double, fct)) != 0) {
+            break;
+        }
+        if (op_or_buff == buff) {
+            copy_data(op_or_buff, sizeof(npy_double), npts,
+                      op, step_out, npts, sizeof(npy_double));
+        }
+    }
+  fail:
+    free(buff);
+    destroy_rfft_plan(plan);
+    if (no_mem) {
+        /* TODO: Requires use of new ufunc API to indicate error return */
+        NPY_ALLOW_C_API_DEF
+        NPY_ALLOW_C_API;
+        PyErr_NoMemory();
+        NPY_DISABLE_C_API;
+    }
+    return;
+}
+
+
+static PyUFuncGenericFunction fft_functions[] = { fft_loop };
+static char fft_types[] = { NPY_CDOUBLE, NPY_DOUBLE, NPY_CDOUBLE};
+static void *fft_data[] = { &cfft_forward };
+static void *ifft_data[] = { &cfft_backward };
+
+static PyUFuncGenericFunction rfft_n_even_functions[] = { rfft_n_even_loop };
+static PyUFuncGenericFunction rfft_n_odd_functions[] = { rfft_n_odd_loop };
+static char rfft_types[] = { NPY_DOUBLE, NPY_DOUBLE, NPY_CDOUBLE};
+static void *rfft_data[] = { (void *)NULL };
+
+static PyUFuncGenericFunction irfft_functions[] = { irfft_loop };
+static char irfft_types[] = { NPY_CDOUBLE, NPY_DOUBLE, NPY_DOUBLE};
+static void *irfft_data[] = { (void *)NULL };
+
+static int
+add_gufuncs(PyObject *dictionary) {
+    PyObject *f;
+
+    f = PyUFunc_FromFuncAndDataAndSignature(
+        fft_functions, fft_data, fft_types, 1, 2, 1, PyUFunc_None,
+        "fft", "complex forward FFT\n", 0, "(n),()->(m)");
+    if (f == NULL) {
+        return -1;
+    }
+    PyDict_SetItemString(dictionary, "fft", f);
+    Py_DECREF(f);
+    f = PyUFunc_FromFuncAndDataAndSignature(
+        fft_functions, ifft_data, fft_types, 1, 2, 1, PyUFunc_None,
+        "ifft", "complex backward FFT\n", 0, "(m),()->(n)");
+    if (f == NULL) {
+        return -1;
+    }
+    PyDict_SetItemString(dictionary, "ifft", f);
+    Py_DECREF(f);
+    f = PyUFunc_FromFuncAndDataAndSignature(
+        rfft_n_even_functions, rfft_data, rfft_types, 1, 2, 1, PyUFunc_None,
+        "rfft_n_even", "real forward FFT for even n\n", 0, "(n),()->(m)");
+    if (f == NULL) {
+        return -1;
+    }
+    PyDict_SetItemString(dictionary, "rfft_n_even", f);
+    Py_DECREF(f);
+    f = PyUFunc_FromFuncAndDataAndSignature(
+        rfft_n_odd_functions, rfft_data, rfft_types, 1, 2, 1, PyUFunc_None,
+        "rfft_n_odd", "real forward FFT for odd n\n", 0, "(n),()->(m)");
+    if (f == NULL) {
+        return -1;
+    }
+    PyDict_SetItemString(dictionary, "rfft_n_odd", f);
+    Py_DECREF(f);
+    f = PyUFunc_FromFuncAndDataAndSignature(
+        irfft_functions, irfft_data, irfft_types, 1, 2, 1, PyUFunc_None,
+        "irfft", "real backward FFT\n", 0, "(m),()->(n)");
+    if (f == NULL) {
+        return -1;
+    }
+    PyDict_SetItemString(dictionary, "irfft", f);
+    Py_DECREF(f);
+    return 0;
+}
+
+static struct PyModuleDef moduledef = {
+    PyModuleDef_HEAD_INIT,
+    .m_name = "_umath_pocketfft",
+    .m_size = -1,
+};
+
+/* Initialization function for the module */
+PyMODINIT_FUNC PyInit__pocketfft_umath(void)
+{
+    PyObject *m = PyModule_Create(&moduledef);
+    if (m == NULL) {
+        return NULL;
+    }
+
+    /* Import the array and ufunc objects */
+    import_array();
+    import_ufunc();
+
+    PyObject *d = PyModule_GetDict(m);
+    if (add_gufuncs(d) < 0) {
+        Py_DECREF(d);
+        Py_DECREF(m);
+        return NULL;
+    }
+
+    return m;
+}
diff --git a/numpy/fft/meson.build b/numpy/fft/meson.build
index 321a6d10862e..e2d4e00c05c5 100644
--- a/numpy/fft/meson.build
+++ b/numpy/fft/meson.build
@@ -3,8 +3,8 @@ if host_machine.system() == 'aix' or host_machine.system() == 'AIX'
   largefile_define += '-D_LARGE_FILES'
 endif
 
-py.extension_module('_pocketfft_internal',
-  '_pocketfft.c',
+py.extension_module('_pocketfft_umath',
+  ['_pocketfft_umath.c', 'pocketfft/pocketfft.c'],
   c_args: largefile_define,
   dependencies: np_core_dep,
   install: true,
diff --git a/numpy/fft/README.md b/numpy/fft/pocketfft/README.md
similarity index 100%
rename from numpy/fft/README.md
rename to numpy/fft/pocketfft/README.md
diff --git a/numpy/fft/_pocketfft.c b/numpy/fft/pocketfft/pocketfft.c
similarity index 91%
rename from numpy/fft/_pocketfft.c
rename to numpy/fft/pocketfft/pocketfft.c
index 37649386fda3..669ebacc1bbc 100644
--- a/numpy/fft/_pocketfft.c
+++ b/numpy/fft/pocketfft/pocketfft.c
@@ -9,18 +9,12 @@
  *  Copyright (C) 2004-2018 Max-Planck-Society
  *  \author Martin Reinecke
  */
-#define NPY_NO_DEPRECATED_API NPY_API_VERSION
-
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-
-#include "numpy/arrayobject.h"
-
-#include "npy_config.h"
 
 #include <math.h>
 #include <string.h>
-#include <stdlib.h>
+#include <assert.h>
+
+#include "pocketfft.h"
 
 #define RALLOC(type,num) \
   (assert(num != 0), ((type *)malloc((num)*sizeof(type))))
@@ -38,11 +32,6 @@
 #define WARN_UNUSED_RESULT
 #endif
 
-struct cfft_plan_i;
-typedef struct cfft_plan_i * cfft_plan;
-struct rfft_plan_i;
-typedef struct rfft_plan_i * rfft_plan;
-
 // adapted from https://stackoverflow.com/questions/42792939/
 // CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
 static void my_sincosm1pi (double a, double *restrict res)
@@ -2078,7 +2067,7 @@ typedef struct cfft_plan_i
   fftblue_plan blueplan;
   } cfft_plan_i;
 
-static cfft_plan make_cfft_plan (size_t length)
+cfft_plan make_cfft_plan (size_t length)
   {
   if (length==0) return NULL;
   cfft_plan plan = RALLOC(cfft_plan_i,1);
@@ -2107,7 +2096,7 @@ static cfft_plan make_cfft_plan (size_t length)
   return plan;
   }
 
-static void destroy_cfft_plan (cfft_plan plan)
+void destroy_cfft_plan (cfft_plan plan)
   {
   if (plan->blueplan)
     destroy_fftblue_plan(plan->blueplan);
@@ -2116,7 +2105,7 @@ static void destroy_cfft_plan (cfft_plan plan)
   DEALLOC(plan);
   }
 
-WARN_UNUSED_RESULT static int cfft_backward(cfft_plan plan, double c[], double fct)
+WARN_UNUSED_RESULT int cfft_backward(cfft_plan plan, double c[], double fct)
   {
   if (plan->packplan)
     return cfftp_backward(plan->packplan,c,fct);
@@ -2124,7 +2113,7 @@ WARN_UNUSED_RESULT static int cfft_backward(cfft_plan plan, double c[], double f
   return cfftblue_backward(plan->blueplan,c,fct);
   }
 
-WARN_UNUSED_RESULT static int cfft_forward(cfft_plan plan, double c[], double fct)
+WARN_UNUSED_RESULT int cfft_forward(cfft_plan plan, double c[], double fct)
   {
   if (plan->packplan)
     return cfftp_forward(plan->packplan,c,fct);
@@ -2138,7 +2127,7 @@ typedef struct rfft_plan_i
   fftblue_plan blueplan;
   } rfft_plan_i;
 
-static rfft_plan make_rfft_plan (size_t length)
+rfft_plan make_rfft_plan (size_t length)
   {
   if (length==0) return NULL;
   rfft_plan plan = RALLOC(rfft_plan_i,1);
@@ -2167,7 +2156,7 @@ static rfft_plan make_rfft_plan (size_t length)
   return plan;
   }
 
-static void destroy_rfft_plan (rfft_plan plan)
+void destroy_rfft_plan (rfft_plan plan)
   {
   if (plan->blueplan)
     destroy_fftblue_plan(plan->blueplan);
@@ -2176,7 +2165,19 @@ static void destroy_rfft_plan (rfft_plan plan)
   DEALLOC(plan);
   }
 
-WARN_UNUSED_RESULT static int rfft_backward(rfft_plan plan, double c[], double fct)
+size_t rfft_length(rfft_plan plan)
+  {
+  if (plan->packplan) return plan->packplan->length;
+  return plan->blueplan->n;
+  }
+
+size_t cfft_length(cfft_plan plan)
+  {
+  if (plan->packplan) return plan->packplan->length;
+  return plan->blueplan->n;
+  }
+
+WARN_UNUSED_RESULT int rfft_backward(rfft_plan plan, double c[], double fct)
   {
   if (plan->packplan)
     return rfftp_backward(plan->packplan,c,fct);
@@ -2184,201 +2185,10 @@ WARN_UNUSED_RESULT static int rfft_backward(rfft_plan plan, double c[], double f
     return rfftblue_backward(plan->blueplan,c,fct);
   }
 
-WARN_UNUSED_RESULT static int rfft_forward(rfft_plan plan, double c[], double fct)
+WARN_UNUSED_RESULT int rfft_forward(rfft_plan plan, double c[], double fct)
   {
   if (plan->packplan)
     return rfftp_forward(plan->packplan,c,fct);
   else // if (plan->blueplan)
     return rfftblue_forward(plan->blueplan,c,fct);
   }
-
-static PyObject *
-execute_complex(PyObject *a1, int is_forward, double fct)
-{
-    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
-            PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
-            NPY_ARRAY_ENSURECOPY | NPY_ARRAY_DEFAULT |
-            NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
-            NULL);
-    if (!data) return NULL;
-
-    int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    cfft_plan plan=NULL;
-
-    int nrepeats = PyArray_SIZE(data)/npts;
-    double *dptr = (double *)PyArray_DATA(data);
-    int fail=0;
-    Py_BEGIN_ALLOW_THREADS;
-    plan = make_cfft_plan(npts);
-    if (!plan) fail=1;
-    if (!fail)
-      for (int i = 0; i < nrepeats; i++) {
-          int res = is_forward ?
-            cfft_forward(plan, dptr, fct) : cfft_backward(plan, dptr, fct);
-          if (res!=0) { fail=1; break; }
-          dptr += npts*2;
-      }
-    if (plan) destroy_cfft_plan(plan);
-    Py_END_ALLOW_THREADS;
-    if (fail) {
-      Py_XDECREF(data);
-      return PyErr_NoMemory();
-    }
-    return (PyObject *)data;
-}
-
-static PyObject *
-execute_real_forward(PyObject *a1, double fct)
-{
-    rfft_plan plan=NULL;
-    int fail = 0;
-    npy_intp tdim[NPY_MAXDIMS];
-
-    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
-            PyArray_DescrFromType(NPY_DOUBLE), 1, 0,
-            NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
-            NULL);
-    if (!data) return NULL;
-
-    int ndim = PyArray_NDIM(data);
-    const npy_intp *odim = PyArray_DIMS(data);
-    int npts = odim[ndim - 1];
-    for (int d=0; d<ndim-1; ++d)
-      tdim[d] = odim[d];
-    tdim[ndim-1] = npts/2 + 1;
-    PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(ndim,
-            tdim, PyArray_DescrFromType(NPY_CDOUBLE), 0);
-    if (!ret) fail=1;
-    if (!fail) {
-      int rstep = PyArray_DIM(ret, PyArray_NDIM(ret) - 1)*2;
-
-      int nrepeats = PyArray_SIZE(data)/npts;
-      double *rptr = (double *)PyArray_DATA(ret),
-             *dptr = (double *)PyArray_DATA(data);
-
-      Py_BEGIN_ALLOW_THREADS;
-      plan = make_rfft_plan(npts);
-      if (!plan) fail=1;
-      if (!fail)
-        for (int i = 0; i < nrepeats; i++) {
-            rptr[rstep-1] = 0.0;
-            memcpy((char *)(rptr+1), dptr, npts*sizeof(double));
-            if (rfft_forward(plan, rptr+1, fct)!=0) {fail=1; break;}
-            rptr[0] = rptr[1];
-            rptr[1] = 0.0;
-            rptr += rstep;
-            dptr += npts;
-      }
-      if (plan) destroy_rfft_plan(plan);
-      Py_END_ALLOW_THREADS;
-    }
-    if (fail) {
-      Py_XDECREF(data);
-      Py_XDECREF(ret);
-      return PyErr_NoMemory();
-    }
-    Py_DECREF(data);
-    return (PyObject *)ret;
-}
-static PyObject *
-execute_real_backward(PyObject *a1, double fct)
-{
-    rfft_plan plan=NULL;
-    PyArrayObject *data = (PyArrayObject *)PyArray_FromAny(a1,
-            PyArray_DescrFromType(NPY_CDOUBLE), 1, 0,
-            NPY_ARRAY_DEFAULT | NPY_ARRAY_ENSUREARRAY | NPY_ARRAY_FORCECAST,
-            NULL);
-    if (!data) return NULL;
-    int npts = PyArray_DIM(data, PyArray_NDIM(data) - 1);
-    PyArrayObject *ret = (PyArrayObject *)PyArray_Empty(PyArray_NDIM(data),
-            PyArray_DIMS(data), PyArray_DescrFromType(NPY_DOUBLE), 0);
-    int fail = 0;
-    if (!ret) fail=1;
-    if (!fail) {
-      int nrepeats = PyArray_SIZE(ret)/npts;
-      double *rptr = (double *)PyArray_DATA(ret),
-             *dptr = (double *)PyArray_DATA(data);
-
-      Py_BEGIN_ALLOW_THREADS;
-      plan = make_rfft_plan(npts);
-      if (!plan) fail=1;
-      if (!fail) {
-        for (int i = 0; i < nrepeats; i++) {
-          memcpy((char *)(rptr + 1), (dptr + 2), (npts - 1)*sizeof(double));
-          rptr[0] = dptr[0];
-          if (rfft_backward(plan, rptr, fct)!=0) {fail=1; break;}
-          rptr += npts;
-          dptr += npts*2;
-        }
-      }
-      if (plan) destroy_rfft_plan(plan);
-      Py_END_ALLOW_THREADS;
-    }
-    if (fail) {
-      Py_XDECREF(data);
-      Py_XDECREF(ret);
-      return PyErr_NoMemory();
-    }
-    Py_DECREF(data);
-    return (PyObject *)ret;
-}
-
-static PyObject *
-execute_real(PyObject *a1, int is_forward, double fct)
-{
-    return is_forward ? execute_real_forward(a1, fct)
-                      : execute_real_backward(a1, fct);
-}
-
-static const char execute__doc__[] = "";
-
-static PyObject *
-execute(PyObject *NPY_UNUSED(self), PyObject *args)
-{
-    PyObject *a1;
-    int is_real, is_forward;
-    double fct;
-
-    if(!PyArg_ParseTuple(args, "Oiid:execute", &a1, &is_real, &is_forward, &fct)) {
-        return NULL;
-    }
-
-    return is_real ? execute_real(a1, is_forward, fct)
-                   : execute_complex(a1, is_forward, fct);
-}
-
-/* List of methods defined in the module */
-
-static struct PyMethodDef methods[] = {
-    {"execute",   execute,   1, execute__doc__},
-    {NULL, NULL, 0, NULL}          /* sentinel */
-};
-
-static struct PyModuleDef moduledef = {
-        PyModuleDef_HEAD_INIT,
-        "_pocketfft_internal",
-        NULL,
-        -1,
-        methods,
-        NULL,
-        NULL,
-        NULL,
-        NULL
-};
-
-/* Initialization function for the module */
-PyMODINIT_FUNC PyInit__pocketfft_internal(void)
-{
-    PyObject *m;
-    m = PyModule_Create(&moduledef);
-    if (m == NULL) {
-        return NULL;
-    }
-
-    /* Import the array object */
-    import_array();
-
-    /* XXXX Add constants here */
-
-    return m;
-}
diff --git a/numpy/fft/pocketfft/pocketfft.h b/numpy/fft/pocketfft/pocketfft.h
new file mode 100644
index 000000000000..e5c4c0ebf430
--- /dev/null
+++ b/numpy/fft/pocketfft/pocketfft.h
@@ -0,0 +1,35 @@
+/*
+ * This file is part of pocketfft.
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
+ */
+
+/*! \file pocketfft.h
+ *  Public interface of the pocketfft library
+ *
+ *  Copyright (C) 2008-2018 Max-Planck-Society
+ *  \author Martin Reinecke
+ */
+
+#ifndef POCKETFFT_H
+#define POCKETFFT_H
+
+#include <stdlib.h>
+#include "numpy/numpyconfig.h"  // for NPY_VISIBILITY_HIDDEN
+
+NPY_VISIBILITY_HIDDEN struct cfft_plan_i;
+typedef struct cfft_plan_i * cfft_plan;
+NPY_VISIBILITY_HIDDEN cfft_plan make_cfft_plan (size_t length);
+NPY_VISIBILITY_HIDDEN void destroy_cfft_plan (cfft_plan plan);
+NPY_VISIBILITY_HIDDEN int cfft_backward(cfft_plan plan, double c[], double fct);
+NPY_VISIBILITY_HIDDEN int cfft_forward(cfft_plan plan, double c[], double fct);
+NPY_VISIBILITY_HIDDEN size_t cfft_length(cfft_plan plan);
+
+NPY_VISIBILITY_HIDDEN struct rfft_plan_i;
+typedef struct rfft_plan_i * rfft_plan;
+NPY_VISIBILITY_HIDDEN rfft_plan make_rfft_plan (size_t length);
+NPY_VISIBILITY_HIDDEN void destroy_rfft_plan (rfft_plan plan);
+NPY_VISIBILITY_HIDDEN int rfft_backward(rfft_plan plan, double c[], double fct);
+NPY_VISIBILITY_HIDDEN int rfft_forward(rfft_plan plan, double c[], double fct);
+NPY_VISIBILITY_HIDDEN size_t rfft_length(rfft_plan plan);
+
+#endif
diff --git a/numpy/fft/tests/test_pocketfft.py b/numpy/fft/tests/test_pocketfft.py
index 8ed0c8fb4216..9a66b52f2f7a 100644
--- a/numpy/fft/tests/test_pocketfft.py
+++ b/numpy/fft/tests/test_pocketfft.py
@@ -42,6 +42,106 @@ def test_fft(self):
         assert_allclose(fft1(x) / 30.,
                         np.fft.fft(x, norm="forward"), atol=1e-6)
 
+    @pytest.mark.parametrize("axis", (0, 1))
+    @pytest.mark.parametrize("dtype", (complex, float))
+    @pytest.mark.parametrize("transpose", (True, False))
+    def test_fft_out_argument(self, dtype, transpose, axis):
+        def zeros_like(x):
+            if transpose:
+                return np.zeros_like(x.T).T
+            else:
+                return np.zeros_like(x)
+
+        # tests below only test the out parameter
+        if dtype is complex:
+            y = random((10, 20)) + 1j*random((10, 20))
+            fft, ifft = np.fft.fft, np.fft.ifft
+        else:
+            y = random((10, 20))
+            fft, ifft = np.fft.rfft, np.fft.irfft
+
+        expected = fft(y, axis=axis)
+        out = zeros_like(expected)
+        result = fft(y, out=out, axis=axis)
+        assert result is out
+        assert_array_equal(result, expected)
+
+        expected2 = ifft(expected, axis=axis)
+        out2 = out if dtype is complex else zeros_like(expected2)
+        result2 = ifft(out, out=out2, axis=axis)
+        assert result2 is out2
+        assert_array_equal(result2, expected2)
+
+    @pytest.mark.parametrize("axis", [0, 1])
+    def test_fft_inplace_out(self, axis):
+        # Test some weirder in-place combinations
+        y = random((20, 20)) + 1j*random((20, 20))
+        # Fully in-place.
+        y1 = y.copy()
+        expected1 = np.fft.fft(y1, axis=axis)
+        result1 = np.fft.fft(y1, axis=axis, out=y1)
+        assert result1 is y1
+        assert_array_equal(result1, expected1)
+        # In-place of part of the array; rest should be unchanged.
+        y2 = y.copy()
+        out2 = y2[:10] if axis == 0 else y2[:, :10]
+        expected2 = np.fft.fft(y2, n=10, axis=axis)
+        result2 = np.fft.fft(y2, n=10, axis=axis, out=out2)
+        assert result2 is out2
+        assert_array_equal(result2, expected2)
+        if axis == 0:
+            assert_array_equal(y2[10:], y[10:])
+        else:
+            assert_array_equal(y2[:, 10:], y[:, 10:])
+        # In-place of another part of the array.
+        y3 = y.copy()
+        y3_sel = y3[5:] if axis == 0 else y3[:, 5:]
+        out3 = y3[5:15] if axis == 0 else y3[:, 5:15]
+        expected3 = np.fft.fft(y3_sel, n=10, axis=axis)
+        result3 = np.fft.fft(y3_sel, n=10, axis=axis, out=out3)
+        assert result3 is out3
+        assert_array_equal(result3, expected3)
+        if axis == 0:
+            assert_array_equal(y3[:5], y[:5])
+            assert_array_equal(y3[15:], y[15:])
+        else:
+            assert_array_equal(y3[:, :5], y[:, :5])
+            assert_array_equal(y3[:, 15:], y[:, 15:])
+        # In-place with n > nin; rest should be unchanged.
+        y4 = y.copy()
+        y4_sel = y[:10] if axis == 0 else y4[:, :10]
+        out4 = y4[:15] if axis == 0 else y4[:, :15]
+        expected4 = np.fft.fft(y4_sel, n=15, axis=axis)
+        result4 = np.fft.fft(y4_sel, n=15, axis=axis, out=out4)
+        assert result4 is out4
+        assert_array_equal(result4, expected4)
+        if axis == 0:
+            assert_array_equal(y4[15:], y[15:])
+        else:
+            assert_array_equal(y4[:, 15:], y[:, 15:])
+        # Overwrite in a transpose.
+        y5 = y.copy()
+        out5 = y5.T
+        result5 = np.fft.fft(y5, axis=axis, out=out5)
+        assert result5 is out5
+        assert_array_equal(result5, expected1)
+        # Reverse strides.
+        y6 = y.copy()
+        out6 = y6[::-1] if axis == 0 else y6[:, ::-1]
+        result6 = np.fft.fft(y6, axis=axis, out=out6)
+        assert result6 is out6
+        assert_array_equal(result6, expected1)
+
+    def test_fft_bad_out(self):
+        x = np.arange(30.)
+        with pytest.raises(TypeError, match="must be of ArrayType"):
+            np.fft.fft(x, out="")
+        with pytest.raises(ValueError, match="has wrong shape"):
+            np.fft.fft(x, out=np.zeros_like(x).reshape(5, -1))
+        with pytest.raises(TypeError, match="Cannot cast"):
+            np.fft.fft(x, out=np.zeros_like(x, dtype=float))
+
+
     @pytest.mark.parametrize('norm', (None, 'backward', 'ortho', 'forward'))
     def test_ifft(self, norm):
         x = random(30) + 1j*random(30)
@@ -256,6 +356,64 @@ def test_dtypes(self, dtype):
         assert_allclose(np.fft.ifft(np.fft.fft(x)), x, atol=1e-6)
         assert_allclose(np.fft.irfft(np.fft.rfft(x)), x, atol=1e-6)
 
+    @pytest.mark.parametrize("axes", [(0, 1), (0, 2), None])
+    @pytest.mark.parametrize("dtype", (complex, float))
+    @pytest.mark.parametrize("transpose", (True, False))
+    def test_fftn_out_argument(self, dtype, transpose, axes):
+        def zeros_like(x):
+            if transpose:
+                return np.zeros_like(x.T).T
+            else:
+                return np.zeros_like(x)
+
+        # tests below only test the out parameter
+        if dtype is complex:
+            x = random((10, 5, 6)) + 1j*random((10, 5, 6))
+            fft, ifft = np.fft.fftn, np.fft.ifftn
+        else:
+            x = random((10, 5, 6))
+            fft, ifft = np.fft.rfftn, np.fft.irfftn
+
+        expected = fft(x, axes=axes)
+        out = zeros_like(expected)
+        result = fft(x, out=out, axes=axes)
+        assert result is out
+        assert_array_equal(result, expected)
+
+        expected2 = ifft(expected, axes=axes)
+        out2 = out if dtype is complex else zeros_like(expected2)
+        result2 = ifft(out, out=out2, axes=axes)
+        assert result2 is out2
+        assert_array_equal(result2, expected2)
+
+    @pytest.mark.parametrize("fft", [np.fft.fftn, np.fft.ifftn, np.fft.rfftn])
+    def test_fftn_out_and_s_interaction(self, fft):
+        # With s, shape varies, so generally one cannot pass in out.
+        if fft is np.fft.rfftn:
+            x = random((10, 5, 6))
+        else:
+            x = random((10, 5, 6)) + 1j*random((10, 5, 6))
+        with pytest.raises(ValueError, match="has wrong shape"):
+            fft(x, out=np.zeros_like(x), s=(3, 3, 3), axes=(0, 1, 2))
+        # Except on the first axis done (which is the last of axes).
+        s = (10, 5, 5)
+        expected = fft(x, s=s, axes=(0, 1, 2))
+        out = np.zeros_like(expected)
+        result = fft(x, s=s, axes=(0, 1, 2), out=out)
+        assert result is out
+        assert_array_equal(result, expected)
+
+    @pytest.mark.parametrize("s", [(9, 5, 5), (3, 3, 3)])
+    def test_irfftn_out_and_s_interaction(self, s):
+        # Since for irfftn, the output is real and thus cannot be used for
+        # intermediate steps, it should always work.
+        x = random((9, 5, 6, 2)) + 1j*random((9, 5, 6, 2))
+        expected = np.fft.irfftn(x, s=s, axes=(0, 1, 2))
+        out = np.zeros_like(expected)
+        result = np.fft.irfftn(x, s=s, axes=(0, 1, 2), out=out)
+        assert result is out
+        assert_array_equal(result, expected)
+
 
 @pytest.mark.parametrize(
         "dtype",