gh-119609: Add PyUnicode_AsNativeFormat() function

vstinner · vstinner · commit f6b67a37b930 · 2024-05-27T17:46:22.000+02:00
Add PyUnicode_AsNativeFormat() and PyUnicode_FromNativeFormat()
functions to the C API.
diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst
@@ -341,6 +341,53 @@ APIs:
    .. versionadded:: 3.3
 
 
+.. c:function:: const void* PyUnicode_AsNativeFormat(PyObject *unicode, Py_ssize_t *size, int *native_format)
+
+   Get the contents of a string in its native format.
+
+   * Return the contents, set *\*size* (in bytes) and *\*native_format* on success.
+   * Set an exception and return ``NULL`` on error.
+
+   The contents are valid as long as *unicode* is valid.
+
+   *unicode*, *size* and *native_format* must not be ``NULL``.
+
+   *\*native_format* is set to one of these native formats:
+
+   .. c:namespace:: NULL
+
+   ========================================  =====  ============================
+   Constant Identifier                       Value  Description
+   ========================================  =====  ============================
+   .. c:macro:: PyUnicode_NATIVE_ASCII       ``1``  ASCII string (``Py_UCS1*``)
+   .. c:macro:: PyUnicode_NATIVE_UCS1        ``2``  UCS-1 string (``Py_UCS1*``)
+   .. c:macro:: PyUnicode_NATIVE_UCS2        ``3``  UCS-2 string (``Py_UCS2*``)
+   .. c:macro:: PyUnicode_NATIVE_UCS4        ``4``  UCS-4 string (``Py_UCS4*``)
+   .. c:macro:: PyUnicode_NATIVE_UTF8        ``5``  UTF-8 string (``char*``)
+   ========================================  =====  ============================
+
+   .. impl-detail::
+      In CPython, the :c:macro:`PyUnicode_NATIVE_UTF8` format is not used by
+      :c:func:`PyUnicode_AsNativeFormat`, but it's accepted by
+      :c:func:`PyUnicode_FromNativeFormat`.
+
+   .. versionadded:: 3.14
+
+
+.. c:function:: PyObject* PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, int native_format)
+
+   Create a string object from a native format string.
+
+   * Return a reference to a new string object on success.
+   * Set an exception and return ``NULL`` on error.
+
+   *data* must not be ``NULL``. *size* is in bytes and must not be negative.
+
+   See :c:func:`PyUnicode_AsNativeFormat` for the available native formats.
+
+   .. versionadded:: 3.14
+
+
 .. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
                                                     Py_ssize_t size)
 
diff --git a/Doc/data/stable_abi.dat b/Doc/data/stable_abi.dat
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst
@@ -217,6 +217,12 @@ C API Changes
 New Features
 ------------
 
+* Add :c:func:`PyUnicode_AsNativeFormat` and
+  :c:func:`PyUnicode_FromNativeFormat` functions to import and export strings
+  in their native format.
+  (Contributed by Victor Stinner in :gh:`119609`.)
+
+
 Porting to Python 3.14
 ----------------------
 
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
@@ -248,6 +248,28 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
     const char *u              /* UTF-8 encoded string */
     );
 
+#define PyUnicode_NATIVE_ASCII 1  // ASCII string (Py_UCS1*)
+#define PyUnicode_NATIVE_UCS1 2   // UCS-1 string (Py_UCS1*)
+#define PyUnicode_NATIVE_UCS2 3   // UCS-2 string (Py_UCS2*)
+#define PyUnicode_NATIVE_UCS4 4   // UCS-4 string (Py_UCS4*)
+#define PyUnicode_NATIVE_UTF8 5   // UTF-8 string (char*)
+
+// Get the content of a string in its native format.
+// - Return the content, set '*size' (in bytes) and '*native_format' on success.
+// - Set an exception and return NULL on error.
+PyAPI_FUNC(const void*) PyUnicode_AsNativeFormat(
+    PyObject *unicode,
+    Py_ssize_t *size,       // output: size of the returned content in bytes
+    int *native_format);    // output: one of the PyUnicode_NATIVE_* constants
+
+// Create a string object from a native format string.
+// - Return a reference to a new string object on success.
+// - Set an exception and return NULL on error.
+PyAPI_FUNC(PyObject*) PyUnicode_FromNativeFormat(
+    const void *data,
+    Py_ssize_t size,        // size of 'data' in bytes, must not be negative
+    int native_format);     // one of the PyUnicode_NATIVE_* constants
+
 /* --- wchar_t support for platforms which support it --------------------- */
 
 #ifdef HAVE_WCHAR_H
diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
@@ -24,6 +24,14 @@ class Str(str):
     pass
 
 
+PyUnicode_NATIVE_ASCII = 1  # same values as in Include/unicodeobject.h
+PyUnicode_NATIVE_UCS1 = 2
+PyUnicode_NATIVE_UCS2 = 3
+PyUnicode_NATIVE_UCS4 = 4
+PyUnicode_NATIVE_UTF8 = 5
+# Invalid native format: not defined by the C API, used to test rejection
+PyUnicode_NATIVE_INVALID = 0
+
 class CAPITest(unittest.TestCase):
 
     @support.cpython_only
@@ -1675,6 +1683,75 @@ def test_pep393_utf8_caching_bug(self):
                 # Check that the second call returns the same result
                 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
 
-
-if __name__ == "__main__":
+    def test_unicode_asnativeformat(self):
+        # Test PyUnicode_AsNativeFormat(): the helper returns (bytes, format)
+        asnativeformat = _testlimitedcapi.unicode_asnativeformat
+        self.assertEqual(asnativeformat("abc"),
+                         (b'abc', PyUnicode_NATIVE_ASCII))
+        self.assertEqual(asnativeformat("latin1:\xe9"),
+                         (b'latin1:\xe9', PyUnicode_NATIVE_UCS1))
+        # UCS2 and UCS4 buffers are compared in the platform's byte order.
+        ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be'
+        self.assertEqual(asnativeformat('ucs2:\u20ac'),
+                         ('ucs2:\u20ac'.encode(ucs2_enc),
+                          PyUnicode_NATIVE_UCS2))
+
+        ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
+        self.assertEqual(asnativeformat('ucs4:\U0010ffff'),
+                         ('ucs4:\U0010ffff'.encode(ucs4_enc),
+                          PyUnicode_NATIVE_UCS4))
+
+    def test_unicode_fromnativeformat(self):
+        # Test PyUnicode_FromNativeFormat(): the helper takes (bytes, format)
+        fromnativeformat = _testlimitedcapi.unicode_fromnativeformat
+        self.assertEqual(fromnativeformat(b'abc', PyUnicode_NATIVE_ASCII),
+                         "abc")
+        self.assertEqual(fromnativeformat(b'latin1:\xe9', PyUnicode_NATIVE_UCS1),
+                         "latin1:\xe9")
+        # UCS2 and UCS4 input is encoded in the platform's byte order.
+        ucs2_enc = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be'
+        self.assertEqual(fromnativeformat('ucs2:\u20ac'.encode(ucs2_enc),
+                                          PyUnicode_NATIVE_UCS2),
+                         'ucs2:\u20ac')
+
+        ucs4_enc = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
+        self.assertEqual(fromnativeformat('ucs4:\U0010ffff'.encode(ucs4_enc),
+                                          PyUnicode_NATIVE_UCS4),
+                         'ucs4:\U0010ffff')
+        # UTF-8 is accepted on import even though CPython never exports it.
+        text = "abc\xe9\U0010ffff"
+        self.assertEqual(fromnativeformat(text.encode('utf8'),
+                                          PyUnicode_NATIVE_UTF8),
+                         text)
+
+        # Empty string: a zero-length buffer is valid in every format
+        for native_format in (
+            PyUnicode_NATIVE_ASCII,
+            PyUnicode_NATIVE_UCS1,
+            PyUnicode_NATIVE_UCS2,
+            PyUnicode_NATIVE_UCS4,
+            PyUnicode_NATIVE_UTF8,
+        ):
+            with self.subTest(native_format=native_format):
+                self.assertEqual(fromnativeformat(b'', native_format),
+                                 '')
+
+        # Invalid format
+        with self.assertRaises(ValueError):
+            fromnativeformat(b'', PyUnicode_NATIVE_INVALID)
+
+        # Invalid size: byte length is not a multiple of the code unit size
+        ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
+        with self.assertRaises(ValueError):
+            fromnativeformat(ucs2[:-1], PyUnicode_NATIVE_UCS2)
+        ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
+        with self.assertRaises(ValueError):
+            fromnativeformat(ucs4[:-1], PyUnicode_NATIVE_UCS4)
+        with self.assertRaises(ValueError):
+            fromnativeformat(ucs4[:-2], PyUnicode_NATIVE_UCS4)
+        with self.assertRaises(ValueError):
+            fromnativeformat(ucs4[:-3], PyUnicode_NATIVE_UCS4)
+
+
+if __name__ == '__main__':
     unittest.main()
diff --git a/Lib/test/test_stable_abi_ctypes.py b/Lib/test/test_stable_abi_ctypes.py
diff --git a/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst b/Misc/NEWS.d/next/C API/2024-05-27-17-46-17.gh-issue-119609.kPIx6S.rst
@@ -0,0 +1,3 @@
+Add :c:func:`PyUnicode_AsNativeFormat` and
+:c:func:`PyUnicode_FromNativeFormat` functions to import and export strings
+in their native format. Patch by Victor Stinner.
diff --git a/Misc/stable_abi.toml b/Misc/stable_abi.toml
@@ -2507,3 +2507,7 @@
     added = '3.13'
 [function.PyEval_GetFrameLocals]
     added = '3.13'
+[function.PyUnicode_AsNativeFormat]
+    added = '3.14'
+[function.PyUnicode_FromNativeFormat]
+    added = '3.14'
diff --git a/Modules/_testlimitedcapi/unicode.c b/Modules/_testlimitedcapi/unicode.c
@@ -1837,6 +1837,35 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
 #undef CHECK_FORMAT_0
 }
 
+
+// Test PyUnicode_AsNativeFormat(): return (copy of the data as bytes, format).
+static PyObject*
+unicode_asnativeformat(PyObject *self, PyObject *obj)
+{
+    Py_ssize_t size;  // byte size of 'data', set by PyUnicode_AsNativeFormat()
+    int native_format;
+    const void *data = PyUnicode_AsNativeFormat(obj, &size, &native_format);
+    if (data == NULL) {
+        return NULL;  // exception already set by PyUnicode_AsNativeFormat()
+    }
+    return Py_BuildValue("y#i", data, size, native_format);
+}
+
+
+// Test PyUnicode_FromNativeFormat(): take (bytes, native_format), return str.
+static PyObject*
+unicode_fromnativeformat(PyObject *self, PyObject *args)
+{
+    const void *data;
+    Py_ssize_t size;  // length of 'data' in bytes, filled in by "y#"
+    int native_format;
+    if (!PyArg_ParseTuple(args, "y#i", &data, &size, &native_format)) {
+        return NULL;
+    }
+    return PyUnicode_FromNativeFormat(data, size, native_format);
+}
+
+
 static PyMethodDef TestMethods[] = {
     {"codec_incrementalencoder", codec_incrementalencoder,       METH_VARARGS},
     {"codec_incrementaldecoder", codec_incrementaldecoder,       METH_VARARGS},
@@ -1924,6 +1953,8 @@ static PyMethodDef TestMethods[] = {
     {"unicode_format",           unicode_format,                 METH_VARARGS},
     {"unicode_contains",         unicode_contains,               METH_VARARGS},
     {"unicode_isidentifier",     unicode_isidentifier,           METH_O},
+    {"unicode_asnativeformat",   unicode_asnativeformat,         METH_O},
+    {"unicode_fromnativeformat", unicode_fromnativeformat,       METH_VARARGS},
     {NULL},
 };
 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
@@ -2094,6 +2094,89 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
     return res;
 }
 
+const void*
+PyUnicode_AsNativeFormat(PyObject *unicode,
+                         Py_ssize_t *size, int *native_format)
+{   // Export the internal buffer of 'unicode'; no copy is made.
+    if (!PyUnicode_Check(unicode)) {
+        *size = 0;  // reset both outputs on the error path
+        *native_format = 0;
+        PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode);
+        return NULL;
+    }
+    // Length in characters; scaled to a byte count for each format below.
+    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
+
+    if (PyUnicode_IS_ASCII(unicode)) {
+        *native_format = PyUnicode_NATIVE_ASCII;  // reported apart from UCS1
+        *size = len;
+        return PyUnicode_1BYTE_DATA(unicode);
+    }
+    int kind = PyUnicode_KIND(unicode);
+    // Non-ASCII string: the native format follows the storage kind.
+    switch (kind)
+    {
+    case PyUnicode_1BYTE_KIND:
+        *native_format = PyUnicode_NATIVE_UCS1;
+        *size = len;
+        return PyUnicode_1BYTE_DATA(unicode);
+
+    case PyUnicode_2BYTE_KIND:
+        *native_format = PyUnicode_NATIVE_UCS2;
+        *size = len * 2;  // 2 bytes per character
+        return PyUnicode_2BYTE_DATA(unicode);
+
+    default:
+        assert(kind == PyUnicode_4BYTE_KIND);  // the only kind expected here
+        *native_format = PyUnicode_NATIVE_UCS4;
+        *size = len * 4;  // 4 bytes per character
+        return PyUnicode_4BYTE_DATA(unicode);
+    }
+}
+
+PyObject*
+PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size,
+                           int native_format)
+{   // Build a new str from 'size' bytes of 'data' read as 'native_format'.
+    if (size < 0) {
+        PyErr_SetString(PyExc_ValueError, "Negative size");
+        return NULL;
+    }
+    // 'size' is a byte count: UCS2 and UCS4 turn it into a character count.
+    switch (native_format)
+    {
+    case PyUnicode_NATIVE_ASCII:  // errors=NULL: default (strict) handling
+        return PyUnicode_DecodeASCII((const char*)data, size, NULL);
+
+    case PyUnicode_NATIVE_UCS1:
+        return _PyUnicode_FromUCS1(data, size);
+    // A trailing partial code unit is rejected instead of being dropped.
+    case PyUnicode_NATIVE_UCS2:
+        if (size % 2) {
+            PyErr_Format(PyExc_ValueError, "size must be a multiple of 2: %zd",
+                         size);
+            return NULL;
+        }
+        return _PyUnicode_FromUCS2(data, size / 2);
+
+    case PyUnicode_NATIVE_UCS4:
+        if (size % 4) {
+            PyErr_Format(PyExc_ValueError, "size must be a multiple of 4: %zd",
+                         size);
+            return NULL;
+        }
+        return _PyUnicode_FromUCS4(data, size / 4);
+    // UTF-8 is import-only: PyUnicode_AsNativeFormat() never returns it.
+    case PyUnicode_NATIVE_UTF8:
+        return PyUnicode_DecodeUTF8((const char*)data, size, NULL);
+
+    default:
+        PyErr_Format(PyExc_ValueError, "unknown native format %i",
+                     native_format);
+        return NULL;
+    }
+}
+
 PyObject*
 PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
 {
diff --git a/PC/python3dll.c b/PC/python3dll.c

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	+Add :c:func:`PyUnicode_AsNativeFormat` and
	`2`	+:c:func:`PyUnicode_FromNativeFormat` functions to import and export strings
	`3`	`+in their native format. Patch by Victor Stinner.`