Skip to content

Commit f6b67a3

Browse files
committed
gh-119609: Add PyUnicode_AsNativeFormat() function
Add PyUnicode_AsNativeFormat() and PyUnicode_FromNativeFormat() functions to the C API.
1 parent 5482a93 commit f6b67a3

File tree

11 files changed

+281
-2
lines changed

11 files changed

+281
-2
lines changed

Doc/c-api/unicode.rst

+47
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,53 @@ APIs:
341341
.. versionadded:: 3.3
342342
343343
344+
.. c:function:: const void* PyUnicode_AsNativeFormat(PyObject *unicode, Py_ssize_t *size, int *native_format)
345+
346+
Get the contents of a string in its native format.
347+
348+
* Return the contents, set *\*size* (the size of the contents in bytes) and *\*native_format* on success.
349+
* Set an exception and return ``NULL`` on error.
350+
351+
The contents are valid as long as *unicode* is valid.
352+
353+
*unicode*, *size* and *native_format* must not be ``NULL``.
354+
355+
*\*native_format* is set to one of these native formats:
356+
357+
.. c:namespace:: NULL
358+
359+
======================================== ===== ============================
360+
Constant Identifier Value Description
361+
======================================== ===== ============================
362+
.. c:macro:: PyUnicode_NATIVE_ASCII ``1`` ASCII string (``Py_UCS1*``)
363+
.. c:macro:: PyUnicode_NATIVE_UCS1 ``2`` UCS-1 string (``Py_UCS1*``)
364+
.. c:macro:: PyUnicode_NATIVE_UCS2 ``3`` UCS-2 string (``Py_UCS2*``)
365+
.. c:macro:: PyUnicode_NATIVE_UCS4 ``4`` UCS-4 string (``Py_UCS4*``)
366+
.. c:macro:: PyUnicode_NATIVE_UTF8 ``5`` UTF-8 string (``char*``)
367+
======================================== ===== ============================
368+
369+
.. impl-detail::
370+
In CPython, the :c:macro:`PyUnicode_NATIVE_UTF8` format is not used by
371+
:c:func:`PyUnicode_AsNativeFormat`, but it's accepted by
372+
:c:func:`PyUnicode_FromNativeFormat`.
373+
374+
.. versionadded:: 3.14
375+
376+
377+
.. c:function:: PyObject* PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size, int native_format)
378+
379+
Create a string object from a native format string.
380+
381+
* Return a reference to a new string object on success.
382+
* Set an exception and return ``NULL`` on error.
383+
384+
*data* must not be ``NULL``. *size* is the size of *data* in bytes and must be positive or zero.
385+
386+
See :c:func:`PyUnicode_AsNativeFormat` for the available native formats.
387+
388+
.. versionadded:: 3.14
389+
390+
344391
.. c:function:: PyObject* PyUnicode_FromKindAndData(int kind, const void *buffer, \
345392
Py_ssize_t size)
346393

Doc/data/stable_abi.dat

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Doc/whatsnew/3.14.rst

+6
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,12 @@ C API Changes
217217
New Features
218218
------------
219219

220+
* Add :c:func:`PyUnicode_AsNativeFormat` and
221+
:c:func:`PyUnicode_FromNativeFormat` functions to import and export strings
222+
in their native format.
223+
(Contributed by Victor Stinner in :gh:`119609`.)
224+
225+
220226
Porting to Python 3.14
221227
----------------------
222228

Include/unicodeobject.h

+22
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,28 @@ PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
248248
const char *u /* UTF-8 encoded string */
249249
);
250250

251+
#define PyUnicode_NATIVE_ASCII 1
252+
#define PyUnicode_NATIVE_UCS1 2
253+
#define PyUnicode_NATIVE_UCS2 3
254+
#define PyUnicode_NATIVE_UCS4 4
255+
#define PyUnicode_NATIVE_UTF8 5
256+
257+
// Get the content of a string in its native format.
258+
// - Return the content, set '*size' and '*native_format' on success.
259+
// - Set an exception and return NULL on error.
260+
PyAPI_FUNC(const void*) PyUnicode_AsNativeFormat(
261+
PyObject *unicode,
262+
Py_ssize_t *size,
263+
int *native_format);
264+
265+
// Create a string object from a native format string.
266+
// - Return a reference to a new string object on success.
267+
// - Set an exception and return NULL on error.
268+
PyAPI_FUNC(PyObject*) PyUnicode_FromNativeFormat(
269+
const void *data,
270+
Py_ssize_t size,
271+
int native_format);
272+
251273
/* --- wchar_t support for platforms which support it --------------------- */
252274

253275
#ifdef HAVE_WCHAR_H

Lib/test/test_capi/test_unicode.py

+79-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,14 @@ class Str(str):
2424
pass
2525

2626

27+
PyUnicode_NATIVE_ASCII = 1
28+
PyUnicode_NATIVE_UCS1 = 2
29+
PyUnicode_NATIVE_UCS2 = 3
30+
PyUnicode_NATIVE_UCS4 = 4
31+
PyUnicode_NATIVE_UTF8 = 5
32+
# Invalid native format
33+
PyUnicode_NATIVE_INVALID = 0
34+
2735
class CAPITest(unittest.TestCase):
2836

2937
@support.cpython_only
@@ -1675,6 +1683,75 @@ def test_pep393_utf8_caching_bug(self):
16751683
# Check that the second call returns the same result
16761684
self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
16771685

1678-
1679-
if __name__ == "__main__":
1686+
def test_unicode_asnativeformat(self):
    # Test PyUnicode_AsNativeFormat(): each string must be exported as its
    # raw bytes together with the matching PyUnicode_NATIVE_* constant.
    export = _testlimitedcapi.unicode_asnativeformat

    # UCS-2 and UCS-4 buffers use the byte order of the machine.
    little_endian = (sys.byteorder == 'little')
    utf16 = 'utf-16le' if little_endian else 'utf-16be'
    utf32 = 'utf-32le' if little_endian else 'utf-32be'

    # (input string, expected bytes, expected native format)
    expectations = (
        ("abc", b'abc', PyUnicode_NATIVE_ASCII),
        ("latin1:\xe9", b'latin1:\xe9', PyUnicode_NATIVE_UCS1),
        ('ucs2:\u20ac', 'ucs2:\u20ac'.encode(utf16),
         PyUnicode_NATIVE_UCS2),
        ('ucs4:\U0010ffff', 'ucs4:\U0010ffff'.encode(utf32),
         PyUnicode_NATIVE_UCS4),
    )
    for text, raw, native_format in expectations:
        self.assertEqual(export(text), (raw, native_format))
1703+
1704+
def test_unicode_fromnativeformat(self):
    # Test PyUnicode_FromNativeFormat(): valid buffers for every native
    # format, empty buffers, an unknown format and truncated buffers.
    build = _testlimitedcapi.unicode_fromnativeformat

    # UCS-2 and UCS-4 buffers use the byte order of the machine.
    little_endian = (sys.byteorder == 'little')
    ucs2_enc = 'utf-16le' if little_endian else 'utf-16be'
    ucs4_enc = 'utf-32le' if little_endian else 'utf-32be'
    text = "abc\xe9\U0010ffff"

    # (input bytes, native format, expected string)
    conversions = (
        (b'abc', PyUnicode_NATIVE_ASCII, "abc"),
        (b'latin1:\xe9', PyUnicode_NATIVE_UCS1, "latin1:\xe9"),
        ('ucs2:\u20ac'.encode(ucs2_enc), PyUnicode_NATIVE_UCS2,
         'ucs2:\u20ac'),
        ('ucs4:\U0010ffff'.encode(ucs4_enc), PyUnicode_NATIVE_UCS4,
         'ucs4:\U0010ffff'),
        (text.encode('utf8'), PyUnicode_NATIVE_UTF8, text),
    )
    for raw, native_format, expected in conversions:
        self.assertEqual(build(raw, native_format), expected)

    # Empty string
    every_format = (
        PyUnicode_NATIVE_ASCII,
        PyUnicode_NATIVE_UCS1,
        PyUnicode_NATIVE_UCS2,
        PyUnicode_NATIVE_UCS4,
        PyUnicode_NATIVE_UTF8,
    )
    for native_format in every_format:
        with self.subTest(native_format=native_format):
            self.assertEqual(build(b'', native_format), '')

    # Invalid format
    self.assertRaises(ValueError, build, b'', PyUnicode_NATIVE_INVALID)

    # Invalid size: the buffer length is not a multiple of the character
    # width of the requested format.
    ucs2 = 'ucs2:\u20ac'.encode(ucs2_enc)
    self.assertRaises(ValueError, build, ucs2[:-1], PyUnicode_NATIVE_UCS2)
    ucs4 = 'ucs4:\U0010ffff'.encode(ucs4_enc)
    for missing in (1, 2, 3):
        self.assertRaises(ValueError,
                          build, ucs4[:-missing], PyUnicode_NATIVE_UCS4)
1754+
1755+
1756+
if __name__ == '__main__':
16801757
unittest.main()

Lib/test/test_stable_abi_ctypes.py

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Add :c:func:`PyUnicode_AsNativeFormat` and
2+
:c:func:`PyUnicode_FromNativeFormat` functions to import and export strings
3+
in their native format. Patch by Victor Stinner.

Misc/stable_abi.toml

+4
Original file line numberDiff line numberDiff line change
@@ -2507,3 +2507,7 @@
25072507
added = '3.13'
25082508
[function.PyEval_GetFrameLocals]
25092509
added = '3.13'
2510+
[function.PyUnicode_AsNativeFormat]
2511+
added = '3.14'
2512+
[function.PyUnicode_FromNativeFormat]
2513+
added = '3.14'

Modules/_testlimitedcapi/unicode.c

+31
Original file line numberDiff line numberDiff line change
@@ -1837,6 +1837,35 @@ test_string_from_format(PyObject *self, PyObject *Py_UNUSED(ignored))
18371837
#undef CHECK_FORMAT_0
18381838
}
18391839

1840+
1841+
// Test PyUnicode_AsNativeFormat()
1842+
static PyObject*
1843+
unicode_asnativeformat(PyObject *self, PyObject *obj)
1844+
{
1845+
Py_ssize_t size;
1846+
int native_format;
1847+
const void *data = PyUnicode_AsNativeFormat(obj, &size, &native_format);
1848+
if (data == NULL) {
1849+
return NULL;
1850+
}
1851+
return Py_BuildValue("y#i", data, size, native_format);
1852+
}
1853+
1854+
1855+
// Test PyUnicode_FromNativeFormat()
1856+
static PyObject*
1857+
unicode_fromnativeformat(PyObject *self, PyObject *args)
1858+
{
1859+
const void *data;
1860+
Py_ssize_t size;
1861+
int native_format;
1862+
if (!PyArg_ParseTuple(args, "y#i", &data, &size, &native_format)) {
1863+
return NULL;
1864+
}
1865+
return PyUnicode_FromNativeFormat(data, size, native_format);
1866+
}
1867+
1868+
18401869
static PyMethodDef TestMethods[] = {
18411870
{"codec_incrementalencoder", codec_incrementalencoder, METH_VARARGS},
18421871
{"codec_incrementaldecoder", codec_incrementaldecoder, METH_VARARGS},
@@ -1924,6 +1953,8 @@ static PyMethodDef TestMethods[] = {
19241953
{"unicode_format", unicode_format, METH_VARARGS},
19251954
{"unicode_contains", unicode_contains, METH_VARARGS},
19261955
{"unicode_isidentifier", unicode_isidentifier, METH_O},
1956+
{"unicode_asnativeformat", unicode_asnativeformat, METH_O},
1957+
{"unicode_fromnativeformat", unicode_fromnativeformat, METH_VARARGS},
19271958
{NULL},
19281959
};
19291960

Objects/unicodeobject.c

+83
Original file line numberDiff line numberDiff line change
@@ -2094,6 +2094,89 @@ _PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
20942094
return res;
20952095
}
20962096

2097+
const void*
2098+
PyUnicode_AsNativeFormat(PyObject *unicode,
2099+
Py_ssize_t *size, int *native_format)
2100+
{
2101+
if (!PyUnicode_Check(unicode)) {
2102+
*size = 0;
2103+
*native_format = 0;
2104+
PyErr_Format(PyExc_TypeError, "must be str, not %T", unicode);
2105+
return NULL;
2106+
}
2107+
2108+
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
2109+
2110+
if (PyUnicode_IS_ASCII(unicode)) {
2111+
*native_format = PyUnicode_NATIVE_ASCII;
2112+
*size = len;
2113+
return PyUnicode_1BYTE_DATA(unicode);
2114+
}
2115+
int kind = PyUnicode_KIND(unicode);
2116+
2117+
switch (kind)
2118+
{
2119+
case PyUnicode_1BYTE_KIND:
2120+
*native_format = PyUnicode_NATIVE_UCS1;
2121+
*size = len;
2122+
return PyUnicode_1BYTE_DATA(unicode);
2123+
2124+
case PyUnicode_2BYTE_KIND:
2125+
*native_format = PyUnicode_NATIVE_UCS2;
2126+
*size = len * 2;
2127+
return PyUnicode_2BYTE_DATA(unicode);
2128+
2129+
default:
2130+
assert(kind == PyUnicode_4BYTE_KIND);
2131+
*native_format = PyUnicode_NATIVE_UCS4;
2132+
*size = len * 4;
2133+
return PyUnicode_4BYTE_DATA(unicode);
2134+
}
2135+
}
2136+
2137+
PyObject*
2138+
PyUnicode_FromNativeFormat(const void *data, Py_ssize_t size,
2139+
int native_format)
2140+
{
2141+
if (size < 0) {
2142+
PyErr_SetString(PyExc_ValueError, "Negative size");
2143+
return NULL;
2144+
}
2145+
2146+
switch (native_format)
2147+
{
2148+
case PyUnicode_NATIVE_ASCII:
2149+
return PyUnicode_DecodeASCII((const char*)data, size, NULL);
2150+
2151+
case PyUnicode_NATIVE_UCS1:
2152+
return _PyUnicode_FromUCS1(data, size);
2153+
2154+
case PyUnicode_NATIVE_UCS2:
2155+
if (size % 2) {
2156+
PyErr_Format(PyExc_ValueError, "size must be a multiple of 2: %zd",
2157+
size);
2158+
return NULL;
2159+
}
2160+
return _PyUnicode_FromUCS2(data, size / 2);
2161+
2162+
case PyUnicode_NATIVE_UCS4:
2163+
if (size % 4) {
2164+
PyErr_Format(PyExc_ValueError, "size must be a multiple of 4: %zd",
2165+
size);
2166+
return NULL;
2167+
}
2168+
return _PyUnicode_FromUCS4(data, size / 4);
2169+
2170+
case PyUnicode_NATIVE_UTF8:
2171+
return PyUnicode_DecodeUTF8((const char*)data, size, NULL);
2172+
2173+
default:
2174+
PyErr_Format(PyExc_ValueError, "unknown native format %i",
2175+
native_format);
2176+
return NULL;
2177+
}
2178+
}
2179+
20972180
PyObject*
20982181
PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
20992182
{

PC/python3dll.c

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)