diff --git a/Doc/library/stdtypes.rst b/Doc/library/stdtypes.rst index f0b4b09ff10dce..a5a0adc93740c3 100644 --- a/Doc/library/stdtypes.rst +++ b/Doc/library/stdtypes.rst @@ -1612,6 +1612,8 @@ category. | | :meth:`str.isspace` | :meth:`bytes.isspace` | | +-------------------------------------------+---------------------------------------------------+ | | :meth:`str.isprintable` | | +| +-------------------------------------------+---------------------------------------------------+ +| | :meth:`str.contains_surrogate` | | +--------------------------+-------------------------------------------+---------------------------------------------------+ | Case Manipulation | :meth:`str.lower` | :meth:`bytes.lower` | | +-------------------------------------------+---------------------------------------------------+ @@ -2078,6 +2080,15 @@ expression support in the :mod:`re` module). False +.. method:: str.contains_surrogate() + + Return ``True`` if the string contains any surrogate code points, + ``False`` otherwise. + + >>> 'notasurrogate'.contains_surrogate() + False + >>> '\ud83d\udc0d'.contains_surrogate() + True .. 
_meth-str-join: diff --git a/Lib/collections/__init__.py b/Lib/collections/__init__.py index d2ddc1cd9ec2ea..b1beec5fef0455 100644 --- a/Lib/collections/__init__.py +++ b/Lib/collections/__init__.py @@ -1533,6 +1533,9 @@ def istitle(self): def isupper(self): return self.data.isupper() + def contains_surrogate(self): + return self.data.contains_surrogate() + def join(self, seq): return self.data.join(seq) diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py index d6a7bd0da59910..180edb928e59a2 100644 --- a/Lib/test/test_str.py +++ b/Lib/test/test_str.py @@ -862,17 +862,26 @@ def test_isprintable_invariant(self): category[0] not in ('C', 'Z') or char == ' ') + def test_contains_surrogates(self): + self.assertFalse("hello".contains_surrogate()) + self.assertFalse("".contains_surrogate()) + self.assertTrue("\udc80".contains_surrogate()) + self.assertTrue("\ud800\udfff".contains_surrogate()) + self.assertTrue("\ud83d\udc0d".contains_surrogate()) + def test_surrogates(self): for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800', 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): self.assertTrue(s.islower()) self.assertFalse(s.isupper()) self.assertFalse(s.istitle()) + self.assertTrue(s.contains_surrogate()) for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800', 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'): self.assertFalse(s.islower()) self.assertTrue(s.isupper()) self.assertTrue(s.istitle()) + self.assertTrue(s.contains_surrogate()) for meth_name in ('islower', 'isupper', 'istitle'): meth = getattr(str, meth_name) diff --git a/Misc/NEWS.d/next/Library/2025-06-08-14-37-49.gh-issue-69456.KOAy89.rst b/Misc/NEWS.d/next/Library/2025-06-08-14-37-49.gh-issue-69456.KOAy89.rst new file mode 100644 index 00000000000000..11e170b3f78bc1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-06-08-14-37-49.gh-issue-69456.KOAy89.rst @@ -0,0 +1 @@ +Add :meth:`str.contains_surrogate` for surrogate detection. 
diff --git a/Objects/clinic/unicodeobject.c.h b/Objects/clinic/unicodeobject.c.h
index 1819fbaea220a3..e09e3462e5d1c2 100644
--- a/Objects/clinic/unicodeobject.c.h
+++ b/Objects/clinic/unicodeobject.c.h
@@ -725,6 +725,24 @@ unicode_isprintable(PyObject *self, PyObject *Py_UNUSED(ignored))
     return unicode_isprintable_impl(self);
 }
 
+PyDoc_STRVAR(unicode_contains_surrogate__doc__,
+"contains_surrogate($self, /)\n"
+"--\n"
+"\n"
+"Return True if the string contains any surrogate code points, False otherwise.");
+
+#define UNICODE_CONTAINS_SURROGATE_METHODDEF    \
+    {"contains_surrogate", (PyCFunction)unicode_contains_surrogate, METH_NOARGS, unicode_contains_surrogate__doc__},
+
+static PyObject *
+unicode_contains_surrogate_impl(PyObject *self);
+
+static PyObject *
+unicode_contains_surrogate(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    return unicode_contains_surrogate_impl(self);
+}
+
 PyDoc_STRVAR(unicode_join__doc__,
 "join($self, iterable, /)\n"
 "--\n"
@@ -1908,4 +1926,4 @@ unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
 exit:
     return return_value;
 }
-/*[clinic end generated code: output=238917fe66120bde input=a9049054013a1b77]*/
+/*[clinic end generated code: output=1ff52eb2d684cb1e input=a9049054013a1b77]*/
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 5c2308a012142a..58a36a0f9932d0 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12492,6 +12492,37 @@ unicode_isprintable_impl(PyObject *self)
     Py_RETURN_TRUE;
 }
 
+/*[clinic input]
+str.contains_surrogate as unicode_contains_surrogate
+
+Return True if the string contains any surrogate code points, False otherwise.
+
+[clinic start generated code]*/
+
+static PyObject *
+unicode_contains_surrogate_impl(PyObject *self)
+/*[clinic end generated code: output=ec75cbb5265bd886 input=5853bb9f17fc5255]*/
+{
+    /* Surrogates (U+D800..U+DFFF) cannot be stored in a 1-byte string,
+       so ASCII and Latin-1 strings need no scan at all. */
+    int kind = PyUnicode_KIND(self);
+    if (kind == PyUnicode_1BYTE_KIND) {
+        Py_RETURN_FALSE;
+    }
+
+    /* Look up the buffer once; PyUnicode_READ_CHAR() would re-derive
+       kind and data on every iteration. */
+    const void *data = PyUnicode_DATA(self);
+    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    for (Py_ssize_t i = 0; i < len; i++) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        if (Py_UNICODE_IS_SURROGATE(ch)) {
+            Py_RETURN_TRUE;
+        }
+    }
+    Py_RETURN_FALSE;
+}
+
 /*[clinic input]
 str.join as unicode_join
 
@@ -14489,6 +14520,7 @@ static PyMethodDef unicode_methods[] = {
     UNICODE_ISALNUM_METHODDEF
     UNICODE_ISIDENTIFIER_METHODDEF
     UNICODE_ISPRINTABLE_METHODDEF
+    UNICODE_CONTAINS_SURROGATE_METHODDEF
     UNICODE_ZFILL_METHODDEF
     {"format", _PyCFunction_CAST(do_string_format), METH_VARARGS | METH_KEYWORDS, format__doc__},
     {"format_map", do_string_format_map, METH_O, format_map__doc__},