Skip to content

gh-80480: array: Add 'w' typecode. #105242

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Doc/faq/programming.rst
Original file line number Diff line number Diff line change
Expand Up @@ -924,12 +924,12 @@ module::
'Hello, there!'

>>> import array
>>> a = array.array('u', s)
>>> a = array.array('w', s)
>>> print(a)
array('u', 'Hello, world')
array('w', 'Hello, world')
>>> a[0] = 'y'
>>> print(a)
array('u', 'yello, world')
array('w', 'yello, world')
>>> a.tounicode()
'yello, world'

Expand Down
18 changes: 11 additions & 7 deletions Doc/library/array.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ defined:
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'u'`` | wchar_t | Unicode character | 2 | \(1) |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'w'`` | Py_UCS4 | Unicode character | 4 | |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'h'`` | signed short | int | 2 | |
+-----------+--------------------+-------------------+-----------------------+-------+
| ``'H'`` | unsigned short | int | 2 | |
Expand Down Expand Up @@ -56,6 +58,7 @@ Notes:
``Py_UNICODE`` is an alias of ``wchar_t`` since Python 3.3.

.. deprecated-removed:: 3.3 4.0
Please migrate to the ``'w'`` type code.


The actual representation of values is determined by the machine architecture
Expand Down Expand Up @@ -174,9 +177,9 @@ The module defines the following type:

.. method:: fromunicode(s)

Extends this array with data from the given unicode string. The array must
be a type ``'u'`` array; otherwise a :exc:`ValueError` is raised. Use
``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
Extends this array with data from the given unicode string.
The array must have type code ``'u'`` or ``'w'``; otherwise a :exc:`ValueError` is raised.
Use ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
array of some other type.


Expand Down Expand Up @@ -236,21 +239,22 @@ The module defines the following type:

.. method:: tounicode()

Convert the array to a unicode string. The array must be a type ``'u'`` array;
Convert the array to a unicode string. The array must have type code ``'u'`` or ``'w'``;
otherwise a :exc:`ValueError` is raised. Use ``array.tobytes().decode(enc)`` to
obtain a unicode string from an array of some other type.


When an array object is printed or converted to a string, it is represented as
``array(typecode, initializer)``. The *initializer* is omitted if the array is
empty, otherwise it is a string if the *typecode* is ``'u'``, otherwise it is a
list of numbers. The string is guaranteed to be able to be converted back to an
empty, otherwise it is a string if the *typecode* is ``'u'`` or ``'w'``,
otherwise it is a list of numbers.
The string is guaranteed to be able to be converted back to an
array with the same type and value using :func:`eval`, so long as the
:class:`~array.array` class has been imported using ``from array import array``.
Examples::

array('l')
array('u', 'hello \u2641')
array('w', 'hello \u2641')
array('l', [1, 2, 3, 4, 5])
array('d', [1.0, 2.0, 3.14])

Expand Down
7 changes: 7 additions & 0 deletions Doc/whatsnew/3.13.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,13 @@ New Modules
Improved Modules
================

array
-----

* Add a ``'w'`` type code that can be used for Unicode strings.
It can be used instead of the ``'u'`` type code, which is deprecated.
(Contributed by Inada Naoki in :gh:`80480`.)

io
--

Expand Down
49 changes: 28 additions & 21 deletions Lib/test/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class ArraySubclassWithKwargs(array.array):
def __init__(self, typecode, newarg=None):
array.array.__init__(self)

typecodes = 'ubBhHiIlLfdqQ'
typecodes = 'uwbBhHiIlLfdqQ'

class MiscTest(unittest.TestCase):

Expand Down Expand Up @@ -186,11 +186,12 @@ def test_unicode(self):
)
for testcase in testcases:
mformat_code, encoding = testcase
a = array.array('u', teststr)
b = array_reconstructor(
array.array, 'u', mformat_code, teststr.encode(encoding))
self.assertEqual(a, b,
msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))
for c in 'uw':
a = array.array(c, teststr)
b = array_reconstructor(
array.array, c, mformat_code, teststr.encode(encoding))
self.assertEqual(a, b,
msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))


class BaseTest:
Expand Down Expand Up @@ -234,7 +235,7 @@ def test_buffer_info(self):
self.assertEqual(bi[1], len(a))

def test_byteswap(self):
if self.typecode == 'u':
if self.typecode in ('u', 'w'):
example = '\U00100100'
else:
example = self.example
Expand Down Expand Up @@ -1079,7 +1080,7 @@ def test_buffer(self):
self.assertEqual(m.tobytes(), expected)
self.assertRaises(BufferError, a.frombytes, a.tobytes())
self.assertEqual(m.tobytes(), expected)
if self.typecode == 'u':
if self.typecode in ('u', 'w'):
self.assertRaises(BufferError, a.fromunicode, a.tounicode())
self.assertEqual(m.tobytes(), expected)
self.assertRaises(BufferError, operator.imul, a, 2)
Expand Down Expand Up @@ -1135,16 +1136,17 @@ def test_sizeof_without_buffer(self):
support.check_sizeof(self, a, basesize)

def test_initialize_with_unicode(self):
if self.typecode != 'u':
if self.typecode not in ('u', 'w'):
with self.assertRaises(TypeError) as cm:
a = array.array(self.typecode, 'foo')
self.assertIn("cannot use a str", str(cm.exception))
with self.assertRaises(TypeError) as cm:
a = array.array(self.typecode, array.array('u', 'foo'))
a = array.array(self.typecode, array.array('w', 'foo'))
self.assertIn("cannot use a unicode array", str(cm.exception))
else:
a = array.array(self.typecode, "foo")
a = array.array(self.typecode, array.array('u', 'foo'))
a = array.array(self.typecode, array.array('w', 'foo'))

@support.cpython_only
def test_obsolete_write_lock(self):
Expand All @@ -1171,40 +1173,45 @@ class UnicodeTest(StringTest, unittest.TestCase):
smallerexample = '\x01\u263a\x00\ufefe'
biggerexample = '\x01\u263a\x01\ufeff'
outside = str('\x33')
minitemsize = 2
minitemsize = sizeof_wchar

def test_unicode(self):
self.assertRaises(TypeError, array.array, 'b', 'foo')

a = array.array('u', '\xa0\xc2\u1234')
a = array.array(self.typecode, '\xa0\xc2\u1234')
a.fromunicode(' ')
a.fromunicode('')
a.fromunicode('')
a.fromunicode('\x11abc\xff\u1234')
s = a.tounicode()
self.assertEqual(s, '\xa0\xc2\u1234 \x11abc\xff\u1234')
self.assertEqual(a.itemsize, sizeof_wchar)
self.assertEqual(a.itemsize, self.minitemsize)

s = '\x00="\'a\\b\x80\xff\u0000\u0001\u1234'
a = array.array('u', s)
a = array.array(self.typecode, s)
self.assertEqual(
repr(a),
"array('u', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
f"array('{self.typecode}', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")

self.assertRaises(TypeError, a.fromunicode)

def test_issue17223(self):
# this used to crash
if sizeof_wchar == 4:
# U+FFFFFFFF is an invalid code point in Unicode 6.0
invalid_str = b'\xff\xff\xff\xff'
else:
if self.typecode == 'u' and sizeof_wchar == 2:
# PyUnicode_FromUnicode() cannot fail with 16-bit wchar_t
self.skipTest("specific to 32-bit wchar_t")
a = array.array('u', invalid_str)

# this used to crash
# U+FFFFFFFF is an invalid code point in Unicode 6.0
invalid_str = b'\xff\xff\xff\xff'

a = array.array(self.typecode, invalid_str)
self.assertRaises(ValueError, a.tounicode)
self.assertRaises(ValueError, str, a)

class UCS4Test(UnicodeTest):
typecode = 'w'
minitemsize = 4

class NumberTest(BaseTest):

def test_extslice(self):
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -955,7 +955,7 @@ def test_float_write(self):

def test_char_write(self):
import array, string
a = array.array('u', string.ascii_letters)
a = array.array('w', string.ascii_letters)

with TemporaryFile("w+", encoding="utf-8", newline='') as fileobj:
writer = csv.writer(fileobj, dialect="excel")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:mod:`array`: Add ``'w'`` typecode that represents ``Py_UCS4``.
Loading