Skip to content

Commit 1237fb6

Browse files
authored
gh-80480: array: Add 'w' typecode. (#105242)
1 parent 5a5ed7a commit 1237fb6

File tree

7 files changed

+158
-58
lines changed

7 files changed

+158
-58
lines changed

Doc/faq/programming.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -924,12 +924,12 @@ module::
924924
'Hello, there!'
925925

926926
>>> import array
927-
>>> a = array.array('u', s)
927+
>>> a = array.array('w', s)
928928
>>> print(a)
929-
array('u', 'Hello, world')
929+
array('w', 'Hello, world')
930930
>>> a[0] = 'y'
931931
>>> print(a)
932-
array('u', 'yello, world')
932+
array('w', 'yello, world')
933933
>>> a.tounicode()
934934
'yello, world'
935935

Doc/library/array.rst

+11-7
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ defined:
2424
+-----------+--------------------+-------------------+-----------------------+-------+
2525
| ``'u'`` | wchar_t | Unicode character | 2 | \(1) |
2626
+-----------+--------------------+-------------------+-----------------------+-------+
27+
| ``'w'`` | Py_UCS4 | Unicode character | 4 | |
28+
+-----------+--------------------+-------------------+-----------------------+-------+
2729
| ``'h'`` | signed short | int | 2 | |
2830
+-----------+--------------------+-------------------+-----------------------+-------+
2931
| ``'H'`` | unsigned short | int | 2 | |
@@ -56,6 +58,7 @@ Notes:
5658
``Py_UNICODE`` is alias of ``wchar_t`` since Python 3.3.
5759

5860
.. deprecated-removed:: 3.3 4.0
61+
Please migrate to the ``'w'`` typecode.
5962

6063

6164
The actual representation of values is determined by the machine architecture
@@ -174,9 +177,9 @@ The module defines the following type:
174177

175178
.. method:: fromunicode(s)
176179

177-
Extends this array with data from the given unicode string. The array must
178-
be a type ``'u'`` array; otherwise a :exc:`ValueError` is raised. Use
179-
``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
180+
Extends this array with data from the given unicode string.
181+
The array must have type code ``'u'`` or ``'w'``; otherwise a :exc:`ValueError` is raised.
182+
Use ``array.frombytes(unicodestring.encode(enc))`` to append Unicode data to an
180183
array of some other type.
181184

182185

@@ -236,21 +239,22 @@ The module defines the following type:
236239

237240
.. method:: tounicode()
238241

239-
Convert the array to a unicode string. The array must be a type ``'u'`` array;
242+
Convert the array to a unicode string. The array must have type code ``'u'`` or ``'w'``;
240243
otherwise a :exc:`ValueError` is raised. Use ``array.tobytes().decode(enc)`` to
241244
obtain a unicode string from an array of some other type.
242245

243246

244247
When an array object is printed or converted to a string, it is represented as
245248
``array(typecode, initializer)``. The *initializer* is omitted if the array is
246-
empty, otherwise it is a string if the *typecode* is ``'u'``, otherwise it is a
247-
list of numbers. The string is guaranteed to be able to be converted back to an
249+
empty, otherwise it is a string if the *typecode* is ``'u'`` or ``'w'``,
250+
otherwise it is a list of numbers.
251+
The string is guaranteed to be able to be converted back to an
248252
array with the same type and value using :func:`eval`, so long as the
249253
:class:`~array.array` class has been imported using ``from array import array``.
250254
Examples::
251255

252256
array('l')
253-
array('u', 'hello \u2641')
257+
array('w', 'hello \u2641')
254258
array('l', [1, 2, 3, 4, 5])
255259
array('d', [1.0, 2.0, 3.14])
256260

Doc/whatsnew/3.13.rst

+7
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,13 @@ New Modules
8787
Improved Modules
8888
================
8989

90+
array
91+
-----
92+
93+
* Add the ``'w'`` type code, which can be used for Unicode strings.
94+
It can be used instead of the ``'u'`` type code, which is deprecated.
95+
(Contributed by Inada Naoki in :gh:`80480`.)
96+
9097
io
9198
--
9299

Lib/test/test_array.py

+28-21
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class ArraySubclassWithKwargs(array.array):
2727
def __init__(self, typecode, newarg=None):
2828
array.array.__init__(self)
2929

30-
typecodes = 'ubBhHiIlLfdqQ'
30+
typecodes = 'uwbBhHiIlLfdqQ'
3131

3232
class MiscTest(unittest.TestCase):
3333

@@ -186,11 +186,12 @@ def test_unicode(self):
186186
)
187187
for testcase in testcases:
188188
mformat_code, encoding = testcase
189-
a = array.array('u', teststr)
190-
b = array_reconstructor(
191-
array.array, 'u', mformat_code, teststr.encode(encoding))
192-
self.assertEqual(a, b,
193-
msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))
189+
for c in 'uw':
190+
a = array.array(c, teststr)
191+
b = array_reconstructor(
192+
array.array, c, mformat_code, teststr.encode(encoding))
193+
self.assertEqual(a, b,
194+
msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase))
194195

195196

196197
class BaseTest:
@@ -234,7 +235,7 @@ def test_buffer_info(self):
234235
self.assertEqual(bi[1], len(a))
235236

236237
def test_byteswap(self):
237-
if self.typecode == 'u':
238+
if self.typecode in ('u', 'w'):
238239
example = '\U00100100'
239240
else:
240241
example = self.example
@@ -1079,7 +1080,7 @@ def test_buffer(self):
10791080
self.assertEqual(m.tobytes(), expected)
10801081
self.assertRaises(BufferError, a.frombytes, a.tobytes())
10811082
self.assertEqual(m.tobytes(), expected)
1082-
if self.typecode == 'u':
1083+
if self.typecode in ('u', 'w'):
10831084
self.assertRaises(BufferError, a.fromunicode, a.tounicode())
10841085
self.assertEqual(m.tobytes(), expected)
10851086
self.assertRaises(BufferError, operator.imul, a, 2)
@@ -1135,16 +1136,17 @@ def test_sizeof_without_buffer(self):
11351136
support.check_sizeof(self, a, basesize)
11361137

11371138
def test_initialize_with_unicode(self):
1138-
if self.typecode != 'u':
1139+
if self.typecode not in ('u', 'w'):
11391140
with self.assertRaises(TypeError) as cm:
11401141
a = array.array(self.typecode, 'foo')
11411142
self.assertIn("cannot use a str", str(cm.exception))
11421143
with self.assertRaises(TypeError) as cm:
1143-
a = array.array(self.typecode, array.array('u', 'foo'))
1144+
a = array.array(self.typecode, array.array('w', 'foo'))
11441145
self.assertIn("cannot use a unicode array", str(cm.exception))
11451146
else:
11461147
a = array.array(self.typecode, "foo")
11471148
a = array.array(self.typecode, array.array('u', 'foo'))
1149+
a = array.array(self.typecode, array.array('w', 'foo'))
11481150

11491151
@support.cpython_only
11501152
def test_obsolete_write_lock(self):
@@ -1171,40 +1173,45 @@ class UnicodeTest(StringTest, unittest.TestCase):
11711173
smallerexample = '\x01\u263a\x00\ufefe'
11721174
biggerexample = '\x01\u263a\x01\ufeff'
11731175
outside = str('\x33')
1174-
minitemsize = 2
1176+
minitemsize = sizeof_wchar
11751177

11761178
def test_unicode(self):
11771179
self.assertRaises(TypeError, array.array, 'b', 'foo')
11781180

1179-
a = array.array('u', '\xa0\xc2\u1234')
1181+
a = array.array(self.typecode, '\xa0\xc2\u1234')
11801182
a.fromunicode(' ')
11811183
a.fromunicode('')
11821184
a.fromunicode('')
11831185
a.fromunicode('\x11abc\xff\u1234')
11841186
s = a.tounicode()
11851187
self.assertEqual(s, '\xa0\xc2\u1234 \x11abc\xff\u1234')
1186-
self.assertEqual(a.itemsize, sizeof_wchar)
1188+
self.assertEqual(a.itemsize, self.minitemsize)
11871189

11881190
s = '\x00="\'a\\b\x80\xff\u0000\u0001\u1234'
1189-
a = array.array('u', s)
1191+
a = array.array(self.typecode, s)
11901192
self.assertEqual(
11911193
repr(a),
1192-
"array('u', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
1194+
f"array('{self.typecode}', '\\x00=\"\\'a\\\\b\\x80\xff\\x00\\x01\u1234')")
11931195

11941196
self.assertRaises(TypeError, a.fromunicode)
11951197

11961198
def test_issue17223(self):
1197-
# this used to crash
1198-
if sizeof_wchar == 4:
1199-
# U+FFFFFFFF is an invalid code point in Unicode 6.0
1200-
invalid_str = b'\xff\xff\xff\xff'
1201-
else:
1199+
if self.typecode == 'u' and sizeof_wchar == 2:
12021200
# PyUnicode_FromUnicode() cannot fail with 16-bit wchar_t
12031201
self.skipTest("specific to 32-bit wchar_t")
1204-
a = array.array('u', invalid_str)
1202+
1203+
# this used to crash
1204+
# U+FFFFFFFF is an invalid code point in Unicode 6.0
1205+
invalid_str = b'\xff\xff\xff\xff'
1206+
1207+
a = array.array(self.typecode, invalid_str)
12051208
self.assertRaises(ValueError, a.tounicode)
12061209
self.assertRaises(ValueError, str, a)
12071210

1211+
class UCS4Test(UnicodeTest):
1212+
typecode = 'w'
1213+
minitemsize = 4
1214+
12081215
class NumberTest(BaseTest):
12091216

12101217
def test_extslice(self):

Lib/test/test_csv.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -955,7 +955,7 @@ def test_float_write(self):
955955

956956
def test_char_write(self):
957957
import array, string
958-
a = array.array('u', string.ascii_letters)
958+
a = array.array('w', string.ascii_letters)
959959

960960
with TemporaryFile("w+", encoding="utf-8", newline='') as fileobj:
961961
writer = csv.writer(fileobj, dialect="excel")
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
:mod:`array`: Add ``'w'`` typecode that represents ``Py_UCS4``.

0 commit comments

Comments
 (0)