Skip to content

Commit c3bb32d

Browse files
[3.12] gh-124008: Fix calculation of the number of written bytes for the Windows console (GH-124059) (GH-127326)
Since the MultiByteToWideChar()/WideCharToMultiByte() round trip is not reversible if the data contains invalid UTF-8 sequences, use binary search to calculate the number of written bytes from the number of written characters. Also fix writing of incomplete UTF-8 sequences and the handling of memory allocation failures. (cherry picked from commit 3cf83d9) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 97ed216 commit c3bb32d

File tree

3 files changed

+115
-28
lines changed

3 files changed

+115
-28
lines changed

Lib/test/test_winconsoleio.py

+23
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,29 @@ def test_write_empty_data(self):
126126
with ConIO('CONOUT$', 'w') as f:
127127
self.assertEqual(f.write(b''), 0)
128128

129+
@requires_resource('console')
130+
def test_write(self):
131+
testcases = []
132+
with ConIO('CONOUT$', 'w') as f:
133+
for a in [
134+
b'',
135+
b'abc',
136+
b'\xc2\xa7\xe2\x98\x83\xf0\x9f\x90\x8d',
137+
b'\xff'*10,
138+
]:
139+
for b in b'\xc2\xa7', b'\xe2\x98\x83', b'\xf0\x9f\x90\x8d':
140+
testcases.append(a + b)
141+
for i in range(1, len(b)):
142+
data = a + b[:i]
143+
testcases.append(data + b'z')
144+
testcases.append(data + b'\xff')
145+
# incomplete multibyte sequence
146+
with self.subTest(data=data):
147+
self.assertEqual(f.write(data), len(a))
148+
for data in testcases:
149+
with self.subTest(data=data):
150+
self.assertEqual(f.write(data), len(data))
151+
129152
def assertStdinRoundTrip(self, text):
130153
stdin = open('CONIN$', 'r')
131154
old_stdin = sys.stdin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix a possible crash (in debug builds), incorrect output, or an incorrect
2+
return value from raw binary ``write()`` when writing to the console on Windows.

Modules/_io/winconsoleio.c

+90-28
Original file line numberDiff line numberDiff line change
@@ -135,19 +135,67 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
135135
}
136136

137137
static DWORD
138-
_find_last_utf8_boundary(const char *buf, DWORD len)
138+
_find_last_utf8_boundary(const unsigned char *buf, DWORD len)
139139
{
140-
/* This function never returns 0, returns the original len instead */
141-
DWORD count = 1;
142-
if (len == 0 || (buf[len - 1] & 0x80) == 0) {
143-
return len;
144-
}
145-
for (;; count++) {
146-
if (count > 3 || count >= len) {
140+
for (DWORD count = 1; count < 4 && count <= len; count++) {
141+
unsigned char c = buf[len - count];
142+
if (c < 0x80) {
143+
/* No starting byte found. */
147144
return len;
148145
}
149-
if ((buf[len - count] & 0xc0) != 0x80) {
150-
return len - count;
146+
if (c >= 0xc0) {
147+
if (c < 0xe0 /* 2-bytes sequence */ ? count < 2 :
148+
c < 0xf0 /* 3-bytes sequence */ ? count < 3 :
149+
c < 0xf8 /* 4-bytes sequence */)
150+
{
151+
/* Incomplete multibyte sequence. */
152+
return len - count;
153+
}
154+
/* Either complete or invalid sequence. */
155+
return len;
156+
}
157+
}
158+
/* Either complete 4-bytes sequence or invalid sequence. */
159+
return len;
160+
}
161+
162+
/* Find the number of UTF-8 bytes that corresponds to the specified number of
163+
* wchars.
164+
* I.e. find x <= len so that MultiByteToWideChar(CP_UTF8, 0, s, x, NULL, 0) == n.
165+
*
166+
* WideCharToMultiByte() cannot be used for this, because the UTF-8 -> wchar
167+
* conversion is not reversible (an invalid UTF-8 byte produces \ufffd, which
168+
* is converted back to the 3-byte UTF-8 sequence \xef\xbf\xbd).
169+
* So we need to use binary search.
170+
*/
171+
static DWORD
172+
_wchar_to_utf8_count(const unsigned char *s, DWORD len, DWORD n)
173+
{
174+
DWORD start = 0;
175+
while (1) {
176+
DWORD mid = 0;
177+
for (DWORD i = len / 2; i <= len; i++) {
178+
mid = _find_last_utf8_boundary(s, i);
179+
if (mid != 0) {
180+
break;
181+
}
182+
/* The middle could split the first multibytes sequence. */
183+
}
184+
if (mid == len) {
185+
return start + len;
186+
}
187+
if (mid == 0) {
188+
mid = len > 1 ? len - 1 : 1;
189+
}
190+
DWORD wlen = MultiByteToWideChar(CP_UTF8, 0, s, mid, NULL, 0);
191+
if (wlen <= n) {
192+
s += mid;
193+
start += mid;
194+
len -= mid;
195+
n -= wlen;
196+
}
197+
else {
198+
len = mid;
151199
}
152200
}
153201
}
@@ -556,8 +604,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
556604
int err = 0, sig = 0;
557605

558606
wchar_t *buf = (wchar_t*)PyMem_Malloc(maxlen * sizeof(wchar_t));
559-
if (!buf)
607+
if (!buf) {
608+
PyErr_NoMemory();
560609
goto error;
610+
}
561611

562612
*readlen = 0;
563613

@@ -615,6 +665,7 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
615665
Py_UNBLOCK_THREADS
616666
if (!newbuf) {
617667
sig = -1;
668+
PyErr_NoMemory();
618669
break;
619670
}
620671
buf = newbuf;
@@ -638,8 +689,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
638689
if (*readlen > 0 && buf[0] == L'\x1a') {
639690
PyMem_Free(buf);
640691
buf = (wchar_t *)PyMem_Malloc(sizeof(wchar_t));
641-
if (!buf)
692+
if (!buf) {
693+
PyErr_NoMemory();
642694
goto error;
695+
}
643696
buf[0] = L'\0';
644697
*readlen = 0;
645698
}
@@ -817,8 +870,10 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
817870
bufsize = BUFSIZ;
818871

819872
buf = (wchar_t*)PyMem_Malloc((bufsize + 1) * sizeof(wchar_t));
820-
if (buf == NULL)
873+
if (buf == NULL) {
874+
PyErr_NoMemory();
821875
return NULL;
876+
}
822877

823878
while (1) {
824879
wchar_t *subbuf;
@@ -840,6 +895,7 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
840895
(bufsize + 1) * sizeof(wchar_t));
841896
if (tmp == NULL) {
842897
PyMem_Free(buf);
898+
PyErr_NoMemory();
843899
return NULL;
844900
}
845901
buf = tmp;
@@ -1015,43 +1071,49 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
10151071
len = (DWORD)b->len;
10161072

10171073
Py_BEGIN_ALLOW_THREADS
1018-
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
1019-
10201074
/* issue11395 there is an unspecified upper bound on how many bytes
10211075
can be written at once. We cap at 32k - the caller will have to
10221076
handle partial writes.
10231077
Since we don't know how many input bytes are being ignored, we
10241078
have to reduce and recalculate. */
1025-
while (wlen > 32766 / sizeof(wchar_t)) {
1026-
len /= 2;
1079+
const DWORD max_wlen = 32766 / sizeof(wchar_t);
1080+
/* UTF-8 to wchar ratio is at most 3:1. */
1081+
len = Py_MIN(len, max_wlen * 3);
1082+
while (1) {
10271083
/* Fix for github issues gh-110913 and gh-82052. */
10281084
len = _find_last_utf8_boundary(b->buf, len);
10291085
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, NULL, 0);
1086+
if (wlen <= max_wlen) {
1087+
break;
1088+
}
1089+
len /= 2;
10301090
}
10311091
Py_END_ALLOW_THREADS
10321092

1033-
if (!wlen)
1034-
return PyErr_SetFromWindowsErr(0);
1093+
if (!wlen) {
1094+
return PyLong_FromLong(0);
1095+
}
10351096

10361097
wbuf = (wchar_t*)PyMem_Malloc(wlen * sizeof(wchar_t));
1098+
if (!wbuf) {
1099+
PyErr_NoMemory();
1100+
return NULL;
1101+
}
10371102

10381103
Py_BEGIN_ALLOW_THREADS
10391104
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len, wbuf, wlen);
10401105
if (wlen) {
10411106
res = WriteConsoleW(handle, wbuf, wlen, &n, NULL);
1107+
#ifdef Py_DEBUG
1108+
if (res) {
1109+
#else
10421110
if (res && n < wlen) {
1111+
#endif
10431112
/* Wrote fewer characters than expected, which means our
10441113
* len value may be wrong. So recalculate it from the
1045-
* characters that were written. As this could potentially
1046-
* result in a different value, we also validate that value.
1114+
* characters that were written.
10471115
*/
1048-
len = WideCharToMultiByte(CP_UTF8, 0, wbuf, n,
1049-
NULL, 0, NULL, NULL);
1050-
if (len) {
1051-
wlen = MultiByteToWideChar(CP_UTF8, 0, b->buf, len,
1052-
NULL, 0);
1053-
assert(wlen == len);
1054-
}
1116+
len = _wchar_to_utf8_count(b->buf, len, n);
10551117
}
10561118
} else
10571119
res = 0;

0 commit comments

Comments
 (0)