@@ -135,19 +135,67 @@ char _PyIO_get_console_type(PyObject *path_or_fd) {
135
135
}
136
136
137
137
static DWORD
138
- _find_last_utf8_boundary (const char * buf , DWORD len )
138
+ _find_last_utf8_boundary (const unsigned char * buf , DWORD len )
139
139
{
140
- /* This function never returns 0, returns the original len instead */
141
- DWORD count = 1 ;
142
- if (len == 0 || (buf [len - 1 ] & 0x80 ) == 0 ) {
143
- return len ;
144
- }
145
- for (;; count ++ ) {
146
- if (count > 3 || count >= len ) {
140
+ for (DWORD count = 1 ; count < 4 && count <= len ; count ++ ) {
141
+ unsigned char c = buf [len - count ];
142
+ if (c < 0x80 ) {
143
+ /* No starting byte found. */
147
144
return len ;
148
145
}
149
- if ((buf [len - count ] & 0xc0 ) != 0x80 ) {
150
- return len - count ;
146
+ if (c >= 0xc0 ) {
147
+ if (c < 0xe0 /* 2-bytes sequence */ ? count < 2 :
148
+ c < 0xf0 /* 3-bytes sequence */ ? count < 3 :
149
+ c < 0xf8 /* 4-bytes sequence */ )
150
+ {
151
+ /* Incomplete multibyte sequence. */
152
+ return len - count ;
153
+ }
154
+ /* Either complete or invalid sequence. */
155
+ return len ;
156
+ }
157
+ }
158
+ /* Either complete 4-bytes sequence or invalid sequence. */
159
+ return len ;
160
+ }
161
+
162
+ /* Find the number of UTF-8 bytes that corresponds to the specified number of
163
+ * wchars.
164
+ * I.e. find x <= len so that MultiByteToWideChar(CP_UTF8, 0, s, x, NULL, 0) == n.
165
+ *
166
+ * WideCharToMultiByte() cannot be used for this, because the UTF-8 -> wchar
167
+ * conversion is not reversible (invalid UTF-8 byte produces \ufffd which
168
+ * will be converted back to 3-bytes UTF-8 sequence \xef\xbf\xbd).
169
+ * So we need to use binary search.
170
+ */
171
+ static DWORD
172
+ _wchar_to_utf8_count (const unsigned char * s , DWORD len , DWORD n )
173
+ {
174
+ DWORD start = 0 ;
175
+ while (1 ) {
176
+ DWORD mid = 0 ;
177
+ for (DWORD i = len / 2 ; i <= len ; i ++ ) {
178
+ mid = _find_last_utf8_boundary (s , i );
179
+ if (mid != 0 ) {
180
+ break ;
181
+ }
182
+ /* The middle could split the first multibytes sequence. */
183
+ }
184
+ if (mid == len ) {
185
+ return start + len ;
186
+ }
187
+ if (mid == 0 ) {
188
+ mid = len > 1 ? len - 1 : 1 ;
189
+ }
190
+ DWORD wlen = MultiByteToWideChar (CP_UTF8 , 0 , s , mid , NULL , 0 );
191
+ if (wlen <= n ) {
192
+ s += mid ;
193
+ start += mid ;
194
+ len -= mid ;
195
+ n -= wlen ;
196
+ }
197
+ else {
198
+ len = mid ;
151
199
}
152
200
}
153
201
}
@@ -556,8 +604,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
556
604
int err = 0 , sig = 0 ;
557
605
558
606
wchar_t * buf = (wchar_t * )PyMem_Malloc (maxlen * sizeof (wchar_t ));
559
- if (!buf )
607
+ if (!buf ) {
608
+ PyErr_NoMemory ();
560
609
goto error ;
610
+ }
561
611
562
612
* readlen = 0 ;
563
613
@@ -615,6 +665,7 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
615
665
Py_UNBLOCK_THREADS
616
666
if (!newbuf ) {
617
667
sig = -1 ;
668
+ PyErr_NoMemory ();
618
669
break ;
619
670
}
620
671
buf = newbuf ;
@@ -638,8 +689,10 @@ read_console_w(HANDLE handle, DWORD maxlen, DWORD *readlen) {
638
689
if (* readlen > 0 && buf [0 ] == L'\x1a' ) {
639
690
PyMem_Free (buf );
640
691
buf = (wchar_t * )PyMem_Malloc (sizeof (wchar_t ));
641
- if (!buf )
692
+ if (!buf ) {
693
+ PyErr_NoMemory ();
642
694
goto error ;
695
+ }
643
696
buf [0 ] = L'\0' ;
644
697
* readlen = 0 ;
645
698
}
@@ -817,8 +870,10 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
817
870
bufsize = BUFSIZ ;
818
871
819
872
buf = (wchar_t * )PyMem_Malloc ((bufsize + 1 ) * sizeof (wchar_t ));
820
- if (buf == NULL )
873
+ if (buf == NULL ) {
874
+ PyErr_NoMemory ();
821
875
return NULL ;
876
+ }
822
877
823
878
while (1 ) {
824
879
wchar_t * subbuf ;
@@ -840,6 +895,7 @@ _io__WindowsConsoleIO_readall_impl(winconsoleio *self)
840
895
(bufsize + 1 ) * sizeof (wchar_t ));
841
896
if (tmp == NULL ) {
842
897
PyMem_Free (buf );
898
+ PyErr_NoMemory ();
843
899
return NULL ;
844
900
}
845
901
buf = tmp ;
@@ -1015,43 +1071,49 @@ _io__WindowsConsoleIO_write_impl(winconsoleio *self, PyTypeObject *cls,
1015
1071
len = (DWORD )b -> len ;
1016
1072
1017
1073
Py_BEGIN_ALLOW_THREADS
1018
- wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , NULL , 0 );
1019
-
1020
1074
/* issue11395 there is an unspecified upper bound on how many bytes
1021
1075
can be written at once. We cap at 32k - the caller will have to
1022
1076
handle partial writes.
1023
1077
Since we don't know how many input bytes are being ignored, we
1024
1078
have to reduce and recalculate. */
1025
- while (wlen > 32766 / sizeof (wchar_t )) {
1026
- len /= 2 ;
1079
+ const DWORD max_wlen = 32766 / sizeof (wchar_t );
1080
+ /* UTF-8 to wchar ratio is at most 3:1. */
1081
+ len = Py_MIN (len , max_wlen * 3 );
1082
+ while (1 ) {
1027
1083
/* Fix for github issues gh-110913 and gh-82052. */
1028
1084
len = _find_last_utf8_boundary (b -> buf , len );
1029
1085
wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , NULL , 0 );
1086
+ if (wlen <= max_wlen ) {
1087
+ break ;
1088
+ }
1089
+ len /= 2 ;
1030
1090
}
1031
1091
Py_END_ALLOW_THREADS
1032
1092
1033
- if (!wlen )
1034
- return PyErr_SetFromWindowsErr (0 );
1093
+ if (!wlen ) {
1094
+ return PyLong_FromLong (0 );
1095
+ }
1035
1096
1036
1097
wbuf = (wchar_t * )PyMem_Malloc (wlen * sizeof (wchar_t ));
1098
+ if (!wbuf ) {
1099
+ PyErr_NoMemory ();
1100
+ return NULL ;
1101
+ }
1037
1102
1038
1103
Py_BEGIN_ALLOW_THREADS
1039
1104
wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len , wbuf , wlen );
1040
1105
if (wlen ) {
1041
1106
res = WriteConsoleW (handle , wbuf , wlen , & n , NULL );
1107
+ #ifdef Py_DEBUG
1108
+ if (res ) {
1109
+ #else
1042
1110
if (res && n < wlen ) {
1111
+ #endif
1043
1112
/* Wrote fewer characters than expected, which means our
1044
1113
* len value may be wrong. So recalculate it from the
1045
- * characters that were written. As this could potentially
1046
- * result in a different value, we also validate that value.
1114
+ * characters that were written.
1047
1115
*/
1048
- len = WideCharToMultiByte (CP_UTF8 , 0 , wbuf , n ,
1049
- NULL , 0 , NULL , NULL );
1050
- if (len ) {
1051
- wlen = MultiByteToWideChar (CP_UTF8 , 0 , b -> buf , len ,
1052
- NULL , 0 );
1053
- assert (wlen == len );
1054
- }
1116
+ len = _wchar_to_utf8_count (b -> buf , len , n );
1055
1117
}
1056
1118
} else
1057
1119
res = 0 ;
0 commit comments