From 4625289ca6cfc9c90f43953cd98a39ee5791ca7c Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Thu, 5 Jun 2025 22:42:14 +0900 Subject: [PATCH 1/3] Add UTF-32 functions --- Lib/_pycodecs.py | 1491 ++++++++++++++++++++++++-------------- Lib/test/string_tests.py | 2 - Lib/test/test_array.py | 2 - vm/src/stdlib/codecs.rs | 29 +- 4 files changed, 982 insertions(+), 542 deletions(-) diff --git a/Lib/_pycodecs.py b/Lib/_pycodecs.py index d0efa9ad6b..48483f5772 100644 --- a/Lib/_pycodecs.py +++ b/Lib/_pycodecs.py @@ -22,10 +22,10 @@ The builtin Unicode codecs use the following interface: - _encode(Unicode_object[,errors='strict']) -> + _encode(Unicode_object[,errors='strict']) -> (string object, bytes consumed) - _decode(char_buffer_obj[,errors='strict']) -> + _decode(char_buffer_obj[,errors='strict']) -> (Unicode object, bytes consumed) _encode() interfaces also accept non-Unicode object as @@ -44,47 +44,82 @@ From PyPy v1.0.0 """ -#from unicodecodec import * - -__all__ = ['register', 'lookup', 'lookup_error', 'register_error', 'encode', 'decode', - 'latin_1_encode', 'mbcs_decode', 'readbuffer_encode', 'escape_encode', - 'utf_8_decode', 'raw_unicode_escape_decode', 'utf_7_decode', - 'unicode_escape_encode', 'latin_1_decode', 'utf_16_decode', - 'unicode_escape_decode', 'ascii_decode', 'charmap_encode', 'charmap_build', - 'unicode_internal_encode', 'unicode_internal_decode', 'utf_16_ex_decode', - 'escape_decode', 'charmap_decode', 'utf_7_encode', 'mbcs_encode', - 'ascii_encode', 'utf_16_encode', 'raw_unicode_escape_encode', 'utf_8_encode', - 'utf_16_le_encode', 'utf_16_be_encode', 'utf_16_le_decode', 'utf_16_be_decode',] +# from unicodecodec import * + +__all__ = [ + "register", + "lookup", + "lookup_error", + "register_error", + "encode", + "decode", + "latin_1_encode", + "mbcs_decode", + "readbuffer_encode", + "escape_encode", + "utf_8_decode", + "raw_unicode_escape_decode", + "utf_7_decode", + "unicode_escape_encode", + "latin_1_decode", + "utf_16_decode", + "unicode_escape_decode", + "ascii_decode", + "charmap_encode", + "charmap_build", + "unicode_internal_encode", + "unicode_internal_decode", + "utf_16_ex_decode", + "escape_decode", + "charmap_decode", + "utf_7_encode", + "mbcs_encode", + "ascii_encode", + "utf_16_encode", + "raw_unicode_escape_encode", + "utf_8_encode", + "utf_16_le_encode", + "utf_16_be_encode", + "utf_16_le_decode", + "utf_16_be_decode", + "utf_32_encode", + "utf_32_decode", + "utf_32_le_encode", + "utf_32_le_decode", + "utf_32_be_encode", + "utf_32_be_decode", + "utf_32_ex_decode", +] import sys import warnings from _codecs import * -def latin_1_encode( obj, errors='strict'): - """None - """ +def latin_1_encode(obj, errors="strict"): + """None""" res = PyUnicode_EncodeLatin1(obj, len(obj), errors) res = bytes(res) return res, len(obj) + + # XXX MBCS codec might involve ctypes ? def mbcs_decode(): - """None - """ + """None""" pass -def readbuffer_encode( obj, errors='strict'): - """None - """ + +def readbuffer_encode(obj, errors="strict"): + """None""" if isinstance(obj, str): res = obj.encode() else: res = bytes(obj) return res, len(obj) -def escape_encode( obj, errors='strict'): - """None - """ + +def escape_encode(obj, errors="strict"): + """None""" if not isinstance(obj, bytes): raise TypeError("must be bytes") s = repr(obj).encode() @@ -93,85 +128,88 @@ def escape_encode( obj, errors='strict'): v = v.replace(b"'", b"\\'").replace(b'\\"', b'"') return v, len(obj) -def raw_unicode_escape_decode( data, errors='strict', final=False): - """None - """ + +def raw_unicode_escape_decode(data, errors="strict", final=False): + """None""" res = PyUnicode_DecodeRawUnicodeEscape(data, len(data), errors, final) - res = ''.join(res) + res = "".join(res) return res, len(data) -def utf_7_decode( data, errors='strict'): - """None - """ + +def utf_7_decode(data, errors="strict"): + """None""" res = PyUnicode_DecodeUTF7(data, len(data), errors) - res = ''.join(res) + res = "".join(res) return res, len(data) -def unicode_escape_encode( obj, errors='strict'): - """None - """ + +def unicode_escape_encode(obj, errors="strict"): + """None""" res = unicodeescape_string(obj, len(obj), 0) - res = b''.join(res) + res = b"".join(res) return res, len(obj) -def latin_1_decode( data, errors='strict'): - """None - """ + +def latin_1_decode(data, errors="strict"): + """None""" res = PyUnicode_DecodeLatin1(data, len(data), errors) - res = ''.join(res) + res = "".join(res) return res, len(data) -def utf_16_decode( data, errors='strict', final=False): - """None - """ + +def utf_16_decode(data, errors="strict", final=False): + """None""" consumed = len(data) if final: consumed = 0 - res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'native', final) - res = ''.join(res) + res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful( + data, len(data), errors, "native", final + ) + res = "".join(res) return res, consumed -def unicode_escape_decode( data, errors='strict', final=False): - """None - """ + +def unicode_escape_decode(data, errors="strict", final=False): + """None""" res = PyUnicode_DecodeUnicodeEscape(data, len(data), errors, final) - res = ''.join(res) + res = "".join(res) return res, len(data) -def ascii_decode( data, errors='strict'): - """None - """ +def ascii_decode(data, errors="strict"): + """None""" res = PyUnicode_DecodeASCII(data, len(data), errors) - res = ''.join(res) + res = "".join(res) return res, len(data) -def charmap_encode(obj, errors='strict', mapping='latin-1'): - """None - """ + +def charmap_encode(obj, errors="strict", mapping="latin-1"): + """None""" res = PyUnicode_EncodeCharmap(obj, len(obj), mapping, errors) res = bytes(res) return res, len(obj) + def charmap_build(s): return {ord(c): i for i, c in enumerate(s)} + if sys.maxunicode == 65535: unicode_bytes = 2 else: unicode_bytes = 4 -def unicode_internal_encode( obj, errors='strict'): - """None - """ + +def unicode_internal_encode(obj, errors="strict"): + """None""" if type(obj) == str: p = bytearray() t = [ord(x) for x in obj] for i in t: b = bytearray() for j in range(unicode_bytes): - b.append(i%256) + b.append(i % 256) i >>= 8 if sys.byteorder == "big": b.reverse() @@ -179,12 +217,12 @@ def unicode_internal_encode( obj, errors='strict'): res = bytes(p) return res, len(res) else: - res = "You can do better than this" # XXX make this right + res = "You can do better than this" # XXX make this right return res, len(res) -def unicode_internal_decode( unistr, errors='strict'): - """None - """ + +def unicode_internal_decode(unistr, errors="strict"): + """None""" if type(unistr) == str: return unistr, len(unistr) else: @@ -198,75 +236,76 @@ def unicode_internal_decode( unistr, errors='strict'): start = 0 stop = unicode_bytes step = 1 - while i < len(unistr)-unicode_bytes+1: + while i < len(unistr) - unicode_bytes + 1: t = 0 h = 0 for j in range(start, stop, step): - t += ord(unistr[i+j])<<(h*8) + t += ord(unistr[i + j]) << (h * 8) h += 1 i += unicode_bytes p += chr(t) - res = ''.join(p) + res = "".join(p) return res, len(res) -def utf_16_ex_decode( data, errors='strict', byteorder=0, final=0): - """None - """ + +def utf_16_ex_decode(data, errors="strict", byteorder=0, final=0): + """None""" if byteorder == 0: - bm = 'native' + bm = "native" elif byteorder == -1: - bm = 'little' + bm = "little" else: - bm = 'big' + bm = "big" consumed = len(data) if final: consumed = 0 - res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, bm, final) - res = ''.join(res) + res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful( + data, len(data), errors, bm, final + ) + res = "".join(res) return res, consumed, byteorder + # XXX needs error messages when the input is invalid -def escape_decode(data, errors='strict'): - """None - """ +def escape_decode(data, errors="strict"): + """None""" l = len(data) i = 0 res = bytearray() while i < l: - - if data[i] == '\\': + if data[i] == "\\": i += 1 if i >= l: raise ValueError("Trailing \\ in string") else: - if data[i] == '\\': - res += b'\\' - elif data[i] == 'n': - res += b'\n' - elif data[i] == 't': - res += b'\t' - elif data[i] == 'r': - res += b'\r' - elif data[i] == 'b': - res += b'\b' - elif data[i] == '\'': - res += b'\'' - elif data[i] == '\"': - res += b'\"' - elif data[i] == 'f': - res += b'\f' - elif data[i] == 'a': - res += b'\a' - elif data[i] == 'v': - res += b'\v' - elif '0' <= data[i] <= '9': + if data[i] == "\\": + res += b"\\" + elif data[i] == "n": + res += b"\n" + elif data[i] == "t": + res += b"\t" + elif data[i] == "r": + res += b"\r" + elif data[i] == "b": + res += b"\b" + elif data[i] == "'": + res += b"'" + elif data[i] == '"': + res += b'"' + elif data[i] == "f": + res += b"\f" + elif data[i] == "a": + res += b"\a" + elif data[i] == "v": + res += b"\v" + elif "0" <= data[i] <= "9": # emulate a strange wrap-around behavior of CPython: # \400 is the same as \000 because 0400 == 256 - octal = data[i:i+3] + octal = data[i : i + 3] res.append(int(octal, 8) & 0xFF) i += 2 - elif data[i] == 'x': - hexa = data[i+1:i+3] + elif data[i] == "x": + hexa = data[i + 1 : i + 3] res.append(int(hexa, 16)) i += 2 else: @@ -275,88 +314,160 @@ def escape_decode(data, errors='strict'): res = bytes(res) return res, len(res) -def charmap_decode( data, errors='strict', mapping=None): - """None - """ + +def charmap_decode(data, errors="strict", mapping=None): + """None""" res = PyUnicode_DecodeCharmap(data, len(data), mapping, errors) - res = ''.join(res) + res = "".join(res) return res, len(data) -def utf_7_encode( obj, errors='strict'): - """None - """ +def utf_7_encode(obj, errors="strict"): + """None""" res = PyUnicode_EncodeUTF7(obj, len(obj), 0, 0, errors) - res = b''.join(res) + res = b"".join(res) return res, len(obj) -def mbcs_encode( obj, errors='strict'): - """None - """ + +def mbcs_encode(obj, errors="strict"): + """None""" pass + + ## return (PyUnicode_EncodeMBCS( -## (obj), +## (obj), ## len(obj), ## errors), ## len(obj)) - -def ascii_encode( obj, errors='strict'): - """None - """ + +def ascii_encode(obj, errors="strict"): + """None""" res = PyUnicode_EncodeASCII(obj, len(obj), errors) res = bytes(res) return res, len(obj) -def utf_16_encode( obj, errors='strict'): - """None - """ - res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'native') + +def utf_16_encode(obj, errors="strict"): + """None""" + res = PyUnicode_EncodeUTF16(obj, len(obj), errors, "native") res = bytes(res) return res, len(obj) -def raw_unicode_escape_encode( obj, errors='strict'): - """None - """ + +def raw_unicode_escape_encode(obj, errors="strict"): + """None""" res = PyUnicode_EncodeRawUnicodeEscape(obj, len(obj)) res = bytes(res) return res, len(obj) -def utf_16_le_encode( obj, errors='strict'): - """None - """ - res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'little') + +def utf_16_le_encode(obj, errors="strict"): + """None""" + res = PyUnicode_EncodeUTF16(obj, len(obj), errors, "little") + res = bytes(res) + return res, len(obj) + + +def utf_16_be_encode(obj, errors="strict"): + """None""" + res = PyUnicode_EncodeUTF16(obj, len(obj), errors, "big") + res = bytes(res) + return res, len(obj) + + +def utf_16_le_decode(data, errors="strict", byteorder=0, final=0): + """None""" + consumed = len(data) + if final: + consumed = 0 + res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful( + data, len(data), errors, "little", final + ) + res = "".join(res) + return res, consumed + + +def utf_16_be_decode(data, errors="strict", byteorder=0, final=0): + """None""" + consumed = len(data) + if final: + consumed = 0 + res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful( + data, len(data), errors, "big", final + ) + res = "".join(res) + return res, consumed + + +# UTF-32 codec functions +def utf_32_encode(obj, errors="strict"): + """None""" + res = PyUnicode_EncodeUTF32(obj, len(obj), errors, "native") + res = bytes(res) + return res, len(obj) + + +def utf_32_le_encode(obj, errors="strict"): + """None""" + res = PyUnicode_EncodeUTF32(obj, len(obj), errors, "little") res = bytes(res) return res, len(obj) -def utf_16_be_encode( obj, errors='strict'): - """None - """ - res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'big') + +def utf_32_be_encode(obj, errors="strict"): + """None""" + res = PyUnicode_EncodeUTF32(obj, len(obj), errors, "big") res = bytes(res) return res, len(obj) -def utf_16_le_decode( data, errors='strict', byteorder=0, final = 0): - """None - """ + +def utf_32_decode(data, errors="strict", final=False): + """None""" + consumed = len(data) + if final: + consumed = 0 + res, consumed, byteorder = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, "native", final + ) + res = "".join(res) + return res, consumed + + +def utf_32_le_decode(data, errors="strict", byteorder=0, final=0): + """None""" consumed = len(data) if final: consumed = 0 - res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'little', final) - res = ''.join(res) + res, consumed, byteorder = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, "little", final + ) + res = "".join(res) return res, consumed -def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0): - """None - """ + +def utf_32_be_decode(data, errors="strict", byteorder=0, final=0): + """None""" consumed = len(data) if final: consumed = 0 - res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'big', final) - res = ''.join(res) + res, consumed, byteorder = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, "big", final + ) + res = "".join(res) return res, consumed +def utf_32_ex_decode(data, errors="strict", byteorder=0, final=0): + """None""" + consumed = len(data) + if final: + consumed = 0 + res, consumed, byteorder = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, "native", final + ) + res = "".join(res) + return res, consumed, byteorder # ---------------------------------------------------------------------- @@ -364,9 +475,9 @@ def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0): ##import sys ##""" Python implementation of CPythons builtin unicode codecs. ## -## Generally the functions in this module take a list of characters an returns +## Generally the functions in this module take a list of characters an returns ## a list of characters. -## +## ## For use in the PyPy project""" @@ -376,50 +487,185 @@ def utf_16_be_decode( data, errors='strict', byteorder=0, final = 0): ## 1 - special ## 2 - whitespace (optional) ## 3 - RFC2152 Set O (optional) - + utf7_special = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, - 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, - 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 1, + 1, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 3, + 3, + 3, + 3, + 3, + 3, + 0, + 0, + 0, + 3, + 1, + 0, + 0, + 0, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 3, + 3, + 3, + 3, + 0, + 3, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 3, + 1, + 3, + 3, + 3, + 3, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 3, + 3, + 3, + 1, + 1, ] -unicode_latin1 = [None]*256 +unicode_latin1 = [None] * 256 def SPECIAL(c, encodeO, encodeWS): c = ord(c) - return (c>127 or utf7_special[c] == 1) or \ - (encodeWS and (utf7_special[(c)] == 2)) or \ - (encodeO and (utf7_special[(c)] == 3)) + return ( + (c > 127 or utf7_special[c] == 1) + or (encodeWS and (utf7_special[(c)] == 2)) + or (encodeO and (utf7_special[(c)] == 3)) + ) + + def B64(n): - return bytes([b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]]) + return bytes( + [ + b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[ + (n) & 0x3F + ] + ] + ) + + def B64CHAR(c): - return (c.isalnum() or (c) == b'+' or (c) == b'/') + return c.isalnum() or (c) == b"+" or (c) == b"/" + + def UB64(c): - if (c) == b'+' : - return 62 - elif (c) == b'/': - return 63 - elif (c) >= b'a': - return ord(c) - 71 - elif (c) >= b'A': - return ord(c) - 65 - else: + if (c) == b"+": + return 62 + elif (c) == b"/": + return 63 + elif (c) >= b"a": + return ord(c) - 71 + elif (c) >= b"A": + return ord(c) - 65 + else: return ord(c) + 4 -def ENCODE( ch, bits) : + +def ENCODE(ch, bits): out = [] - while (bits >= 6): - out += B64(ch >> (bits-6)) - bits -= 6 + while bits >= 6: + out += B64(ch >> (bits - 6)) + bits -= 6 return out, bits -def PyUnicode_DecodeUTF7(s, size, errors): +def PyUnicode_DecodeUTF7(s, size, errors): starts = s errmsg = "" inShift = 0 @@ -430,229 +676,233 @@ def PyUnicode_DecodeUTF7(s, size, errors): errorHandler = None exc = None - if (size == 0): - return '' + if size == 0: + return "" i = 0 while i < size: - ch = bytes([s[i]]) - if (inShift): - if ((ch == b'-') or not B64CHAR(ch)): + if inShift: + if (ch == b"-") or not B64CHAR(ch): inShift = 0 i += 1 - - while (bitsleft >= 16): - outCh = ((charsleft) >> (bitsleft-16)) & 0xffff + + while bitsleft >= 16: + outCh = ((charsleft) >> (bitsleft - 16)) & 0xFFFF bitsleft -= 16 - - if (surrogate): + + if surrogate: ## We have already generated an error for the high surrogate - ## so let's not bother seeing if the low surrogate is correct or not + ## so let's not bother seeing if the low surrogate is correct or not surrogate = 0 - elif (0xDC00 <= (outCh) and (outCh) <= 0xDFFF): - ## This is a surrogate pair. Unfortunately we can't represent - ## it in a 16-bit character + elif 0xDC00 <= (outCh) and (outCh) <= 0xDFFF: + ## This is a surrogate pair. Unfortunately we can't represent + ## it in a 16-bit character surrogate = 1 msg = "code pairs are not supported" - out, x = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i) + out, x = unicode_call_errorhandler( + errors, "utf-7", msg, s, i - 1, i + ) p.append(out) bitsleft = 0 break else: - p.append(chr(outCh )) - #p += out - if (bitsleft >= 6): -## /* The shift sequence has a partial character in it. If -## bitsleft < 6 then we could just classify it as padding -## but that is not the case here */ + p.append(chr(outCh)) + # p += out + if bitsleft >= 6: + ## /* The shift sequence has a partial character in it. If + ## bitsleft < 6 then we could just classify it as padding + ## but that is not the case here */ msg = "partial character in shift sequence" - out, x = unicode_call_errorhandler(errors, 'utf-7', msg, s, i-1, i) - -## /* According to RFC2152 the remaining bits should be zero. We -## choose to signal an error/insert a replacement character -## here so indicate the potential of a misencoded character. */ - -## /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ -## if (bitsleft and (charsleft << (sizeof(charsleft) * 8 - bitsleft))): -## raise UnicodeDecodeError, "non-zero padding bits in shift sequence" - if (ch == b'-') : - if ((i < size) and (s[i] == '-')) : - p += '-' + out, x = unicode_call_errorhandler( + errors, "utf-7", msg, s, i - 1, i + ) + + ## /* According to RFC2152 the remaining bits should be zero. We + ## choose to signal an error/insert a replacement character + ## here so indicate the potential of a misencoded character. */ + + ## /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */ + ## if (bitsleft and (charsleft << (sizeof(charsleft) * 8 - bitsleft))): + ## raise UnicodeDecodeError, "non-zero padding bits in shift sequence" + if ch == b"-": + if (i < size) and (s[i] == "-"): + p += "-" inShift = 1 - - elif SPECIAL(ch, 0, 0) : - raise UnicodeDecodeError("unexpected special character") - - else: + + elif SPECIAL(ch, 0, 0): + raise UnicodeDecodeError("unexpected special character") + + else: p.append(chr(ord(ch))) else: charsleft = (charsleft << 6) | UB64(ch) bitsleft += 6 i += 1 -## /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); - elif ( ch == b'+' ): + ## /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate); + elif ch == b"+": startinpos = i i += 1 - if (i 0 else: out.append(bytes([ord(ch)])) else: - if (not SPECIAL(ch, encodeSetO, encodeWhiteSpace)): - out.append(B64((charsleft) << (6-bitsleft))) + if not SPECIAL(ch, encodeSetO, encodeWhiteSpace): + out.append(B64((charsleft) << (6 - bitsleft))) charsleft = 0 bitsleft = 0 -## /* Characters not in the BASE64 set implicitly unshift the sequence -## so no '-' is required, except if the character is itself a '-' */ - if (B64CHAR(ch) or ch == '-'): - out.append(b'-') + ## /* Characters not in the BASE64 set implicitly unshift the sequence + ## so no '-' is required, except if the character is itself a '-' */ + if B64CHAR(ch) or ch == "-": + out.append(b"-") inShift = False out.append(bytes([ord(ch)])) else: bitsleft += 16 - charsleft = (((charsleft) << 16) | ord(ch)) - p, bitsleft = ENCODE(charsleft, bitsleft) + charsleft = ((charsleft) << 16) | ord(ch) + p, bitsleft = ENCODE(charsleft, bitsleft) out.append(p) -## /* If the next character is special then we dont' need to terminate -## the shift sequence. If the next character is not a BASE64 character -## or '-' then the shift sequence will be terminated implicitly and we -## don't have to insert a '-'. */ + ## /* If the next character is special then we dont' need to terminate + ## the shift sequence. If the next character is not a BASE64 character + ## or '-' then the shift sequence will be terminated implicitly and we + ## don't have to insert a '-'. */ - if (bitsleft == 0): - if (i + 1 < size): - ch2 = s[i+1] + if bitsleft == 0: + if i + 1 < size: + ch2 = s[i + 1] - if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)): + if SPECIAL(ch2, encodeSetO, encodeWhiteSpace): pass - elif (B64CHAR(ch2) or ch2 == '-'): - out.append(b'-') + elif B64CHAR(ch2) or ch2 == "-": + out.append(b"-") inShift = False else: inShift = False else: - out.append(b'-') + out.append(b"-") inShift = False i += 1 - - if (bitsleft): - out.append(B64(charsleft << (6-bitsleft) ) ) - out.append(b'-') + + if bitsleft: + out.append(B64(charsleft << (6 - bitsleft))) + out.append(b"-") return out -unicode_empty = '' -def unicodeescape_string(s, size, quotes): +unicode_empty = "" + +def unicodeescape_string(s, size, quotes): p = [] - if (quotes) : - if (s.find('\'') != -1 and s.find('"') == -1): + if quotes: + if s.find("'") != -1 and s.find('"') == -1: p.append(b'"') else: - p.append(b'\'') + p.append(b"'") pos = 0 - while (pos < size): + while pos < size: ch = s[pos] - #/* Escape quotes */ - if (quotes and (ch == p[1] or ch == '\\')): - p.append(b'\\%c' % ord(ch)) + # /* Escape quotes */ + if quotes and (ch == p[1] or ch == "\\"): + p.append(b"\\%c" % ord(ch)) pos += 1 continue -#ifdef Py_UNICODE_WIDE - #/* Map 21-bit characters to '\U00xxxxxx' */ - elif (ord(ch) >= 0x10000): - p.append(b'\\U%08x' % ord(ch)) + # ifdef Py_UNICODE_WIDE + # /* Map 21-bit characters to '\U00xxxxxx' */ + elif ord(ch) >= 0x10000: + p.append(b"\\U%08x" % ord(ch)) pos += 1 - continue -#endif - #/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ - elif (ord(ch) >= 0xD800 and ord(ch) < 0xDC00): + continue + # endif + # /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ + elif ord(ch) >= 0xD800 and ord(ch) < 0xDC00: pos += 1 ch2 = s[pos] - - if (ord(ch2) >= 0xDC00 and ord(ch2) <= 0xDFFF): + + if ord(ch2) >= 0xDC00 and ord(ch2) <= 0xDFFF: ucs = (((ord(ch) & 0x03FF) << 10) | (ord(ch2) & 0x03FF)) + 0x00010000 - p.append(b'\\U%08x' % ucs) + p.append(b"\\U%08x" % ucs) pos += 1 continue - - #/* Fall through: isolated surrogates are copied as-is */ + + # /* Fall through: isolated surrogates are copied as-is */ pos -= 1 - - #/* Map 16-bit characters to '\uxxxx' */ - if (ord(ch) >= 256): - p.append(b'\\u%04x' % ord(ch)) - - #/* Map special whitespace to '\t', \n', '\r' */ - elif (ch == '\t'): - p.append(b'\\t') - - elif (ch == '\n'): - p.append(b'\\n') - - elif (ch == '\r'): - p.append(b'\\r') - - elif (ch == '\\'): - p.append(b'\\\\') - - #/* Map non-printable US ASCII to '\xhh' */ - elif (ch < ' ' or ch >= chr(0x7F)) : - p.append(b'\\x%02x' % ord(ch)) - #/* Copy everything else as-is */ + + # /* Map 16-bit characters to '\uxxxx' */ + if ord(ch) >= 256: + p.append(b"\\u%04x" % ord(ch)) + + # /* Map special whitespace to '\t', \n', '\r' */ + elif ch == "\t": + p.append(b"\\t") + + elif ch == "\n": + p.append(b"\\n") + + elif ch == "\r": + p.append(b"\\r") + + elif ch == "\\": + p.append(b"\\\\") + + # /* Map non-printable US ASCII to '\xhh' */ + elif ch < " " or ch >= chr(0x7F): + p.append(b"\\x%02x" % ord(ch)) + # /* Copy everything else as-is */ else: p.append(bytes([ord(ch)])) pos += 1 - if (quotes): + if quotes: p.append(p[0]) return p -def PyUnicode_DecodeASCII(s, size, errors): -# /* ASCII is equivalent to the first 128 ordinals in Unicode. */ - if (size == 1 and ord(s) < 128) : +def PyUnicode_DecodeASCII(s, size, errors): + # /* ASCII is equivalent to the first 128 ordinals in Unicode. */ + if size == 1 and ord(s) < 128: return [chr(ord(s))] - if (size == 0): - return [''] #unicode('') + if size == 0: + return [""] # unicode('') p = [] pos = 0 while pos < len(s): @@ -661,54 +911,50 @@ def PyUnicode_DecodeASCII(s, size, errors): p += chr(c) pos += 1 else: - res = unicode_call_errorhandler( - errors, "ascii", "ordinal not in range(128)", - s, pos, pos+1) + errors, "ascii", "ordinal not in range(128)", s, pos, pos + 1 + ) p += res[0] pos = res[1] return p -def PyUnicode_EncodeASCII(p, size, errors): +def PyUnicode_EncodeASCII(p, size, errors): return unicode_encode_ucs1(p, size, errors, 128) -def PyUnicode_AsASCIIString(unistr): +def PyUnicode_AsASCIIString(unistr): if not type(unistr) == str: raise TypeError - return PyUnicode_EncodeASCII(str(unistr), - len(str), - None) + return PyUnicode_EncodeASCII(str(unistr), len(str), None) -def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=True): - bo = 0 #/* assume native ordering by default */ +def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder="native", final=True): + bo = 0 # /* assume native ordering by default */ consumed = 0 errmsg = "" - if sys.byteorder == 'little': + if sys.byteorder == "little": ihi = 1 ilo = 0 else: ihi = 0 ilo = 1 - - #/* Unpack UTF-16 encoded data */ + # /* Unpack UTF-16 encoded data */ -## /* Check for BOM marks (U+FEFF) in the input and adjust current -## byte order setting accordingly. In native mode, the leading BOM -## mark is skipped, in all other modes, it is copied to the output -## stream as-is (giving a ZWNBSP character). */ + ## /* Check for BOM marks (U+FEFF) in the input and adjust current + ## byte order setting accordingly. In native mode, the leading BOM + ## mark is skipped, in all other modes, it is copied to the output + ## stream as-is (giving a ZWNBSP character). */ q = 0 p = [] - if byteorder == 'native': - if (size >= 2): + if byteorder == "native": + if size >= 2: bom = (s[ihi] << 8) | s[ilo] -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - if sys.byteorder == 'little': - if (bom == 0xFEFF): + # ifdef BYTEORDER_IS_LITTLE_ENDIAN + if sys.byteorder == "little": + if bom == 0xFEFF: q += 2 bo = -1 elif bom == 0xFFFE: @@ -721,238 +967,367 @@ def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=Tru elif bom == 0xFFFE: q += 2 bo = -1 - elif byteorder == 'little': + elif byteorder == "little": bo = -1 else: bo = 1 - - if (size == 0): - return [''], 0, bo - - if (bo == -1): - #/* force LE */ + + if size == 0: + return [""], 0, bo + + if bo == -1: + # /* force LE */ ihi = 1 ilo = 0 - elif (bo == 1): - #/* force BE */ + elif bo == 1: + # /* force BE */ ihi = 0 ilo = 1 - while (q < len(s)): - - #/* remaining bytes at the end? (size should be even) */ - if (len(s)-q<2): + while q < len(s): + # /* remaining bytes at the end? (size should be even) */ + if len(s) - q < 2: if not final: break errmsg = "truncated data" startinpos = q endinpos = len(s) - unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True) -# /* The remaining input chars are ignored if the callback -## chooses to skip the input */ - - ch = (s[q+ihi] << 8) | s[q+ilo] + unicode_call_errorhandler( + errors, "utf-16", errmsg, s, startinpos, endinpos, True + ) + # /* The remaining input chars are ignored if the callback + ## chooses to skip the input */ + + ch = (s[q + ihi] << 8) | s[q + ilo] q += 2 - - if (ch < 0xD800 or ch > 0xDFFF): + + if ch < 0xD800 or ch > 0xDFFF: p.append(chr(ch)) continue - - #/* UTF-16 code pair: */ - if (q >= len(s)): + + # /* UTF-16 code pair: */ + if q >= len(s): errmsg = "unexpected end of data" - startinpos = q-2 + startinpos = q - 2 endinpos = len(s) - unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True) + unicode_call_errorhandler( + errors, "utf-16", errmsg, s, startinpos, endinpos, True + ) - if (0xD800 <= ch and ch <= 0xDBFF): - ch2 = (s[q+ihi] << 8) | s[q+ilo] + if 0xD800 <= ch and ch <= 0xDBFF: + ch2 = (s[q + ihi] << 8) | s[q + ilo] q += 2 - if (0xDC00 <= ch2 and ch2 <= 0xDFFF): - #ifndef Py_UNICODE_WIDE + if 0xDC00 <= ch2 and ch2 <= 0xDFFF: + # ifndef Py_UNICODE_WIDE if sys.maxunicode < 65536: p += [chr(ch), chr(ch2)] else: - p.append(chr((((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000)) - #endif + p.append(chr((((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000)) + # endif continue else: errmsg = "illegal UTF-16 surrogate" - startinpos = q-4 - endinpos = startinpos+2 - unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True) - + startinpos = q - 4 + endinpos = startinpos + 2 + unicode_call_errorhandler( + errors, "utf-16", errmsg, s, startinpos, endinpos, True + ) + errmsg = "illegal encoding" - startinpos = q-2 - endinpos = startinpos+2 - unicode_call_errorhandler(errors, 'utf-16', errmsg, s, startinpos, endinpos, True) - + startinpos = q - 2 + endinpos = startinpos + 2 + unicode_call_errorhandler( + errors, "utf-16", errmsg, s, startinpos, endinpos, True + ) + return p, q, bo + # moved out of local scope, especially because it didn't # have any nested variables. + def STORECHAR(CH, byteorder): - hi = (CH >> 8) & 0xff - lo = CH & 0xff - if byteorder == 'little': + hi = (CH >> 8) & 0xFF + lo = CH & 0xFF + if byteorder == "little": return [lo, hi] else: return [hi, lo] -def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'): -# /* Offsets from p for storing byte pairs in the right order. */ +def PyUnicode_EncodeUTF16(s, size, errors, byteorder="little"): + # /* Offsets from p for storing byte pairs in the right order. */ - p = [] bom = sys.byteorder - if (byteorder == 'native'): - + if byteorder == "native": bom = sys.byteorder p += STORECHAR(0xFEFF, bom) - - if (size == 0): - return "" - if (byteorder == 'little' ): - bom = 'little' - elif (byteorder == 'big'): - bom = 'big' + if size == 0: + return "" + if byteorder == "little": + bom = "little" + elif byteorder == "big": + bom = "big" for c in s: ch = ord(c) ch2 = 0 - if (ch >= 0x10000) : - ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF) - ch = 0xD800 | ((ch-0x10000) >> 10) + if ch >= 0x10000: + ch2 = 0xDC00 | ((ch - 0x10000) & 0x3FF) + ch = 0xD800 | ((ch - 0x10000) >> 10) p += STORECHAR(ch, bom) - if (ch2): + if ch2: p += STORECHAR(ch2, bom) return p +def PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder="native", final=True): + bo = 0 # /* assume native ordering by default */ + consumed = 0 + errmsg = "" + + if sys.byteorder == "little": + iorder = [0, 1, 2, 3] + else: + iorder = [3, 2, 1, 0] + + q = 0 + if byteorder == "native": + if size >= 4: + bom = ( + (s[iorder[3]] << 24) + | (s[iorder[2]] << 16) + | (s[iorder[1]] << 8) + | s[iorder[0]] + ) + if bom == 0x0000FEFF: + q += 4 + bo = -1 + elif bom == 0xFFFE0000: + q += 4 + bo = 1 + if sys.byteorder == "little": + iorder = [3, 2, 1, 0] + else: + iorder = [0, 1, 2, 3] + else: + bo = 0 + elif byteorder == "little": + bo = -1 + iorder = [0, 1, 2, 3] + else: + bo = 1 + iorder = [3, 2, 1, 0] + + if size == 0: + return ("", 0, bo) + + if ((size - q) & 3) != 0 and not final: + errmsg = "truncated data" + startinpos = size & ~3 + endinpos = size + unicode_call_errorhandler(errors, "utf-32", errmsg, s, startinpos, endinpos) + + p = [] + while q < size - 3: + ch = ( + (s[q + iorder[3]] << 24) + | (s[q + iorder[2]] << 16) + | (s[q + iorder[1]] << 8) + | s[q + iorder[0]] + ) + q += 4 + + if ch > 0x10FFFF: + errmsg = "codepoint out of range" + startinpos = q - 4 + endinpos = q + res = unicode_call_errorhandler( + errors, "utf-32", errmsg, s, startinpos, endinpos + ) + p += res[0] + q = res[1] + else: + p.append(chr(ch)) + + consumed = q + + if not final and consumed < size: + consumed = size & ~3 + + return (p, consumed, bo) + + +def PyUnicode_EncodeUTF32(s, size, errors, byteorder="little"): + def STORECHAR32(ch, byteorder): + if byteorder == "little": + return [ch & 0xFF, (ch >> 8) & 0xFF, (ch >> 16) & 0xFF, (ch >> 24) & 0xFF] + else: + return [(ch >> 24) & 0xFF, (ch >> 16) & 0xFF, (ch >> 8) & 0xFF, ch & 0xFF] + + p = [] + if byteorder == "native": + bom = sys.byteorder + p += STORECHAR32(0x0000FEFF, bom) + else: + bom = byteorder + + if size == 0: + return p + + for c in s: + ch = ord(c) + p += STORECHAR32(ch, bom) + + return p + + def PyUnicode_DecodeMBCS(s, size, errors): pass + def PyUnicode_EncodeMBCS(p, size, errors): pass -def unicode_call_errorhandler(errors, encoding, - reason, input, startinpos, endinpos, decode=True): - + +def unicode_call_errorhandler( + errors, encoding, reason, input, startinpos, endinpos, decode=True +): errorHandler = lookup_error(errors) if decode: - exceptionObject = UnicodeDecodeError(encoding, input, startinpos, endinpos, reason) + exceptionObject = UnicodeDecodeError( + encoding, input, startinpos, endinpos, reason + ) else: - exceptionObject = UnicodeEncodeError(encoding, input, startinpos, endinpos, reason) + exceptionObject = UnicodeEncodeError( + encoding, input, startinpos, endinpos, reason + ) res = errorHandler(exceptionObject) if isinstance(res, tuple) and isinstance(res[0], str) and isinstance(res[1], int): newpos = res[1] - if (newpos < 0): + if newpos < 0: newpos = len(input) + newpos if newpos < 0 or newpos > len(input): - raise IndexError( "position %d from error handler out of bounds" % newpos) + raise IndexError("position %d from error handler out of bounds" % newpos) return res[0], newpos else: - raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res)) + raise TypeError( + "encoding error handler must return (unicode, int) tuple, not %s" + % repr(res) + ) + + +# /* --- Latin-1 Codec ------------------------------------------------------ */ -#/* --- Latin-1 Codec ------------------------------------------------------ */ def PyUnicode_DecodeLatin1(s, size, errors): - #/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ -## if (size == 1): -## return [PyUnicode_FromUnicode(s, 1)] + # /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ + ## if (size == 1): + ## return [PyUnicode_FromUnicode(s, 1)] pos = 0 p = [] - while (pos < size): + while pos < size: p += chr(s[pos]) pos += 1 return p + def unicode_encode_ucs1(p, size, errors, limit): - if limit == 256: reason = "ordinal not in range(256)" encoding = "latin-1" else: reason = "ordinal not in range(128)" encoding = "ascii" - - if (size == 0): + + if size == 0: return [] res = bytearray() pos = 0 while pos < len(p): - #for ch in p: + # for ch in p: ch = p[pos] - + if ord(ch) < limit: res.append(ord(ch)) pos += 1 else: - #/* startpos for collecting unencodable chars */ - collstart = pos - collend = pos+1 + # /* startpos for collecting unencodable chars */ + collstart = pos + collend = pos + 1 while collend < len(p) and ord(p[collend]) >= limit: collend += 1 - x = unicode_call_errorhandler(errors, encoding, reason, p, collstart, collend, False) + x = unicode_call_errorhandler( + errors, encoding, reason, p, collstart, collend, False + ) res += x[0].encode() pos = x[1] - + return res + def PyUnicode_EncodeLatin1(p, size, errors): res = unicode_encode_ucs1(p, size, errors, 256) return res -hexdigits = [ord(hex(i)[-1]) for i in range(16)]+[ord(hex(i)[-1].upper()) for i in range(10, 16)] + +hexdigits = [ord(hex(i)[-1]) for i in range(16)] + [ + ord(hex(i)[-1].upper()) for i in range(10, 16) +] + def hex_number_end(s, pos, digits): target_end = pos + digits - while pos < target_end and pos < len(s) and s[pos] in hexdigits: + while pos < target_end and pos < len(s) and s[pos] in hexdigits: pos += 1 return pos + def hexescape(s, pos, digits, message, errors): ch = 0 p = [] number_end = hex_number_end(s, pos, digits) if number_end - pos != digits: - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2, number_end) + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 2, number_end + ) p.append(x[0]) pos = x[1] else: - ch = int(s[pos:pos+digits], 16) - #/* when we get here, ch is a 32-bit unicode character */ + ch = int(s[pos : pos + digits], 16) + # /* when we get here, ch is a 32-bit unicode character */ if ch <= sys.maxunicode: p.append(chr(ch)) pos += digits - elif (ch <= 0x10ffff): + elif ch <= 0x10FFFF: ch -= 0x10000 p.append(chr(0xD800 + (ch >> 10))) - p.append(chr(0xDC00 + (ch & 0x03FF))) + p.append(chr(0xDC00 + (ch & 0x03FF))) pos += digits else: message = "illegal Unicode character" - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2, - pos+digits) + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 2, pos + digits + ) p.append(x[0]) pos = x[1] res = p return res, pos + def PyUnicode_DecodeUnicodeEscape(s, size, errors, final): + if size == 0: + return "" - if (size == 0): - return '' - if isinstance(s, str): s = s.encode() @@ -960,129 +1335,154 @@ def PyUnicode_DecodeUnicodeEscape(s, size, errors, final): p = [] pos = 0 - while (pos < size): -## /* Non-escape characters are interpreted as Unicode ordinals */ - if (chr(s[pos]) != '\\') : + while pos < size: + ## /* Non-escape characters are interpreted as Unicode ordinals */ + if chr(s[pos]) != "\\": p.append(chr(s[pos])) pos += 1 continue -## /* \ - Escapes */ + ## /* \ - Escapes */ else: pos += 1 if pos >= len(s): errmessage = "\\ at end of string" - unicode_call_errorhandler(errors, "unicodeescape", errmessage, s, pos-1, size) + unicode_call_errorhandler( + errors, "unicodeescape", errmessage, s, pos - 1, size + ) ch = chr(s[pos]) pos += 1 - ## /* \x escapes */ - if ch == '\n': pass - elif ch == '\\': p += '\\' - elif ch == '\'': p += '\'' - elif ch == '\"': p += '\"' - elif ch == 'b' : p += '\b' - elif ch == 'f' : p += '\014' #/* FF */ - elif ch == 't' : p += '\t' - elif ch == 'n' : p += '\n' - elif ch == 'r' : p += '\r' - elif ch == 'v' : p += '\013' #break; /* VT */ - elif ch == 'a' : p += '\007' # break; /* BEL, not classic C */ - elif '0' <= ch <= '7': - x = ord(ch) - ord('0') + ## /* \x escapes */ + if ch == "\n": + pass + elif ch == "\\": + p += "\\" + elif ch == "'": + p += "'" + elif ch == '"': + p += '"' + elif ch == "b": + p += "\b" + elif ch == "f": + p += "\014" # /* FF */ + elif ch == "t": + p += "\t" + elif ch == "n": + p += "\n" + elif ch == "r": + p += "\r" + elif ch == "v": + p += "\013" # break; /* VT */ + elif ch == "a": + p += "\007" # break; /* BEL, not classic C */ + elif "0" <= ch <= "7": + x = ord(ch) - ord("0") if pos < size: ch = chr(s[pos]) - if '0' <= ch <= '7': + if "0" <= ch <= "7": pos += 1 - x = (x<<3) + ord(ch) - ord('0') + x = (x << 3) + ord(ch) - ord("0") if pos < size: ch = chr(s[pos]) - if '0' <= ch <= '7': + if "0" <= ch <= "7": pos += 1 - x = (x<<3) + ord(ch) - ord('0') + x = (x << 3) + ord(ch) - ord("0") p.append(chr(x)) - ## /* hex escapes */ - ## /* \xXX */ - elif ch == 'x': + ## /* hex escapes */ + ## /* \xXX */ + elif ch == "x": digits = 2 message = "truncated \\xXX escape" x = hexescape(s, pos, digits, message, errors) p += x[0] pos = x[1] - - # /* \uXXXX */ - elif ch == 'u': + + # /* \uXXXX */ + elif ch == "u": digits = 4 message = "truncated \\uXXXX escape" x = hexescape(s, pos, digits, message, errors) p += x[0] pos = x[1] - - # /* \UXXXXXXXX */ - elif ch == 'U': + + # /* \UXXXXXXXX */ + elif ch == "U": digits = 8 message = "truncated \\UXXXXXXXX escape" x = hexescape(s, pos, digits, message, errors) p += x[0] pos = x[1] -## /* \N{name} */ - elif ch == 'N': + ## /* \N{name} */ + elif ch == "N": message = "malformed \\N character escape" # pos += 1 look = pos try: import unicodedata except ImportError: - message = "\\N escapes not supported (can't load unicodedata module)" - unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, size) - if look < size and chr(s[look]) == '{': - #/* look for the closing brace */ - while (look < size and chr(s[look]) != '}'): + message = ( + "\\N escapes not supported (can't load unicodedata module)" + ) + unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 1, size + ) + if look < size and chr(s[look]) == "{": + # /* look for the closing brace */ + while look < size and chr(s[look]) != "}": look += 1 - if (look > pos+1 and look < size and chr(s[look]) == '}'): - #/* found a name. look it up in the unicode database */ + if look > pos + 1 and look < size and chr(s[look]) == "}": + # /* found a name. look it up in the unicode database */ message = "unknown Unicode character name" - st = s[pos+1:look] + st = s[pos + 1 : look] try: chr_codec = unicodedata.lookup("%s" % st) except LookupError as e: - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 1, look + 1 + ) else: - x = chr_codec, look + 1 + x = chr_codec, look + 1 p.append(x[0]) pos = x[1] - else: - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) - else: - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) + else: + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 1, look + 1 + ) + else: + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 1, look + 1 + ) else: if not found_invalid_escape: found_invalid_escape = True - warnings.warn("invalid escape sequence '\\%c'" % ch, DeprecationWarning, 2) - p.append('\\') + warnings.warn( + "invalid escape sequence '\\%c'" % ch, DeprecationWarning, 2 + ) + p.append("\\") p.append(ch) return p + def PyUnicode_EncodeRawUnicodeEscape(s, size): - - if (size == 0): - return b'' + if size == 0: + return b"" p = bytearray() for ch in s: -# /* Map 32-bit characters to '\Uxxxxxxxx' */ - if (ord(ch) >= 0x10000): - p += b'\\U%08x' % ord(ch) - elif (ord(ch) >= 256) : -# /* Map 16-bit characters to '\uxxxx' */ - p += b'\\u%04x' % (ord(ch)) -# /* Copy everything else as-is */ + # /* Map 32-bit characters to '\Uxxxxxxxx' */ + if ord(ch) >= 0x10000: + p += b"\\U%08x" % ord(ch) + elif ord(ch) >= 256: + # /* Map 16-bit characters to '\uxxxx' */ + p += b"\\u%04x" % (ord(ch)) + # /* Copy everything else as-is */ else: p.append(ord(ch)) - - #p += '\0' + + # p += '\0' return p -def charmapencode_output(c, mapping): +def charmapencode_output(c, mapping): rep = mapping[c] if isinstance(rep, int) or isinstance(rep, int): if rep < 256: @@ -1098,49 +1498,56 @@ def charmapencode_output(c, mapping): else: raise TypeError("character mapping must return integer, None or str") -def PyUnicode_EncodeCharmap(p, size, mapping='latin-1', errors='strict'): -## /* the following variable is used for caching string comparisons -## * -1=not initialized, 0=unknown, 1=strict, 2=replace, -## * 3=ignore, 4=xmlcharrefreplace */ +def PyUnicode_EncodeCharmap(p, size, mapping="latin-1", errors="strict"): + ## /* the following variable is used for caching string comparisons + ## * -1=not initialized, 0=unknown, 1=strict, 2=replace, + ## * 3=ignore, 4=xmlcharrefreplace */ -# /* Default to Latin-1 */ - if mapping == 'latin-1': + # /* Default to Latin-1 */ + if mapping == "latin-1": return PyUnicode_EncodeLatin1(p, size, errors) - if (size == 0): - return b'' + if size == 0: + return b"" inpos = 0 res = [] - while (inpos", p, inpos, inpos+1, False) + x = unicode_call_errorhandler( + errors, + "charmap", + "character maps to ", + p, + inpos, + inpos + 1, + False, + ) try: for y in x[0]: res += charmapencode_output(ord(y), mapping) except KeyError: - raise UnicodeEncodeError("charmap", p, inpos, inpos+1, - "character maps to ") + raise UnicodeEncodeError( + "charmap", p, inpos, inpos + 1, "character maps to " + ) inpos += 1 return res -def PyUnicode_DecodeCharmap(s, size, mapping, errors): -## /* Default to Latin-1 */ - if (mapping == None): +def PyUnicode_DecodeCharmap(s, size, mapping, errors): + ## /* Default to Latin-1 */ + if mapping == None: return PyUnicode_DecodeLatin1(s, size, errors) - if (size == 0): - return '' + if size == 0: + return "" p = [] inpos = 0 - while (inpos< len(s)): - - #/* Get mapping (char ordinal -> integer, Unicode char or None) */ + while inpos < len(s): + # /* Get mapping (char ordinal -> integer, Unicode char or None) */ ch = s[inpos] try: x = mapping[ch] @@ -1156,84 +1563,94 @@ def PyUnicode_DecodeCharmap(s, size, mapping, errors): else: raise TypeError except KeyError: - x = unicode_call_errorhandler(errors, "charmap", - "character maps to ", s, inpos, inpos+1) + x = unicode_call_errorhandler( + errors, "charmap", "character maps to ", s, inpos, inpos + 1 + ) p += x[0] inpos += 1 return p -def PyUnicode_DecodeRawUnicodeEscape(s, size, errors, final): - if (size == 0): - return '' +def PyUnicode_DecodeRawUnicodeEscape(s, size, errors, final): + if size == 0: + return "" if isinstance(s, str): s = s.encode() pos = 0 p = [] - while (pos < len(s)): + while pos < len(s): ch = chr(s[pos]) - #/* Non-escape characters are interpreted as Unicode ordinals */ - if (ch != '\\'): + # /* Non-escape characters are interpreted as Unicode ordinals */ + if ch != "\\": p.append(ch) pos += 1 - continue + continue startinpos = pos -## /* \u-escapes are only interpreted iff the number of leading -## backslashes is odd */ + ## /* \u-escapes are only interpreted iff the number of leading + ## backslashes is odd */ bs = pos while pos < size: - if (s[pos] != ord('\\')): + if s[pos] != ord("\\"): break p.append(chr(s[pos])) pos += 1 - - if (pos >= size): + + if pos >= size: break - if (((pos - bs) & 1) == 0 or - (s[pos] != ord('u') and s[pos] != ord('U'))) : + if ((pos - bs) & 1) == 0 or (s[pos] != ord("u") and s[pos] != ord("U")): p.append(chr(s[pos])) pos += 1 continue - + p.pop(-1) - if s[pos] == ord('u'): - count = 4 - else: + if s[pos] == ord("u"): + count = 4 + else: count = 8 pos += 1 - #/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ + # /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ number_end = hex_number_end(s, pos, count) if number_end - pos != count: res = unicode_call_errorhandler( - errors, "rawunicodeescape", "truncated \\uXXXX", - s, pos-2, number_end) + errors, "rawunicodeescape", "truncated \\uXXXX", s, pos - 2, number_end + ) p.append(res[0]) pos = res[1] else: - x = int(s[pos:pos+count], 16) - #ifndef Py_UNICODE_WIDE - if sys.maxunicode > 0xffff: - if (x > sys.maxunicode): + x = int(s[pos : pos + count], 16) + # ifndef Py_UNICODE_WIDE + if sys.maxunicode > 0xFFFF: + if x > sys.maxunicode: res = unicode_call_errorhandler( - errors, "rawunicodeescape", "\\Uxxxxxxxx out of range", - s, pos-2, pos+count) + errors, + "rawunicodeescape", + "\\Uxxxxxxxx out of range", + s, + pos - 2, + pos + count, + ) pos = res[1] p.append(res[0]) else: p.append(chr(x)) pos += count else: - if (x > 0x10000): + if x > 0x10000: res = unicode_call_errorhandler( - errors, "rawunicodeescape", "\\Uxxxxxxxx out of range", - s, pos-2, pos+count) + errors, + "rawunicodeescape", + "\\Uxxxxxxxx out of range", + s, + pos - 2, + pos + count, + ) pos = res[1] p.append(res[0]) - #endif + # endif else: p.append(chr(x)) pos += count diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 6f402513fd..1dfb73779b 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -1201,8 +1201,6 @@ def test___contains__(self): self.checkequal(False, '', '__contains__', 'asdf') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_subscript(self): self.checkequal('a', 'abc', '__getitem__', 0) self.checkequal('c', 'abc', '__getitem__', -1) diff --git a/Lib/test/test_array.py b/Lib/test/test_array.py index be89bec522..643026444e 100644 --- a/Lib/test/test_array.py +++ b/Lib/test/test_array.py @@ -357,8 +357,6 @@ def test_reverse_iterator(self): self.assertEqual(list(a), list(self.example)) self.assertEqual(list(reversed(a)), list(iter(a))[::-1]) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_reverse_iterator_picking(self): orig = array.array(self.typecode, self.example) data = list(orig) diff --git a/vm/src/stdlib/codecs.rs b/vm/src/stdlib/codecs.rs index c0a091bcf8..2299cd2b7a 100644 --- a/vm/src/stdlib/codecs.rs +++ b/vm/src/stdlib/codecs.rs @@ -305,5 +305,32 @@ mod _codecs { fn utf_16_ex_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { delegate_pycodecs!(utf_16_ex_decode, args, vm) } - // TODO: utf-32 functions + #[pyfunction] + fn utf_32_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { + delegate_pycodecs!(utf_32_encode, args, vm) + } + #[pyfunction] + fn utf_32_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { + delegate_pycodecs!(utf_32_decode, args, vm) + } + #[pyfunction] + fn utf_32_le_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { + delegate_pycodecs!(utf_32_le_encode, args, vm) + } + #[pyfunction] + fn utf_32_le_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { + delegate_pycodecs!(utf_32_le_decode, args, vm) + } + #[pyfunction] + fn utf_32_be_encode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { + delegate_pycodecs!(utf_32_be_encode, args, vm) + } + #[pyfunction] + fn utf_32_be_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { + delegate_pycodecs!(utf_32_be_decode, args, vm) + } + #[pyfunction] + fn utf_32_ex_decode(args: FuncArgs, vm: &VirtualMachine) -> PyResult { + delegate_pycodecs!(utf_32_ex_decode, args, vm) + } } From 1aa3fa3e1b56376aa18d3f06969cff4fd3a83502 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Sun, 22 Jun 2025 22:56:02 +0900 Subject: [PATCH 2/3] utf32? --- Lib/test/test_array.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_array.py b/Lib/test/test_array.py index 643026444e..0ba4e022dc 100644 --- a/Lib/test/test_array.py +++ b/Lib/test/test_array.py @@ -176,15 +176,13 @@ def test_numbers(self): self.assertEqual(a, b, msg="{0!r} != {1!r}; testcase={2!r}".format(a, b, testcase)) - # TODO: RUSTPYTHON - requires UTF-32 encoding support in codecs and proper array reconstructor implementation - @unittest.expectedFailure def test_unicode(self): teststr = "Bonne Journ\xe9e \U0002030a\U00020347" testcases = ( (UTF16_LE, "UTF-16-LE"), (UTF16_BE, "UTF-16-BE"), - (UTF32_LE, "UTF-32-LE"), # TODO: RUSTPYTHON - (UTF32_BE, "UTF-32-BE") # TODO: RUSTPYTHON + (UTF32_LE, "UTF-32-LE"), + (UTF32_BE, "UTF-32-BE") ) for testcase in testcases: mformat_code, encoding = testcase From 50ef0d062b2b6bea8c8c49a6b7ba3d8d13b38775 Mon Sep 17 00:00:00 2001 From: Jeong YunWon Date: Tue, 24 Jun 2025 10:35:39 +0900 Subject: [PATCH 3/3] codec --- Lib/_pycodecs.py | 70 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/Lib/_pycodecs.py b/Lib/_pycodecs.py index 48483f5772..857b53a503 100644 --- a/Lib/_pycodecs.py +++ b/Lib/_pycodecs.py @@ -1075,16 +1075,46 @@ def PyUnicode_EncodeUTF16(s, size, errors, byteorder="little"): elif byteorder == "big": bom = "big" - for c in s: - ch = ord(c) - ch2 = 0 - if ch >= 0x10000: + i = 0 + while i < len(s): + ch = ord(s[i]) + + # Check for surrogates - each surrogate is invalid in UTF-16 + # regardless of whether it could form a pair + if 0xD800 <= ch <= 0xDFFF: + # Surrogate - handle with error handler + startinpos = i + endinpos = i + 1 + res = unicode_call_errorhandler( + errors, "utf-16-le" if bom == "little" else "utf-16-be", + "surrogates not allowed", s, startinpos, endinpos + ) + # res[0] is the replacement string, res[1] is the new position + for replacement_char in res[0]: + rch = ord(replacement_char) + if rch >= 0x10000: + # Encode as surrogate pair + rch2 = 0xDC00 | ((rch - 0x10000) & 0x3FF) + rch = 0xD800 | ((rch - 0x10000) >> 10) + p += STORECHAR(rch, bom) + p += STORECHAR(rch2, bom) + elif 0xD800 <= rch <= 0xDFFF: + # Don't encode surrogates in the replacement + pass + else: + p += STORECHAR(rch, bom) + i = res[1] + elif ch >= 0x10000: + # Regular character above BMP - encode as surrogate pair ch2 = 0xDC00 | ((ch - 0x10000) & 0x3FF) ch = 0xD800 | ((ch - 0x10000) >> 10) - - p += STORECHAR(ch, bom) - if ch2: + p += STORECHAR(ch, bom) p += STORECHAR(ch2, bom) + i += 1 + else: + # Regular BMP character + p += STORECHAR(ch, bom) + i += 1 return p @@ -1183,9 +1213,29 @@ def STORECHAR32(ch, byteorder): if size == 0: return p - for c in s: - ch = ord(c) - p += STORECHAR32(ch, bom) + i = 0 + while i < len(s): + ch = ord(s[i]) + + # Check for surrogates - they are not valid in UTF-32 + if 0xD800 <= ch <= 0xDFFF: + # Surrogate - handle with error handler + startinpos = i + endinpos = i + 1 + res = unicode_call_errorhandler( + errors, "utf-32-le" if bom == "little" else "utf-32-be", + "surrogates not allowed", s, startinpos, endinpos, False + ) + # res[0] is the replacement string, res[1] is the new position + for replacement_char in res[0]: + rch = ord(replacement_char) + # Don't encode surrogates in the replacement + if not (0xD800 <= rch <= 0xDFFF): + p += STORECHAR32(rch, bom) + i = res[1] + else: + p += STORECHAR32(ch, bom) + i += 1 return p