From 13286d116667d6ef3c1bc9a4dd7cfdeeb710b5ae Mon Sep 17 00:00:00 2001 From: Masayuki Moriyama Date: Fri, 3 Nov 2023 21:39:48 +0900 Subject: [PATCH 1/3] gh-101180: Fix a bug where iso2022_jp_3 and iso2022_jp_2004 codecs read out of bounds iso2022_jp_3 and iso2022_jp_2004 codecs read out of bounds when encoding a Unicode combining character sequence. This bug causes the following error: $ python3 -c "print('\u304b\u309a'.encode('iso2022_jp_2004'))" Traceback (most recent call last): File "<string>", line 1, in <module> UnicodeEncodeError: 'iso2022_jp_2004' codec can't encode character '\u309a' in position 1: illegal multibyte sequence This commit fixes the out-of-bounds read. --- Modules/cjkcodecs/_codecs_iso2022.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Modules/cjkcodecs/_codecs_iso2022.c b/Modules/cjkcodecs/_codecs_iso2022.c index 86bb73b982a551..e8835ad0909633 100644 --- a/Modules/cjkcodecs/_codecs_iso2022.c +++ b/Modules/cjkcodecs/_codecs_iso2022.c @@ -207,8 +207,9 @@ ENCODER(iso2022) encoded = MAP_UNMAPPABLE; for (dsg = CONFIG_DESIGNATIONS; dsg->mark; dsg++) { + Py_UCS4 buf[2] = {c, 0}; Py_ssize_t length = 1; - encoded = dsg->encoder(codec, &c, &length); + encoded = dsg->encoder(codec, buf, &length); if (encoded == MAP_MULTIPLE_AVAIL) { /* this implementation won't work for pair * of non-bmp characters. */ @@ -217,9 +218,11 @@ ENCODER(iso2022) return MBERR_TOOFEW; length = -1; } - else + else { + buf[1] = INCHAR2; length = 2; - encoded = dsg->encoder(codec, &c, &length); + } + encoded = dsg->encoder(codec, buf, &length); if (encoded != MAP_UNMAPPABLE) { insize = length; break; From ddff542d84d24c82fd23c4137e2c6362c617d22c Mon Sep 17 00:00:00 2001 From: Masayuki Moriyama Date: Fri, 3 Nov 2023 21:40:01 +0900 Subject: [PATCH 2/3] gh-101180: Add test for iso2022_jp_3 and iso2022_jp_2004 codecs iso2022_jp_3 and iso2022_jp_2004 are upward compatible with iso2022_jp. 
In addition to testing iso2022_jp, we will test the following characters added in iso2022_jp_3 and iso2022_jp_2004. JIS X 0213 Unicode ---------------- --------------------------------------------- Plane 1 \x2E\x23 U+3402 Basic Multilingual Plane Plane 1 \x2E\x22 U+2000B Supplementary Ideographic Plane Plane 1 \x24\x77 U+304B U+309A Combining Character Sequence Plane 2 \x21\x22 U+4E02 Basic Multilingual Plane Plane 2 \x7E\x76 U+2A6B2 Supplementary Ideographic Plane The difference between iso2022_jp_3 and iso2022_jp_2004 is the difference between JIS X 0213:2000 and JIS X 0213:2004. Tests the following character added from JIS X 0213:2000 to JIS X 0213:2004. JIS X 0213:2004 Unicode ---------------- ------- Plane 1 \x2E\x21 U+4FF1 Escape sequence to designate JIS X 0213 character set to G0: character set ESC sequence ----------------------- --------------------------- JIS X 0213:2000 Plane 1 ESC 2/4 2/8 4/15 ESC $ ( O JIS X 0213:2000 Plane 2 ESC 2/4 2/8 5/0 ESC $ ( P JIS X 0213:2004 Plane 1 ESC 2/4 2/8 5/1 ESC $ ( Q JIS X 0213:2004 Plane 2 ESC 2/4 2/8 5/0 ESC $ ( P --- Lib/test/test_codecencodings_iso2022.py | 46 +++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/Lib/test/test_codecencodings_iso2022.py b/Lib/test/test_codecencodings_iso2022.py index 00ea1c39dd6fb6..027dbecc6134df 100644 --- a/Lib/test/test_codecencodings_iso2022.py +++ b/Lib/test/test_codecencodings_iso2022.py @@ -24,6 +24,52 @@ class Test_ISO2022_JP2(multibytecodec_support.TestBase, unittest.TestCase): (b'ab\x1BNdef', 'replace', 'abdef'), ) +class Test_ISO2022_JP3(multibytecodec_support.TestBase, unittest.TestCase): + encoding = 'iso2022_jp_3' + tstring = multibytecodec_support.load_teststring('iso2022_jp') + codectests = COMMON_CODEC_TESTS + ( + (b'ab\x1BNdef', 'replace', 'ab\x1BNdef'), + (b'\x1B$(O\x2E\x23\x1B(B', 'strict', '\u3402' ), + (b'\x1B$(O\x2E\x22\x1B(B', 'strict', '\U0002000B' ), + (b'\x1B$(O\x24\x77\x1B(B', 'strict', '\u304B\u309A'), + (b'\x1B$(P\x21\x22\x1B(B', 
'strict', '\u4E02' ), + (b'\x1B$(P\x7E\x76\x1B(B', 'strict', '\U0002A6B2' ), + ('\u3402', 'strict', b'\x1B$(O\x2E\x23\x1B(B'), + ('\U0002000B', 'strict', b'\x1B$(O\x2E\x22\x1B(B'), + ('\u304B\u309A', 'strict', b'\x1B$(O\x24\x77\x1B(B'), + ('\u4E02', 'strict', b'\x1B$(P\x21\x22\x1B(B'), + ('\U0002A6B2', 'strict', b'\x1B$(P\x7E\x76\x1B(B'), + (b'ab\x1B$(O\x2E\x21\x1B(Bdef', 'replace', 'ab\uFFFDdef'), + ('ab\u4FF1def', 'replace', b'ab?def'), + ) + xmlcharnametest = ( + '\xAB\u211C\xBB = \u2329\u1234\u232A', + b'\x1B$(O\x29\x28\x1B(B&real;\x1B$(O\x29\x32\x1B(B = &lang;&#4660;&rang;' + ) + +class Test_ISO2022_JP2004(multibytecodec_support.TestBase, unittest.TestCase): + encoding = 'iso2022_jp_2004' + tstring = multibytecodec_support.load_teststring('iso2022_jp') + codectests = COMMON_CODEC_TESTS + ( + (b'ab\x1BNdef', 'replace', 'ab\x1BNdef'), + (b'\x1B$(Q\x2E\x23\x1B(B', 'strict', '\u3402' ), + (b'\x1B$(Q\x2E\x22\x1B(B', 'strict', '\U0002000B' ), + (b'\x1B$(Q\x24\x77\x1B(B', 'strict', '\u304B\u309A'), + (b'\x1B$(P\x21\x22\x1B(B', 'strict', '\u4E02' ), + (b'\x1B$(P\x7E\x76\x1B(B', 'strict', '\U0002A6B2' ), + ('\u3402', 'strict', b'\x1B$(Q\x2E\x23\x1B(B'), + ('\U0002000B', 'strict', b'\x1B$(Q\x2E\x22\x1B(B'), + ('\u304B\u309A', 'strict', b'\x1B$(Q\x24\x77\x1B(B'), + ('\u4E02', 'strict', b'\x1B$(P\x21\x22\x1B(B'), + ('\U0002A6B2', 'strict', b'\x1B$(P\x7E\x76\x1B(B'), + (b'ab\x1B$(Q\x2E\x21\x1B(Bdef', 'replace', 'ab\u4FF1def'), + ('ab\u4FF1def', 'replace', b'ab\x1B$(Q\x2E\x21\x1B(Bdef'), + ) + xmlcharnametest = ( + '\xAB\u211C\xBB = \u2329\u1234\u232A', + b'\x1B$(Q\x29\x28\x1B(B&real;\x1B$(Q\x29\x32\x1B(B = &lang;&#4660;&rang;' + ) + class Test_ISO2022_KR(multibytecodec_support.TestBase, unittest.TestCase): encoding = 'iso2022_kr' tstring = multibytecodec_support.load_teststring('iso2022_kr') From 59e3e6f8f2b5a09b3e9f958b701fba4b479e0481 Mon Sep 17 00:00:00 2001 From: Masayuki Moriyama Date: Fri, 3 Nov 2023 21:40:08 +0900 Subject: [PATCH 3/3] gh-101180: Add NEWS --- 
.../2023-10-27-19-38-33.gh-issue-102388.vd5YUZ.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2023-10-27-19-38-33.gh-issue-102388.vd5YUZ.rst diff --git a/Misc/NEWS.d/next/Core and Builtins/2023-10-27-19-38-33.gh-issue-102388.vd5YUZ.rst b/Misc/NEWS.d/next/Core and Builtins/2023-10-27-19-38-33.gh-issue-102388.vd5YUZ.rst new file mode 100644 index 00000000000000..268a3d310f2b49 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2023-10-27-19-38-33.gh-issue-102388.vd5YUZ.rst @@ -0,0 +1 @@ +Fix a bug where ``iso2022_jp_3`` and ``iso2022_jp_2004`` codecs read out of bounds