Skip to content

Update test_unicodedata from CPython 3.11.2 #4678

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 10, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 128 additions & 33 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
""" Test script for the unicodedata module.
""" Tests for the unicodedata module.

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.

"""

import hashlib
from http.client import HTTPException
import sys
import unicodedata
import unittest
import hashlib
from test.support import script_helper

encoding = 'utf-8'
errors = 'surrogatepass'
from test.support import (open_urlresource, requires_resource, script_helper,
cpython_only, check_disallow_instantiation,
ResourceDenied)


### Run tests

class UnicodeMethodsTest(unittest.TestCase):

# update this, if the database changes
expectedchecksum = '9129d6f2bdf008a81c2476e5b5127014a62130c1'
expectedchecksum = '4739770dd4d0e5f1b1677accfc3552ed3c8ef326'

# TODO: RUSTPYTHON
@unittest.expectedFailure
@requires_resource('cpu')
def test_method_checksum(self):
h = hashlib.sha1()
for i in range(0x10000):
for i in range(sys.maxunicode + 1):
char = chr(i)
data = [
# Predicates (single char)
Expand Down Expand Up @@ -63,33 +63,26 @@ def test_method_checksum(self):
(char + 'ABC').title(),

]
h.update(''.join(data).encode(encoding, errors))
h.update(''.join(data).encode('utf-8', 'surrogatepass'))
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)

class UnicodeDatabaseTest(unittest.TestCase):

def setUp(self):
# In case unicodedata is not available, this will raise an ImportError,
# but the other test cases will still be run
import unicodedata
self.db = unicodedata

def tearDown(self):
del self.db
db = unicodedata

class UnicodeFunctionsTest(UnicodeDatabaseTest):

# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = 'c44a49ca7c5cb6441640fe174ede604b45028652'
expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370'
# TODO: RUSTPYTHON
@unittest.expectedFailure
@requires_resource('cpu')
def test_function_checksum(self):
data = []
h = hashlib.sha1()

for i in range(0x10000):
for i in range(sys.maxunicode + 1):
char = chr(i)
data = [
# Properties
Expand All @@ -106,6 +99,15 @@ def test_function_checksum(self):
result = h.hexdigest()
self.assertEqual(result, self.expectedchecksum)

# TODO: RUSTPYTHON
@unittest.expectedFailure
@requires_resource('cpu')
def test_name_inverse_lookup(self):
for i in range(sys.maxunicode + 1):
char = chr(i)
if looked_name := self.db.name(char, None):
self.assertEqual(self.db.lookup(looked_name), char)

# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_digit(self):
Expand Down Expand Up @@ -201,15 +203,8 @@ def test_combining(self):
self.assertRaises(TypeError, self.db.combining)
self.assertRaises(TypeError, self.db.combining, 'xx')

def test_normalize(self):
self.assertRaises(TypeError, self.db.normalize)
self.assertRaises(ValueError, self.db.normalize, 'unknown', 'xx')
self.assertEqual(self.db.normalize('NFKC', ''), '')
# The rest can be found in test_normalization.py
# which requires an external file.

def test_pr29(self):
# http://www.unicode.org/review/pr-29.html
# https://www.unicode.org/review/pr-29.html
# See issues #1054943 and #10254.
composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
'Li\u030dt-s\u1e73\u0301',
Expand Down Expand Up @@ -240,9 +235,6 @@ def test_issue29456(self):
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

# For tests of unicodedata.is_normalized / self.db.is_normalized ,
# see test_normalization.py .

def test_east_asian_width(self):
eaw = self.db.east_asian_width
self.assertRaises(TypeError, eaw, b'a')
Expand All @@ -265,6 +257,11 @@ def test_east_asian_width_9_0_changes(self):

class UnicodeMiscTest(UnicodeDatabaseTest):

@cpython_only
def test_disallow_instantiation(self):
# Ensure that the type disallows instantiation (bpo-43916)
check_disallow_instantiation(self, unicodedata.UCD)

# TODO: RUSTPYTHON
@unittest.expectedFailure
def test_failed_import_during_compiling(self):
Expand Down Expand Up @@ -363,5 +360,103 @@ def test_linebreak_7643(self):
self.assertEqual(len(lines), 1,
r"\u%.4x should not be a linebreak" % i)

class NormalizationTest(unittest.TestCase):
    """Checks for unicodedata.normalize() and unicodedata.is_normalized().

    test_normalization() downloads NormalizationTest.txt for the
    interpreter's unidata_version and hands it to
    run_normalization_tests(); the other tests need no external data.
    """

    @staticmethod
    def check_version(testfile):
        """Return whether the first line of *testfile* mentions unidata_version."""
        first_line = testfile.readline()
        return unicodedata.unidata_version in first_line

    @staticmethod
    def unistr(data):
        """Convert space-separated hexadecimal code points into a string."""
        return "".join(chr(int(field, 16)) for field in data.split(" "))

    @requires_resource('network')
    def test_normalization(self):
        filename = "NormalizationTest.txt"
        url = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{filename}"

        # Open the resource up front so that download problems are turned
        # into skips before any checking starts.
        try:
            testdata = open_urlresource(url, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {url} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {url}: {exc}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check each data line of *testdata*, then all remaining code points.

        *testdata* is iterated line by line.  Anything after a '#' is
        ignored, lines starting with "@Part" select the current part, and
        every other non-empty line must carry ';'-separated fields c1..c5
        followed by one trailing field that is discarded.
        """
        normalize = unicodedata.normalize
        is_normalized = unicodedata.is_normalized

        def same_under(form, expected, *sources):
            # True when normalizing every source to *form* gives *expected*;
            # stops at the first mismatch.
            return all(expected == normalize(form, text) for text in sources)

        current_part = None
        part1_sources = set()

        for raw_line in testdata:
            # split() returns the whole line when there is no '#'.
            line = raw_line.split('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                current_part = line.split()[0]
                continue
            c1, c2, c3, c4, c5 = map(self.unistr, line.split(';')[:-1])

            # Canonical forms
            self.assertTrue(same_under("NFC", c2, c1, c2, c3), line)
            self.assertTrue(same_under("NFC", c4, c4, c5), line)
            self.assertTrue(same_under("NFD", c3, c1, c2, c3), line)
            self.assertTrue(same_under("NFD", c5, c4, c5), line)
            # Compatibility forms
            self.assertTrue(same_under("NFKC", c4, c1, c2, c3, c4, c5), line)
            self.assertTrue(same_under("NFKD", c5, c1, c2, c3, c4, c5), line)

            # Columns that are already in a given form must be reported so.
            already_normalized = (
                ("NFC", c2), ("NFC", c4),
                ("NFD", c3), ("NFD", c5),
                ("NFKC", c4),
                ("NFKD", c5),
            )
            for form, text in already_normalized:
                self.assertTrue(is_normalized(form, text))

            # Remember the sources listed in part 1
            if current_part == "@Part1":
                part1_sources.add(c1)

        # Every character not recorded from part 1 must be left unchanged
        # by all four forms.
        all_forms = ("NFC", "NFD", "NFKC", "NFKD")
        for codepoint in range(sys.maxunicode + 1):
            ch = chr(codepoint)
            if ch not in part1_sources:
                self.assertTrue(
                    all(ch == normalize(form, ch) for form in all_forms),
                    codepoint)

    def test_edge_cases(self):
        with self.assertRaises(TypeError):
            unicodedata.normalize()
        with self.assertRaises(ValueError):
            unicodedata.normalize('unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        # Check for bug 834676: the call only has to complete without
        # raising, its result is not inspected.
        unicodedata.normalize('NFC', '\ud55c\uae00')


# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()