diff --git a/Doc/library/importlib.rst b/Doc/library/importlib.rst index ea5a77028683b3..20ecf3cec1a4ca 100644 --- a/Doc/library/importlib.rst +++ b/Doc/library/importlib.rst @@ -256,6 +256,9 @@ ABC hierarchy:: :func:`importlib.util.spec_from_loader` may be useful for implementing concrete ``MetaPathFinders``. + *Fullname* must be normalized to NFKC to match the normalization + done by the Python parser. + .. versionadded:: 3.4 .. method:: invalidate_caches() @@ -290,6 +293,9 @@ ABC hierarchy:: guess about what spec to return. :func:`importlib.util.spec_from_loader` may be useful for implementing concrete ``PathEntryFinders``. + *Fullname* must be normalized to NFKC to match the normalization + done by the Python parser. + .. versionadded:: 3.4 .. method:: invalidate_caches() diff --git a/Lib/importlib/_bootstrap.py b/Lib/importlib/_bootstrap.py index 499da1e04efea8..c5528e30f27bd4 100644 --- a/Lib/importlib/_bootstrap.py +++ b/Lib/importlib/_bootstrap.py @@ -26,6 +26,13 @@ def _object_name(obj): except AttributeError: return type(obj).__qualname__ +def _normalize(name): + """Normalize 'name' to NFKC form.""" + global _unicodedata_normalize + if _unicodedata_normalize is None: + from unicodedata import normalize as _unicodedata_normalize + return _unicodedata_normalize('NFKC', name) + # Bootstrap-related code ###################################################### # Modules injected manually by _setup() @@ -36,6 +43,8 @@ def _object_name(obj): # Import done by _install_external_importers() _bootstrap_external = None +# Import done lazily as needed by _normalize as unicodedata is not built-in. +_unicodedata_normalize = None def _wrap(new, old): """Simple substitute for functools.update_wrapper.""" @@ -1392,7 +1401,15 @@ def _gcd_import(name, package=None, level=0): the loader did not. 
""" + global _unicodedata _sanity_check(name, package, level) + + if not name.isascii(): + name = _normalize(name) + + if package is not None and not package.isascii(): + package = _normalize(package) + if level > 0: name = _resolve_name(name, package, level) return _find_and_load(name, _gcd_import) diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index 8bcd741c446bd2..9ce7e2f68cc528 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -1345,8 +1345,9 @@ def __init__(self, path, *loader_details): else: self.path = _path_abspath(path) self._path_mtime = -1 - self._path_cache = set() - self._relaxed_path_cache = set() + self._path_cache = {} + self._relaxed_path_cache = {} + self._cache_is_normalized = False def invalidate_caches(self): """Invalidate the directory mtime.""" @@ -1372,6 +1373,8 @@ def find_spec(self, fullname, target=None): self._fill_cache() self._path_mtime = mtime # tail_module keeps the original casing, for __file__ and friends + if not tail_module.isascii() and not self._cache_is_normalized: + self._normalize_cache() if _relax_case(): cache = self._relaxed_path_cache cache_module = tail_module.lower() @@ -1379,8 +1382,12 @@ def find_spec(self, fullname, target=None): cache = self._path_cache cache_module = tail_module # Check if the module is the name of a directory (and thus a package). - if cache_module in cache: - base_path = _path_join(self.path, tail_module) + try: + cache_path = cache[cache_module] + except KeyError: + pass + else: + base_path = _path_join(self.path, cache_path) for suffix, loader_class in self._loaders: init_filename = '__init__' + suffix full_path = _path_join(base_path, init_filename) @@ -1392,15 +1399,21 @@ def find_spec(self, fullname, target=None): is_namespace = _path_isdir(base_path) # Check for a file w/ a proper suffix exists. for suffix, loader_class in self._loaders: + # XXX: Why is ValueError caught here? 
+ #try: + # full_path = _path_join(self.path, tail_module + suffix) + #except ValueError: + # return None + _bootstrap._verbose_message('trying {}{} in {}', cache_module, suffix, self.path, verbosity=2) try: - full_path = _path_join(self.path, tail_module + suffix) - except ValueError: - return None - _bootstrap._verbose_message('trying {}', full_path, verbosity=2) - if cache_module + suffix in cache: + cache_path = cache[cache_module + suffix] + except KeyError: + pass + + else: + full_path = _path_join(self.path, cache_path) if _path_isfile(full_path): - return self._get_spec(loader_class, fullname, full_path, - None, target) + return self._get_spec(loader_class, fullname, full_path, None, target) if is_namespace: _bootstrap._verbose_message('possible namespace for {}', base_path) spec = _bootstrap.ModuleSpec(fullname, None) @@ -1420,24 +1433,35 @@ def _fill_cache(self): # We store two cached versions, to handle runtime changes of the # PYTHONCASEOK environment variable. if not sys.platform.startswith('win'): - self._path_cache = set(contents) + self._path_cache = { p: p for p in contents } else: # Windows users can import modules with case-insensitive file # suffixes (for legacy reasons). Make the suffix lowercase here # so it's done once instead of for every import. This is safe as # the specified suffixes to check against are always specified in a # case-sensitive manner. 
- lower_suffix_contents = set() + lower_suffix_contents = {} for item in contents: name, dot, suffix = item.partition('.') if dot: new_name = f'{name}.{suffix.lower()}' else: new_name = name - lower_suffix_contents.add(new_name) + lower_suffix_contents[new_name] = item self._path_cache = lower_suffix_contents if sys.platform.startswith(_CASE_INSENSITIVE_PLATFORMS): - self._relaxed_path_cache = {fn.lower() for fn in contents} + self._relaxed_path_cache = {fn.lower(): fn for fn in contents} + + self._cache_is_normalized = False + + def _normalize_cache(self): + """Normalize the keys of both caches to NFKC; values are kept as-is.""" + from unicodedata import normalize + + self._path_cache = { normalize('NFKC', k): v for k, v in self._path_cache.items() } + self._relaxed_path_cache = { normalize('NFKC', k): v for k, v in self._relaxed_path_cache.items() } + self._cache_is_normalized = True + @classmethod def path_hook(cls, *loader_details): diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py index 6e34094c5aa422..0ce9aaaa06d37e 100644 --- a/Lib/test/test_import/__init__.py +++ b/Lib/test/test_import/__init__.py @@ -23,6 +23,7 @@ import threading import time import types +import unicodedata import unittest from unittest import mock import _imp @@ -3372,6 +3373,47 @@ def test_magic_number_endianness(self): start = 2900 + sys.version_info.minor * 50 self.assertIn(magic_number, range(start, start + 50)) +class TestImportAccented(unittest.TestCase): + # XXX: There should be tests with PYTHONCASEOK as well + # (for those platforms where this is relevant) + dir_name = os.path.abspath(TESTFN) + + def setUp(self): + self.sys_path = sys.path[:] + os.mkdir(self.dir_name) + sys.path.insert(0, self.dir_name) + importlib.invalidate_caches() + + def tearDown(self): + sys.path[:] = self.sys_path + importlib.invalidate_caches() + rmtree(self.dir_name) + + def assert_importing_possible(self, name): + normalized = unicodedata.normalize('NFKC', name) + filename = os.path.join(self.dir_name, 
f"{name}.py") + with open(filename, "w") as stream: + stream.write("SPAM = 'spam'\n") + + values = {} + exec(f"from {name} import SPAM", values, values) + try: + self.assertEqual(values["SPAM"], "spam") + self.assertIn(normalized, sys.modules) + finally: + del sys.modules[normalized] + + def test_import_precomposed(self): + name = 'M\u00E4dchen' + self.assert_importing_possible(name) + + def test_import_normalized(self): + name = 'M\u0061\u0308dchen' + self.assert_importing_possible(name) + + def test_import_macos_input(self): + name = 'Mädchen' + self.assert_importing_possible(name) if __name__ == '__main__': # Test needs to be a package, so we can do relative imports. diff --git a/Lib/test/test_importlib/source/test_case_sensitivity.py b/Lib/test/test_importlib/source/test_case_sensitivity.py index e52829e628038a..35b8fb417534dc 100644 --- a/Lib/test/test_importlib/source/test_case_sensitivity.py +++ b/Lib/test/test_importlib/source/test_case_sensitivity.py @@ -59,7 +59,7 @@ def test_insensitive(self): self.assertIsNotNone(sensitive) self.assertIn(self.name, sensitive.get_filename(self.name)) self.assertIsNotNone(insensitive) - self.assertIn(self.name, insensitive.get_filename(self.name)) + self.assertIn(self.name.lower(), insensitive.get_filename(self.name)) class CaseSensitivityTestPEP451(CaseSensitivityTest): diff --git a/Misc/NEWS.d/next/Library/2023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst b/Misc/NEWS.d/next/Library/2023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst new file mode 100644 index 00000000000000..37e71fb10786be --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst @@ -0,0 +1,3 @@ +It is now possible to import modules from the filesystem regardless of how +the name is normalized in the filesystem. This in particular affects +importing modules with a name that contains accented latin characters.