From b7ba38900fda11d65603f6d85adc664e17fef99f Mon Sep 17 00:00:00 2001 From: Ronald Oussoren Date: Sun, 24 Dec 2023 10:26:10 +0100 Subject: [PATCH 01/10] gh-84013: normalize directory contents during import Identifiers in Python code are normalized to NFKC and there is no guarantee that names in the file system are normalized. Teach the FileFinder class about normalization, but only do this when the to-be-imported name contains non-ASCII characters. --- Lib/importlib/_bootstrap_external.py | 32 +++++++++++++++++++++-- Lib/test/test_import/__init__.py | 38 ++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index d537598ac16433..d95818c14c0fc4 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -1594,6 +1594,12 @@ def __init__(self, path, *loader_details): self._path_cache = set() self._relaxed_path_cache = set() + # Cache in NFKC normalization for imports of non-ASCII + # names. The regular cache is not kept in NFKC format + # to avoid circular dependencies during startup. + self._norm_path_cache = None + self._norm_relaxed_path_cache = None + def invalidate_caches(self): """Invalidate the directory mtime.""" self._path_mtime = -1 @@ -1619,11 +1625,23 @@ def find_spec(self, fullname, target=None): self._path_mtime = mtime # tail_module keeps the original casing, for __file__ and friends if _relax_case(): - cache = self._relaxed_path_cache cache_module = tail_module.lower() + + if cache_module.isascii(): + cache = self._relaxed_path_cache + else: + if self._norm_relaxed_path_cache is None: + self._fill_norm_cache() + cache = self._norm_relaxed_path_cache else: - cache = self._path_cache cache_module = tail_module + + if cache_module.isascii(): + cache = self._path_cache + else: + if self._norm_path_cache is None: + self._fill_norm_cache() + cache = self._norm_path_cache # Check if the module is the name of a directory (and thus a package). if cache_module in cache: base_path = _path_join(self.path, tail_module) @@ -1685,6 +1703,16 @@ def _fill_cache(self): if sys.platform.startswith(_CASE_INSENSITIVE_PLATFORMS): self._relaxed_path_cache = {fn.lower() for fn in contents} + self._norm_path_cache = None + self._norm_relaxed_path_cache = None + + def _fill_norm_cache(self): + from unicodedata import normalize + + self._norm_path_cache = { normalize('NFKC', p) for p in self._path_cache } + self._norm_relaxed_path_cache = { normalize('NFKC', p) for p in self._relaxed_path_cache } + + @classmethod def path_hook(cls, *loader_details): """A class method which returns a closure to use on sys.path_hook diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py index 48c0a43f29e27f..8ac84d2c1a44b5 100644 --- a/Lib/test/test_import/__init__.py +++ b/Lib/test/test_import/__init__.py @@ -19,6 +19,7 @@ import threading import time import types +import unicodedata import unittest from unittest import mock import _imp @@ -2701,6 +2702,43 @@ def test_pyimport_addmodule_create(self): mod = _testcapi.check_pyimport_addmodule(name) self.assertIs(mod, sys.modules[name]) +class TestImportAccented(unittest.TestCase): + # XXX: There should be tests with PYTHONCASEOK as well + # (for those platforms where this is relevant) + dir_name = os.path.abspath(TESTFN) + + def setUp(self): + self.sys_path = sys.path[:] + os.mkdir(self.dir_name) + sys.path.insert(0, self.dir_name) + importlib.invalidate_caches() + + def tearDown(self): + sys.path[:] = self.sys_path + importlib.invalidate_caches() + rmtree(self.dir_name) + + def assert_importing_possible(self, name): + filename = os.path.join(self.dir_name, f"{name}.py") + with open(filename, "w") as stream: + stream.write("SPAM = 'spam'\n") + + values = {} + exec(f"from {name} import SPAM", values, values) + self.assertEqual(values["SPAM"], "spam") + del sys.modules[unicodedata.normalize('NFKC', name)] + + def test_import_precomposed(self): + name = 'M\u00E4dchen' + self.assert_importing_possible(name) + + def test_import_normalized(self): + name = 'M\u0061\u0308dchen' + self.assert_importing_possible(name) + + def test_import_macos_input(self): + name = 'Mädchen' + self.assert_importing_possible(name) if __name__ == '__main__': # Test needs to be a package, so we can do relative imports. From 7f4b01babca189a757701438dcc3b6789d369871 Mon Sep 17 00:00:00 2001 From: Ronald Oussoren Date: Sun, 24 Dec 2023 10:38:44 +0100 Subject: [PATCH 02/10] Slightly cleaner code --- Lib/importlib/_bootstrap_external.py | 36 ++++++++++------------------ 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index d95818c14c0fc4..622b54ad947caf 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -1593,12 +1593,7 @@ def __init__(self, path, *loader_details): self._path_mtime = -1 self._path_cache = set() self._relaxed_path_cache = set() - - # Cache in NFKC normalization for imports of non-ASCII - # names. The regular cache is not kept in NFKC format - # to avoid circular dependencies during startup. - self._norm_path_cache = None - self._norm_relaxed_path_cache = None + self._cache_is_normalized = False def invalidate_caches(self): """Invalidate the directory mtime.""" @@ -1626,22 +1621,17 @@ def find_spec(self, fullname, target=None): # tail_module keeps the original casing, for __file__ and friends if _relax_case(): cache_module = tail_module.lower() + cache = self._relaxed_path_cache - if cache_module.isascii(): - cache = self._relaxed_path_cache - else: - if self._norm_relaxed_path_cache is None: - self._fill_norm_cache() - cache = self._norm_relaxed_path_cache + if not cache_module.isascii() and not self._cache_is_normalized: + self._normalize_cache() else: cache_module = tail_module + cache = self._path_cache + + if cache_module.isascii() and not self._cache_is_normalized: + self._normalize_cache() - if cache_module.isascii(): - cache = self._path_cache - else: - if self._norm_path_cache is None: - self._fill_norm_cache() - cache = self._norm_path_cache # Check if the module is the name of a directory (and thus a package). if cache_module in cache: base_path = _path_join(self.path, tail_module) @@ -1703,14 +1693,14 @@ def _fill_cache(self): if sys.platform.startswith(_CASE_INSENSITIVE_PLATFORMS): self._relaxed_path_cache = {fn.lower() for fn in contents} - self._norm_path_cache = None - self._norm_relaxed_path_cache = None + self._cache_is_normalized = False - def _fill_norm_cache(self): + def _normalize_cache(self): from unicodedata import normalize - self._norm_path_cache = { normalize('NFKC', p) for p in self._path_cache } - self._norm_relaxed_path_cache = { normalize('NFKC', p) for p in self._relaxed_path_cache } + self._path_cache = { normalize('NFKC', p) for p in self._path_cache } + self._relaxed_path_cache = { normalize('NFKC', p) for p in self._relaxed_path_cache } + self._cache_is_normalized = True @classmethod From 6ea9497fd710f210b402d0fb7acc1d95cfd8fa6b Mon Sep 17 00:00:00 2001 From: Ronald Oussoren Date: Mon, 25 Dec 2023 10:57:03 +0100 Subject: [PATCH 03/10] Argh... Previous cleanup was a bit too quick... --- Lib/importlib/_bootstrap_external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index 622b54ad947caf..cde4692e93738f 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -1629,7 +1629,7 @@ def find_spec(self, fullname, target=None): cache_module = tail_module cache = self._path_cache - if cache_module.isascii() and not self._cache_is_normalized: + if not cache_module.isascii() and not self._cache_is_normalized: self._normalize_cache() # Check if the module is the name of a directory (and thus a package). From c216e08405dc5d99b275275a8c4432db63b32dab Mon Sep 17 00:00:00 2001 From: Ronald Oussoren Date: Mon, 25 Dec 2023 11:17:36 +0100 Subject: [PATCH 04/10] Actually fix the "cleaned up" version --- Lib/importlib/_bootstrap_external.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index cde4692e93738f..cad89273dbcf8a 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -1619,19 +1619,14 @@ def find_spec(self, fullname, target=None): self._fill_cache() self._path_mtime = mtime # tail_module keeps the original casing, for __file__ and friends + if not tail_module.isascii() and not self._cache_is_normalized: + self._normalize_cache() if _relax_case(): cache_module = tail_module.lower() cache = self._relaxed_path_cache - - if not cache_module.isascii() and not self._cache_is_normalized: - self._normalize_cache() else: cache_module = tail_module cache = self._path_cache - - if not cache_module.isascii() and not self._cache_is_normalized: - self._normalize_cache() - # Check if the module is the name of a directory (and thus a package). if cache_module in cache: base_path = _path_join(self.path, tail_module) @@ -1696,6 +1691,9 @@ def _fill_cache(self): self._cache_is_normalized = False def _normalize_cache(self): + """ + Normalize all entries in the caches to NFKC. + """ from unicodedata import normalize self._path_cache = { normalize('NFKC', p) for p in self._path_cache } From 3f16ae1f9148c674a843a735d04b0d1278ed8b75 Mon Sep 17 00:00:00 2001 From: Ronald Oussoren Date: Fri, 29 Dec 2023 10:35:22 +0100 Subject: [PATCH 05/10] Change of implementation The finder cache is now a mapping from (normalized) names to actual filesystem names. With some luck this fixes test failures on Linux and Windows. --- Lib/importlib/_bootstrap_external.py | 44 ++++++++++++------- .../source/test_case_sensitivity.py | 2 +- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index cad89273dbcf8a..63bdef7c370659 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -1591,8 +1591,8 @@ def __init__(self, path, *loader_details): else: self.path = _path_abspath(path) self._path_mtime = -1 - self._path_cache = set() - self._relaxed_path_cache = set() + self._path_cache = dict() + self._relaxed_path_cache = dict() self._cache_is_normalized = False def invalidate_caches(self): @@ -1628,8 +1628,12 @@ def find_spec(self, fullname, target=None): cache_module = tail_module cache = self._path_cache # Check if the module is the name of a directory (and thus a package). - if cache_module in cache: - base_path = _path_join(self.path, tail_module) + try: + cache_path = cache[cache_module] + except KeyError: + pass + else: + base_path = _path_join(self.path, cache_path) for suffix, loader_class in self._loaders: init_filename = '__init__' + suffix full_path = _path_join(base_path, init_filename) @@ -1641,15 +1645,21 @@ def find_spec(self, fullname, target=None): is_namespace = _path_isdir(base_path) # Check for a file w/ a proper suffix exists. for suffix, loader_class in self._loaders: + # XXX: Why is ValueError caught here? + #try: + # full_path = _path_join(self.path, tail_module + suffix) + #except ValueError: + # return None + _bootstrap._verbose_message('trying {}{} in {}', cache_module, suffix, self.path, verbosity=2) try: - full_path = _path_join(self.path, tail_module + suffix) - except ValueError: - return None - _bootstrap._verbose_message('trying {}', full_path, verbosity=2) - if cache_module + suffix in cache: + cache_path = cache[cache_module + suffix] + except KeyError: + pass + + else: + full_path = _path_join(self.path, cache_path) if _path_isfile(full_path): - return self._get_spec(loader_class, fullname, full_path, - None, target) + return self._get_spec(loader_class, fullname, full_path, None, target) if is_namespace: _bootstrap._verbose_message('possible namespace for {}', base_path) spec = _bootstrap.ModuleSpec(fullname, None) @@ -1669,24 +1679,24 @@ def _fill_cache(self): # We store two cached versions, to handle runtime changes of the # PYTHONCASEOK environment variable. if not sys.platform.startswith('win'): - self._path_cache = set(contents) + self._path_cache = { p: p for p in contents } else: # Windows users can import modules with case-insensitive file # suffixes (for legacy reasons). Make the suffix lowercase here # so it's done once instead of for every import. This is safe as # the specified suffixes to check against are always specified in a # case-sensitive manner. - lower_suffix_contents = set() + lower_suffix_contents = dict() for item in contents: name, dot, suffix = item.partition('.') if dot: new_name = f'{name}.{suffix.lower()}' else: new_name = name - lower_suffix_contents.add(new_name) + lower_suffix_contents[new_name] = item self._path_cache = lower_suffix_contents if sys.platform.startswith(_CASE_INSENSITIVE_PLATFORMS): - self._relaxed_path_cache = {fn.lower() for fn in contents} + self._relaxed_path_cache = {fn.lower(): fn for fn in contents} self._cache_is_normalized = False @@ -1696,8 +1706,8 @@ def _normalize_cache(self): """ from unicodedata import normalize - self._path_cache = { normalize('NFKC', p) for p in self._path_cache } - self._relaxed_path_cache = { normalize('NFKC', p) for p in self._relaxed_path_cache } + self._path_cache = { normalize('NFKC', p): p for p in self._path_cache } + self._relaxed_path_cache = { normalize('NFKC', p): p for p in self._relaxed_path_cache } self._cache_is_normalized = True diff --git a/Lib/test/test_importlib/source/test_case_sensitivity.py b/Lib/test/test_importlib/source/test_case_sensitivity.py index e52829e628038a..35b8fb417534dc 100644 --- a/Lib/test/test_importlib/source/test_case_sensitivity.py +++ b/Lib/test/test_importlib/source/test_case_sensitivity.py @@ -59,7 +59,7 @@ def test_insensitive(self): self.assertIsNotNone(sensitive) self.assertIn(self.name, sensitive.get_filename(self.name)) self.assertIsNotNone(insensitive) - self.assertIn(self.name, insensitive.get_filename(self.name)) + self.assertIn(self.name.lower(), insensitive.get_filename(self.name)) class CaseSensitivityTestPEP451(CaseSensitivityTest): From 80b6eec9bcac6436bd2d065bc77b87bb50a7bffd Mon Sep 17 00:00:00 2001 From: Ronald Oussoren Date: Fri, 29 Dec 2023 12:57:14 +0100 Subject: [PATCH 06/10] Revert accidental reordering --- Lib/importlib/_bootstrap_external.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index 63bdef7c370659..57f7756a4123ff 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -1622,11 +1622,11 @@ def find_spec(self, fullname, target=None): if not tail_module.isascii() and not self._cache_is_normalized: self._normalize_cache() if _relax_case(): - cache_module = tail_module.lower() cache = self._relaxed_path_cache + cache_module = tail_module.lower() else: - cache_module = tail_module cache = self._path_cache + cache_module = tail_module # Check if the module is the name of a directory (and thus a package). try: cache_path = cache[cache_module] From b9e8927b7289b4ca5c017cc6f2a20abb618a5cf0 Mon Sep 17 00:00:00 2001 From: Ronald Oussoren Date: Fri, 29 Dec 2023 13:01:13 +0100 Subject: [PATCH 07/10] Update Lib/test/test_import/__init__.py Co-authored-by: Eric Snow --- Lib/test/test_import/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py index 8ac84d2c1a44b5..c04d6c4d27f983 100644 --- a/Lib/test/test_import/__init__.py +++ b/Lib/test/test_import/__init__.py @@ -2719,14 +2719,17 @@ def tearDown(self): rmtree(self.dir_name) def assert_importing_possible(self, name): + normalized = unicodedata.normalize('NFKC', name) filename = os.path.join(self.dir_name, f"{name}.py") with open(filename, "w") as stream: stream.write("SPAM = 'spam'\n") values = {} exec(f"from {name} import SPAM", values, values) - self.assertEqual(values["SPAM"], "spam") - del sys.modules[unicodedata.normalize('NFKC', name)] + try: + self.assertEqual(values["SPAM"], "spam") + finally: + del sys.modules[normalized] def test_import_precomposed(self): name = 'M\u00E4dchen' From 7d2d2bf597b84517fe46035b90efb2770d3bf11f Mon Sep 17 00:00:00 2001 From: Ronald Oussoren Date: Fri, 29 Dec 2023 13:05:00 +0100 Subject: [PATCH 08/10] Address review comment --- Lib/test/test_import/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py index c04d6c4d27f983..f0265e2297ab1b 100644 --- a/Lib/test/test_import/__init__.py +++ b/Lib/test/test_import/__init__.py @@ -2728,6 +2728,7 @@ def assert_importing_possible(self, name): exec(f"from {name} import SPAM", values, values) try: self.assertEqual(values["SPAM"], "spam") + self.assertIn(normalized, sys.modules) finally: del sys.modules[normalized] From 18d99ae0096a13f4bf754c6690407d6137b14e51 Mon Sep 17 00:00:00 2001 From: Ronald Oussoren Date: Fri, 29 Dec 2023 13:13:07 +0100 Subject: [PATCH 09/10] Some changes 1. Add a NEWS entry 2. __import__ and find_module will now normalize their arguments as needed 3. Update the importlib documentation to note that the name passed to finders must be normalized. --- Doc/library/importlib.rst | 6 ++++++ Lib/importlib/_bootstrap.py | 17 +++++++++++++++++ ...023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst | 3 +++ 3 files changed, 26 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst diff --git a/Doc/library/importlib.rst b/Doc/library/importlib.rst index 2402bc5cd3ee2c..d9cb3aef92764f 100644 --- a/Doc/library/importlib.rst +++ b/Doc/library/importlib.rst @@ -256,6 +256,9 @@ ABC hierarchy:: :func:`importlib.util.spec_from_loader` may be useful for implementing concrete ``MetaPathFinders``. + *Fullname* must be normalized in NFKC to match the normalization + done by the Python parser. + .. versionadded:: 3.4 .. method:: invalidate_caches() @@ -290,6 +293,9 @@ ABC hierarchy:: guess about what spec to return. :func:`importlib.util.spec_from_loader` may be useful for implementing concrete ``PathEntryFinders``. + *Fullname* must be normalized in NFKC to match the normalization + done by the Python parser. + .. versionadded:: 3.4 .. method:: invalidate_caches() diff --git a/Lib/importlib/_bootstrap.py b/Lib/importlib/_bootstrap.py index d942045f3de666..6f20471c5d0e4c 100644 --- a/Lib/importlib/_bootstrap.py +++ b/Lib/importlib/_bootstrap.py @@ -26,6 +26,13 @@ def _object_name(obj): except AttributeError: return type(obj).__qualname__ +def _normalize(name): + """ Normalize 'name' in NKFC form """ + global _unicodedata_normalize + if _unicodedata_normalize is None: + from unicodedata import _normalize + return _unicodedata_normalize('NFKC', name) + # Bootstrap-related code ###################################################### # Modules injected manually by _setup() @@ -36,6 +43,8 @@ def _object_name(obj): # Import done by _install_external_importers() _bootstrap_external = None +# Import done as needed by _normalize +_unicodedata_normalize = None def _wrap(new, old): """Simple substitute for functools.update_wrapper.""" @@ -1381,7 +1390,15 @@ def _gcd_import(name, package=None, level=0): the loader did not. """ + global _unicodedata _sanity_check(name, package, level) + + if not name.isascii(): + name = _normalize(name) + + if package is not None and not package.isascii(): + package = _normalize(package) + if level > 0: name = _resolve_name(name, package, level) return _find_and_load(name, _gcd_import) diff --git a/Misc/NEWS.d/next/Library/2023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst b/Misc/NEWS.d/next/Library/2023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst new file mode 100644 index 00000000000000..37e71fb10786be --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-12-29-13-09-44.gh-issue-84013.rUaxkr.rst @@ -0,0 +1,3 @@ +It is now possible to import modules from the filesystem regardless of how +the name is normalized in the filesystem. This in particular affects +importing modules with a name that contains accented latin characters. From 985e50a817d748bb7318c6f144c9523ffbbd4a0a Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Wed, 21 May 2025 14:32:16 +0300 Subject: [PATCH 10/10] Apply suggestions from code review Co-authored-by: Brett Cannon --- Lib/importlib/_bootstrap.py | 6 +++--- Lib/importlib/_bootstrap_external.py | 10 ++++------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/Lib/importlib/_bootstrap.py b/Lib/importlib/_bootstrap.py index 6f20471c5d0e4c..1012e8e36986d2 100644 --- a/Lib/importlib/_bootstrap.py +++ b/Lib/importlib/_bootstrap.py @@ -27,10 +27,10 @@ def _object_name(obj): return type(obj).__qualname__ def _normalize(name): - """ Normalize 'name' in NKFC form """ + """Normalize 'name' to NKFC form.""" global _unicodedata_normalize if _unicodedata_normalize is None: - from unicodedata import _normalize + from unicodedata import normalize as _unicodedata_normalize return _unicodedata_normalize('NFKC', name) # Bootstrap-related code ###################################################### @@ -43,7 +43,7 @@ def _normalize(name): # Import done by _install_external_importers() _bootstrap_external = None -# Import done as needed by _normalize +# Import done lazily as needed by _normalize as unicodedata is not built-in. _unicodedata_normalize = None def _wrap(new, old): diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py index 57f7756a4123ff..543865106f6fc3 100644 --- a/Lib/importlib/_bootstrap_external.py +++ b/Lib/importlib/_bootstrap_external.py @@ -1591,8 +1591,8 @@ def __init__(self, path, *loader_details): else: self.path = _path_abspath(path) self._path_mtime = -1 - self._path_cache = dict() - self._relaxed_path_cache = dict() + self._path_cache = {} + self._relaxed_path_cache = {} self._cache_is_normalized = False def invalidate_caches(self): @@ -1686,7 +1686,7 @@ def _fill_cache(self): # so it's done once instead of for every import. This is safe as # the specified suffixes to check against are always specified in a # case-sensitive manner. - lower_suffix_contents = dict() + lower_suffix_contents = {} for item in contents: name, dot, suffix = item.partition('.') if dot: @@ -1701,9 +1701,7 @@ def _fill_cache(self): self._cache_is_normalized = False def _normalize_cache(self): - """ - Normalize all entries in the caches to NFKC. - """ + """Normalize all entries in the caches to NFKC.""" from unicodedata import normalize self._path_cache = { normalize('NFKC', p): p for p in self._path_cache }