From b5d125bd6d15720c6994e8e2261fcf2aaae5ca9f Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sun, 5 Mar 2017 16:58:24 +1000 Subject: [PATCH 01/36] WIP: PEP 538 reference implementation - new PYTHONCOERCECLOCALE config setting - coerces legacy C locale to C.UTF-8, C.utf8 or UTF-8 by default TODO: - configure option to disable locale coercion at build time - configure option to disable C locale warning at build time - skip runtime locale warning on Mac OS X --- Doc/using/cmdline.rst | 29 ++++++ Lib/test/support/script_helper.py | 56 ++++++----- Lib/test/test_locale_coercion.py | 158 ++++++++++++++++++++++++++++++ Programs/python.c | 113 +++++++++++++++++++++ Python/pylifecycle.c | 16 +++ 5 files changed, 346 insertions(+), 26 deletions(-) create mode 100644 Lib/test/test_locale_coercion.py diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 195f63f0a319da..1faf8fb27069df 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -713,6 +713,35 @@ conflict. .. versionadded:: 3.6 + +.. envvar:: PYTHONCOERCECLOCALE + + If set to the value ``0``, causes the main Python command line application + to skip coercing the legacy ASCII-based C locale to a more capable UTF-8 + based alternative. Note that this setting is checked even when the + :option:`-E` or :option:`-I` options are used, as it is handled prior to + the processing of command line options. 
+ + If this variable is *not* set, and the current locale reported for the + ``LC_CTYPE`` category is the default ``C`` locale, then the Python CLI will + attempt to configure one of the following locales for the given locale + categories before loading the interpreter runtime: + + * ``C.UTF-8`` (``LC_ALL``) + * ``C.utf8`` (``LC_ALL``) + * ``UTF-8`` (``LC_CTYPE``) + + If setting one of these locale categories succeeds, then the matching + environment variables will be set (both ``LC_ALL`` and ``LANG`` for the + ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category), + and (if not already set to a non-empty string) :envvar:`PYTHONIOENCODING` + will be set to ``utf-8:surrogateescape``. + + Availability: \*nix + + .. versionadded:: 3.7 + See :pep:`538` for more details. + Debug-mode variables ~~~~~~~~~~~~~~~~~~~~ diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py index 1e746472ee6348..10eeccdbd086a3 100644 --- a/Lib/test/support/script_helper.py +++ b/Lib/test/support/script_helper.py @@ -48,8 +48,35 @@ def interpreter_requires_environment(): return __cached_interp_requires_environment -_PythonRunResult = collections.namedtuple("_PythonRunResult", - ("rc", "out", "err")) +class _PythonRunResult(collections.namedtuple("_PythonRunResult", + ("rc", "out", "err"))): + """Helper for reporting Python subprocess run results""" + def fail(self, cmd_line): + """Provide helpful details about failed subcommand runs""" + # Limit to 80 lines to ASCII characters + maxlen = 80 * 100 + out, err = res.out, res.err + if len(out) > maxlen: + out = b'(... truncated stdout ...)' + out[-maxlen:] + if len(err) > maxlen: + err = b'(... 
truncated stderr ...)' + err[-maxlen:] + out = out.decode('ascii', 'replace').rstrip() + err = err.decode('ascii', 'replace').rstrip() + raise AssertionError("Process return code is %d\n" + "command line: %r\n" + "\n" + "stdout:\n" + "---\n" + "%s\n" + "---\n" + "\n" + "stderr:\n" + "---\n" + "%s\n" + "---" + % (res.rc, cmd_line, + out, + err)) # Executing the interpreter in a subprocess @@ -107,30 +134,7 @@ def run_python_until_end(*args, **env_vars): def _assert_python(expected_success, *args, **env_vars): res, cmd_line = run_python_until_end(*args, **env_vars) if (res.rc and expected_success) or (not res.rc and not expected_success): - # Limit to 80 lines to ASCII characters - maxlen = 80 * 100 - out, err = res.out, res.err - if len(out) > maxlen: - out = b'(... truncated stdout ...)' + out[-maxlen:] - if len(err) > maxlen: - err = b'(... truncated stderr ...)' + err[-maxlen:] - out = out.decode('ascii', 'replace').rstrip() - err = err.decode('ascii', 'replace').rstrip() - raise AssertionError("Process return code is %d\n" - "command line: %r\n" - "\n" - "stdout:\n" - "---\n" - "%s\n" - "---\n" - "\n" - "stderr:\n" - "---\n" - "%s\n" - "---" - % (res.rc, cmd_line, - out, - err)) + res.fail(cmd_line) return res def assert_python_ok(*args, **env_vars): diff --git a/Lib/test/test_locale_coercion.py b/Lib/test/test_locale_coercion.py new file mode 100644 index 00000000000000..eac795cff6c980 --- /dev/null +++ b/Lib/test/test_locale_coercion.py @@ -0,0 +1,158 @@ +# Tests the attempted automatic coercion of the C locale to a UTF-8 locale + +import unittest +import sys +import shutil +import subprocess +import test.support +from test.support.script_helper import ( + run_python_until_end, + interpreter_requires_environment, +) + +# In order to get the warning messages to match up as expected, the candidate +# order here must much the target locale order in Programs/python.c +_C_UTF8_LOCALES = ( + # Entries: (Target locale, target category, expected env var updates) + 
("C.UTF-8", "LC_ALL", "LC_ALL & LANG"), + ("C.utf8", "LC_ALL", "LC_ALL & LANG"), + ("UTF-8", "LC_CTYPE", "LC_CTYPE"), +) + +# There's no reliable cross-platform way of checking locale alias +# lists, so the only way of knowing which of these locales will work +# is to try them with locale.setlocale(). We do that in a subprocess +# to avoid altering the locale of the test runner. +def _set_locale_in_subprocess(locale_name, category): + cmd_fmt = "import locale; print(locale.setlocale(locale.{}, '{}'))" + cmd = cmd_fmt.format(category, locale_name) + result, py_cmd = run_python_until_end("-c", cmd, __isolated=True) + return result.rc == 0 + +# Details of the warnings emitted at runtime +CLI_COERCION_WARNING_FMT = ( + "Python detected LC_CTYPE=C, {} set to {} (set another locale or " + "PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour)." +) + +LIBRARY_C_LOCALE_WARNING = ( + "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " + "encoding), which may cause Unicode compatibility problems. Using C.UTF-8 " + "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " + "locales is recommended." +) + +@test.support.cpython_only +class LocaleOverrideTest(unittest.TestCase): + + @classmethod + def setUpClass(cls): + for target_locale, target_category, env_updates in _C_UTF8_LOCALES: + if _set_locale_in_subprocess(target_locale, target_category): + break + else: + raise unittest.SkipTest("No C-with-UTF-8 locale available") + cls.EXPECTED_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format( + env_updates, target_locale + ) + + def _get_child_fsencoding(self, env_vars): + """Retrieves sys.getfilesystemencoding() from a child process + + Returns (fsencoding, stderr_lines): + + - fsencoding: a lowercase str value with the child's fsencoding + - stderr_lines: result of calling splitlines() on the stderr output + + The child is run in isolated mode if the current interpreter supports + that. 
+ """ + cmd = "import sys; print(sys.getfilesystemencoding().lower())" + result, py_cmd = run_python_until_end( + "-c", cmd, + __isolated=True, + **env_vars + ) + if not result.rc == 0: + result.fail(py_cmd) + # All subprocess outputs in this test case should be pure ASCII + child_fsencoding = result.out.decode("ascii").rstrip() + child_stderr_lines = result.err.decode("ascii").rstrip().splitlines() + return child_fsencoding, child_stderr_lines + + + def test_C_utf8_locale(self): + # Ensure the C.UTF-8 locale is accepted entirely without complaint + base_var_dict = { + "LANG": "", + "LC_CTYPE": "", + "LC_ALL": "", + } + for env_var in ("LC_ALL", "LC_CTYPE", "LANG"): + with self.subTest(env_var=env_var): + var_dict = base_var_dict.copy() + var_dict[env_var] = "C.UTF-8" + fsencoding, stderr_lines = self._get_child_fsencoding(var_dict) + self.assertEqual(fsencoding, "utf-8") + self.assertFalse(stderr_lines) + + + def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): + """Check the handling of the C locale for various configurations + + Parameters: + expected_fsencoding: the encoding the child is expected to report + allow_c_locale: setting to use for PYTHONALLOWCLOCALE + None: don't set the variable at all + str: the value set in the child's environment + """ + if coerce_c_locale == "0": + # Check the library emits a warning + expected_warning = [ + LIBRARY_C_LOCALE_WARNING, + ] + else: + # Check C locale is coerced with a warning on stderr + expected_warning = [ + self.EXPECTED_COERCION_WARNING, + ] + base_var_dict = { + "LANG": "", + "LC_CTYPE": "", + "LC_ALL": "", + } + for env_var in ("LC_ALL", "LC_CTYPE", "LANG"): + for locale_to_set in ("", "C", "POSIX", "invalid.ascii"): + with self.subTest(env_var=env_var, + nominal_locale=locale_to_set, + PYTHONCOERCECLOCALE=coerce_c_locale): + var_dict = base_var_dict.copy() + var_dict[env_var] = locale_to_set + if coerce_c_locale is not None: + var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale + 
fsencoding, stderr_lines = self._get_child_fsencoding(var_dict) + self.assertEqual(fsencoding, expected_fsencoding) + self.assertEqual(stderr_lines, expected_warning) + + + def test_test_PYTHONCOERCECLOCALE_not_set(self): + # This should coerce to the C.UTF-8 locale by default + self._check_c_locale_coercion("utf-8", coerce_c_locale=None) + + def test_PYTHONCOERCECLOCALE_not_zero(self): + # *Any* string other that "0" is considered "set" for our purposes + # and hence should result in the locale coercion being enabled + for setting in ("", "1", "true", "false"): + self._check_c_locale_coercion("utf-8", coerce_c_locale=setting) + + def test_PYTHONCOERCECLOCALE_set_to_zero(self): + # The setting "0" should result in the locale coercion being disabled + self._check_c_locale_coercion("ascii", coerce_c_locale="0") + + +def test_main(): + test.support.run_unittest(LocaleOverrideTest) + test.support.reap_children() + +if __name__ == "__main__": + test_main() diff --git a/Programs/python.c b/Programs/python.c index a7afbc774b3a55..7a10fc5ed1d6f0 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -15,6 +15,108 @@ wmain(int argc, wchar_t **argv) } #else +/* Helpers to better handle the legacy C locale + * + * The legacy C locale assumes ASCII as the default text encoding, which + * causes problems not only for the CPython runtime, but also other + * components like GNU readline. + * + * Accordingly, when the CLI detects it, it attempts to coerce it to a + * more capable UTF-8 based alternative. + * + * See the documentation of the PYTHONCOERCECLOCALE setting for more details. 
+ * + */ + +static const char *_C_LOCALE_COERCION_WARNING = + "Python detected LC_CTYPE=C, %.20s set to %.20s (set another locale or " + "PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour).\n"; + +typedef struct _CandidateLocale { + const char *locale_name; + int category; +} _LocaleCoercionTarget; + +static _LocaleCoercionTarget _TARGET_LOCALES[] = { + { "C.UTF-8", LC_ALL }, + { "C.utf8", LC_ALL }, + { "UTF-8", LC_CTYPE }, + { NULL, 0 } +}; + +void +_coerce_default_locale_settings(const _LocaleCoercionTarget *target) +{ + const char *newloc = target->locale_name; + int category = target->category; + + /* Reset locale back to currently configured defaults */ + setlocale(LC_ALL, ""); + + /* Set the relevant locale environment variables */ + if (category == LC_ALL) { + const char *env_vars_updated = "LC_ALL & LANG"; + if (setenv("LC_ALL", newloc, 1)) { + fprintf(stderr, + "Error setting LC_ALL, skipping C locale coercion\n"); + return; + } + if (setenv("LANG", newloc, 1)) { + fprintf(stderr, + "Error setting LANG during C locale coercion\n"); + env_vars_updated = "LC_ALL"; + } + fprintf(stderr, _C_LOCALE_COERCION_WARNING, env_vars_updated, newloc); + } else if (category == LC_CTYPE) { + if (setenv("LC_CTYPE", newloc, 1)) { + fprintf(stderr, + "Error setting LC_CTYPE, skipping C locale coercion\n"); + return; + } + fprintf(stderr, _C_LOCALE_COERCION_WARNING, "LC_CTYPE", newloc); + } else { + fprintf(stderr, "Locale coercion must target LC_ALL or LC_CTYPE\n"); + return; + } + + /* Set PYTHONIOENCODING if not already set */ + if (setenv("PYTHONIOENCODING", "utf-8:surrogateescape", 0)) { + fprintf(stderr, + "Error setting PYTHONIOENCODING during C locale coercion\n"); + } + + /* Reconfigure with the overridden environment variables */ + setlocale(LC_ALL, ""); +} + +void +_handle_legacy_c_locale(void) +{ + const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); + /* We ignore the Python -E and -I flags here, as we need to sort out + * the locale 
settings *before* we try to do anything with the command + * line arguments. For cross-platform debugging purposes, we also need + * to give end users a way to force even scripts that are otherwise + * isolated from their environment to use the legacy ASCII-centric C + * locale. + */ + if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { + /* PYTHONCOERCECLOCALE is not set, or is not set to exactly "0" */ + const _LocaleCoercionTarget *target = NULL; + for (target = _TARGET_LOCALES; target->locale_name; target++) { + const char *reconfigured_locale = setlocale(target->category, + target->locale_name); + if (reconfigured_locale != NULL) { + /* Successfully configured locale, so make it the default */ + _coerce_default_locale_settings(target); + return; + } + } + + } + /* No C locale warning here, as Py_Initialize will emit one later */ +} + int main(int argc, char **argv) { @@ -23,6 +125,7 @@ main(int argc, char **argv) wchar_t **argv_copy2; int i, res; char *oldloc; + const char *ctype_loc; /* Force malloc() allocator to bootstrap Python */ (void)_PyMem_SetupAllocators("malloc"); @@ -49,7 +152,17 @@ main(int argc, char **argv) return 1; } + /* Reconfigure the locale to the default for this process */ setlocale(LC_ALL, ""); + + /* When the LC_CTYPE category still claims to be using the C locale, + assume configuration error and try for a UTF-8 based locale instead */ + ctype_loc = setlocale(LC_CTYPE, NULL); + if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) { + _handle_legacy_c_locale(); + } + + /* Convert from char to wchar_t based on the locale settings */ for (i = 0; i < argc; i++) { argv_copy[i] = Py_DecodeLocale(argv[i], NULL); if (!argv_copy[i]) { diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 06030c330a082c..e6fb06f0f08d84 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -302,6 +302,21 @@ import_init(PyInterpreterState *interp, PyObject *sysmod) } +static const char *_C_LOCALE_WARNING = + "Python runtime 
initialized with LC_CTYPE=C (a locale with default ASCII " + "encoding), which may cause Unicode compatibility problems. Using C.UTF-8 " + "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " + "locales is recommended.\n"; + +static void +_emit_stderr_warning_for_c_locale(void) +{ + const char *ctype_loc = setlocale(LC_CTYPE, NULL); + if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) { + fprintf(stderr, _C_LOCALE_WARNING); + } +} + void _Py_InitializeEx_Private(int install_sigs, int install_importlib) { @@ -321,6 +336,7 @@ _Py_InitializeEx_Private(int install_sigs, int install_importlib) the locale's charset without having to switch locales. */ setlocale(LC_CTYPE, ""); + _emit_stderr_warning_for_c_locale(); #endif if ((p = Py_GETENV("PYTHONDEBUG")) && *p != '\0') From 78c17a7cea04aed7cd1fce8ae5afb085a544a89c Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 11 Mar 2017 13:38:39 +1000 Subject: [PATCH 02/36] Fix test case failures --- Lib/test/support/script_helper.py | 11 +++++++++-- Lib/test/test_cmd_line.py | 12 ++++++++---- Lib/test/test_locale_coercion.py | 12 +++--------- Lib/test/test_sys.py | 10 ++++++++-- Python/pylifecycle.c | 2 +- 5 files changed, 29 insertions(+), 18 deletions(-) diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py index 10eeccdbd086a3..86c96d075e54e1 100644 --- a/Lib/test/support/script_helper.py +++ b/Lib/test/support/script_helper.py @@ -14,6 +14,13 @@ from test.support import make_legacy_pyc, strip_python_stderr +RUNTIME_C_LOCALE_WARNING = ( + "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " + "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " + "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " + "locales is recommended." +) + # Cached result of the expensive test performed in the function below. 
__cached_interp_requires_environment = None @@ -55,7 +62,7 @@ def fail(self, cmd_line): """Provide helpful details about failed subcommand runs""" # Limit to 80 lines to ASCII characters maxlen = 80 * 100 - out, err = res.out, res.err + out, err = self.out, self.err if len(out) > maxlen: out = b'(... truncated stdout ...)' + out[-maxlen:] if len(err) > maxlen: @@ -74,7 +81,7 @@ def fail(self, cmd_line): "---\n" "%s\n" "---" - % (res.rc, cmd_line, + % (self.rc, cmd_line, out, err)) diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index 958d282a428898..0188338281b077 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -8,8 +8,10 @@ import subprocess import tempfile from test.support import script_helper, is_android -from test.support.script_helper import (spawn_python, kill_python, assert_python_ok, - assert_python_failure) +from test.support.script_helper import ( + spawn_python, kill_python, assert_python_ok, assert_python_failure, + RUNTIME_C_LOCALE_WARNING +) # XXX (ncoghlan): Move to script_helper and make consistent with run_python @@ -150,6 +152,7 @@ def test_undecodable_code(self): env = os.environ.copy() # Use C locale to get ascii for the locale encoding env['LC_ALL'] = 'C' + env['PYTHONCOERCECLOCALE'] = '0' code = ( b'import locale; ' b'print(ascii("' + undecodable + b'"), ' @@ -159,17 +162,18 @@ def test_undecodable_code(self): stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) stdout, stderr = p.communicate() + pattern = RUNTIME_C_LOCALE_WARNING.encode() + b'\n' if p.returncode == 1: # _Py_char2wchar() decoded b'\xff' as '\udcff' (b'\xff' is not # decodable from ASCII) and run_command() failed on # PyUnicode_AsUTF8String(). This is the expected behaviour on # Linux. 
- pattern = b"Unable to decode the command from the command line:" + pattern += b"Unable to decode the command from the command line:" elif p.returncode == 0: # _Py_char2wchar() decoded b'\xff' as '\xff' even if the locale is # C and the locale encoding is ASCII. It occurs on FreeBSD, Solaris # and Mac OS X. - pattern = b"'\\xff' " + pattern += b"'\\xff' " # The output is followed by the encoding name, an alias to ASCII. # Examples: "US-ASCII" or "646" (ISO 646, on Solaris). else: diff --git a/Lib/test/test_locale_coercion.py b/Lib/test/test_locale_coercion.py index eac795cff6c980..bcc9b9a1fa83e9 100644 --- a/Lib/test/test_locale_coercion.py +++ b/Lib/test/test_locale_coercion.py @@ -8,6 +8,7 @@ from test.support.script_helper import ( run_python_until_end, interpreter_requires_environment, + RUNTIME_C_LOCALE_WARNING ) # In order to get the warning messages to match up as expected, the candidate @@ -29,19 +30,12 @@ def _set_locale_in_subprocess(locale_name, category): result, py_cmd = run_python_until_end("-c", cmd, __isolated=True) return result.rc == 0 -# Details of the warnings emitted at runtime +# Details of the CLI warning emitted at runtime CLI_COERCION_WARNING_FMT = ( "Python detected LC_CTYPE=C, {} set to {} (set another locale or " "PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour)." ) -LIBRARY_C_LOCALE_WARNING = ( - "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " - "encoding), which may cause Unicode compatibility problems. Using C.UTF-8 " - "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " - "locales is recommended." 
-) - @test.support.cpython_only class LocaleOverrideTest(unittest.TestCase): @@ -109,7 +103,7 @@ def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): if coerce_c_locale == "0": # Check the library emits a warning expected_warning = [ - LIBRARY_C_LOCALE_WARNING, + RUNTIME_C_LOCALE_WARNING, ] else: # Check C locale is coerced with a warning on stderr diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index e6d8e5082ffc04..f42de537cb6ba2 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -1,5 +1,8 @@ import unittest, test.support -from test.support.script_helper import assert_python_ok, assert_python_failure +from test.support.script_helper import ( + assert_python_ok, assert_python_failure, + RUNTIME_C_LOCALE_WARNING +) import sys, io, os import struct import subprocess @@ -676,10 +679,13 @@ def test_getfilesystemencoding(self): expected = None self.check_fsencoding(fs_encoding, expected) + _SKIP_RUNTIME_C_LOCALE_WARNING = len(RUNTIME_C_LOCALE_WARNING + "\n") + def c_locale_get_error_handler(self, isolated=False, encoding=None): # Force the POSIX locale env = os.environ.copy() env["LC_ALL"] = "C" + env["PYTHONCOERCECLOCALE"] = "0" code = '\n'.join(( 'import sys', 'def dump(name):', @@ -702,7 +708,7 @@ def c_locale_get_error_handler(self, isolated=False, encoding=None): env=env, universal_newlines=True) stdout, stderr = p.communicate() - return stdout + return stdout[self._SKIP_RUNTIME_C_LOCALE_WARNING:] def test_c_locale_surrogateescape(self): out = self.c_locale_get_error_handler(isolated=True) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index e6fb06f0f08d84..4dd9ec50e633e2 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -304,7 +304,7 @@ import_init(PyInterpreterState *interp, PyObject *sysmod) static const char *_C_LOCALE_WARNING = "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " - "encoding), which may cause Unicode compatibility problems. 
Using C.UTF-8 " + "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " "locales is recommended.\n"; From d283de1992e0b2b1498d63fc68ffb2ef40463de2 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Mon, 13 Mar 2017 14:00:05 +1000 Subject: [PATCH 03/36] Clarify locale coercion warnings --- Lib/test/test_locale_coercion.py | 4 ++-- Programs/python.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_locale_coercion.py b/Lib/test/test_locale_coercion.py index bcc9b9a1fa83e9..61ef41dfeb1ccc 100644 --- a/Lib/test/test_locale_coercion.py +++ b/Lib/test/test_locale_coercion.py @@ -32,8 +32,8 @@ def _set_locale_in_subprocess(locale_name, category): # Details of the CLI warning emitted at runtime CLI_COERCION_WARNING_FMT = ( - "Python detected LC_CTYPE=C, {} set to {} (set another locale or " - "PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour)." + "Python detected LC_CTYPE=C: {} coerced to {} (set another locale " + "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour)." 
) @test.support.cpython_only diff --git a/Programs/python.c b/Programs/python.c index 7a10fc5ed1d6f0..7d039c760d83fe 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -29,8 +29,8 @@ wmain(int argc, wchar_t **argv) */ static const char *_C_LOCALE_COERCION_WARNING = - "Python detected LC_CTYPE=C, %.20s set to %.20s (set another locale or " - "PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour).\n"; + "Python detected LC_CTYPE=C: %.20s coerced to %.20s (set another locale " + "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour).\n"; typedef struct _CandidateLocale { const char *locale_name; From f7a03fe842bd749375568fedd78bc1c3eb7aff67 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Mon, 13 Mar 2017 14:00:26 +1000 Subject: [PATCH 04/36] Avoid -Wformat-security warning --- Python/pylifecycle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index c51b9349f4791c..55a292022051e6 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -313,7 +313,7 @@ _emit_stderr_warning_for_c_locale(void) { const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) { - fprintf(stderr, _C_LOCALE_WARNING); + fprintf(stderr, "%s", _C_LOCALE_WARNING); } } From fe92a294145568a9f9ae17e90c99400ac16f5297 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Mon, 13 Mar 2017 14:35:22 +1000 Subject: [PATCH 05/36] Support running tests under 'LANG=C' --- Lib/test/test_capi.py | 9 +++++---- Programs/_testembed.c | 3 +++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py index 2a53f3d081ff70..ece84af5daa30a 100644 --- a/Lib/test/test_capi.py +++ b/Lib/test/test_capi.py @@ -386,7 +386,7 @@ def run_embedded_interpreter(self, *args): def test_subinterps(self): # This is just a "don't crash" test out, err = self.run_embedded_interpreter("repeated_init_and_subinterpreters") - if support.verbose: + if 
support.verbose > 1: print() print(out) print(err) @@ -404,14 +404,15 @@ def _get_default_pipe_encoding(): def test_forced_io_encoding(self): # Checks forced configuration of embedded interpreter IO streams out, err = self.run_embedded_interpreter("forced_io_encoding") - if support.verbose: + if support.verbose > 1: print() print(out) print(err) - expected_errors = sys.__stdout__.errors - expected_stdin_encoding = sys.__stdin__.encoding + expected_errors = "surrogateescape" + expected_stdin_encoding = "UTF-8" expected_pipe_encoding = self._get_default_pipe_encoding() expected_output = '\n'.join([ + "Setting PYTHONIOENCODING=UTF-8:surrogateescape", "--- Use defaults ---", "Expected encoding: default", "Expected errors: default", diff --git a/Programs/_testembed.c b/Programs/_testembed.c index a68d4fa25f7cd0..3f1c96c0763837 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -106,6 +106,9 @@ static void check_stdio_details(const char *encoding, const char * errors) static int test_forced_io_encoding(void) { + /* Ensure consistent "defaults" */ + printf("Setting PYTHONIOENCODING=UTF-8:surrogateescape\n"); + setenv("PYTHONIOENCODING", "UTF-8:surrogateescape", 1); /* Check various combinations */ printf("--- Use defaults ---\n"); check_stdio_details(NULL, NULL); From 64d9d2f250c3c7c7e28d2bb4577262923b6a0882 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Mon, 13 Mar 2017 14:35:49 +1000 Subject: [PATCH 06/36] Suppress locale warning for PYTHONCOERCECLOCALE=0 --- Lib/test/support/script_helper.py | 7 ------- Lib/test/test_cmd_line.py | 8 +++----- Lib/test/test_locale_coercion.py | 21 +++++++++++---------- Lib/test/test_sys.py | 9 ++------- Python/pylifecycle.c | 14 +++++++++++--- 5 files changed, 27 insertions(+), 32 deletions(-) diff --git a/Lib/test/support/script_helper.py b/Lib/test/support/script_helper.py index 86c96d075e54e1..b3ac848f08252b 100644 --- a/Lib/test/support/script_helper.py +++ b/Lib/test/support/script_helper.py @@ -14,13 +14,6 @@ from 
test.support import make_legacy_pyc, strip_python_stderr -RUNTIME_C_LOCALE_WARNING = ( - "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " - "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " - "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " - "locales is recommended." -) - # Cached result of the expensive test performed in the function below. __cached_interp_requires_environment = None diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index 0188338281b077..c4c6850a620c0d 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -9,8 +9,7 @@ import tempfile from test.support import script_helper, is_android from test.support.script_helper import ( - spawn_python, kill_python, assert_python_ok, assert_python_failure, - RUNTIME_C_LOCALE_WARNING + spawn_python, kill_python, assert_python_ok, assert_python_failure ) @@ -162,18 +161,17 @@ def test_undecodable_code(self): stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env) stdout, stderr = p.communicate() - pattern = RUNTIME_C_LOCALE_WARNING.encode() + b'\n' if p.returncode == 1: # _Py_char2wchar() decoded b'\xff' as '\udcff' (b'\xff' is not # decodable from ASCII) and run_command() failed on # PyUnicode_AsUTF8String(). This is the expected behaviour on # Linux. - pattern += b"Unable to decode the command from the command line:" + pattern = b"Unable to decode the command from the command line:" elif p.returncode == 0: # _Py_char2wchar() decoded b'\xff' as '\xff' even if the locale is # C and the locale encoding is ASCII. It occurs on FreeBSD, Solaris # and Mac OS X. - pattern += b"'\\xff' " + pattern = b"'\\xff' " # The output is followed by the encoding name, an alias to ASCII. # Examples: "US-ASCII" or "646" (ISO 646, on Solaris). 
else: diff --git a/Lib/test/test_locale_coercion.py b/Lib/test/test_locale_coercion.py index 61ef41dfeb1ccc..0756298cecd079 100644 --- a/Lib/test/test_locale_coercion.py +++ b/Lib/test/test_locale_coercion.py @@ -8,7 +8,6 @@ from test.support.script_helper import ( run_python_until_end, interpreter_requires_environment, - RUNTIME_C_LOCALE_WARNING ) # In order to get the warning messages to match up as expected, the candidate @@ -30,6 +29,14 @@ def _set_locale_in_subprocess(locale_name, category): result, py_cmd = run_python_until_end("-c", cmd, __isolated=True) return result.rc == 0 +# Details of the shared library warning emitted at runtime +LIBRARY_C_LOCALE_WARNING = ( + "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " + "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " + "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " + "locales is recommended." +) + # Details of the CLI warning emitted at runtime CLI_COERCION_WARNING_FMT = ( "Python detected LC_CTYPE=C: {} coerced to {} (set another locale " @@ -100,16 +107,10 @@ def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): None: don't set the variable at all str: the value set in the child's environment """ - if coerce_c_locale == "0": - # Check the library emits a warning - expected_warning = [ - RUNTIME_C_LOCALE_WARNING, - ] - else: + expected_warning = [] + if coerce_c_locale != "0": # Check C locale is coerced with a warning on stderr - expected_warning = [ - self.EXPECTED_COERCION_WARNING, - ] + expected_warning.append(self.EXPECTED_COERCION_WARNING) base_var_dict = { "LANG": "", "LC_CTYPE": "", diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index f42de537cb6ba2..952182b226e5b4 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -1,8 +1,5 @@ import unittest, test.support -from test.support.script_helper import ( - assert_python_ok, assert_python_failure, - RUNTIME_C_LOCALE_WARNING -) +from 
test.support.script_helper import assert_python_ok, assert_python_failure import sys, io, os import struct import subprocess @@ -679,8 +676,6 @@ def test_getfilesystemencoding(self): expected = None self.check_fsencoding(fs_encoding, expected) - _SKIP_RUNTIME_C_LOCALE_WARNING = len(RUNTIME_C_LOCALE_WARNING + "\n") - def c_locale_get_error_handler(self, isolated=False, encoding=None): # Force the POSIX locale env = os.environ.copy() @@ -708,7 +703,7 @@ def c_locale_get_error_handler(self, isolated=False, encoding=None): env=env, universal_newlines=True) stdout, stderr = p.communicate() - return stdout[self._SKIP_RUNTIME_C_LOCALE_WARNING:] + return stdout def test_c_locale_surrogateescape(self): out = self.c_locale_get_error_handler(isolated=True) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 55a292022051e6..bb3396cc68c001 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -311,9 +311,17 @@ static const char *_C_LOCALE_WARNING = static void _emit_stderr_warning_for_c_locale(void) { - const char *ctype_loc = setlocale(LC_CTYPE, NULL); - if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) { - fprintf(stderr, "%s", _C_LOCALE_WARNING); + const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); + /* We don't emit a warning if locale coercion has been explicitly disabled. + * + * For consistency with the corresponding check in Programs/python.c + * we ignore the Python -E and -I flags here. 
+ */ + if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { + const char *ctype_loc = setlocale(LC_CTYPE, NULL); + if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) { + fprintf(stderr, "%s", _C_LOCALE_WARNING); + } } } From 384a1466a283c144665c91d86e088331765797d1 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Mon, 13 Mar 2017 15:05:29 +1000 Subject: [PATCH 07/36] Add test case for library runtime warning --- Lib/test/test_locale_coercion.py | 63 +++++++++++++++++++++++++++----- Programs/_testembed.c | 11 ++++++ 2 files changed, 64 insertions(+), 10 deletions(-) diff --git a/Lib/test/test_locale_coercion.py b/Lib/test/test_locale_coercion.py index 0756298cecd079..2752b494ec65bc 100644 --- a/Lib/test/test_locale_coercion.py +++ b/Lib/test/test_locale_coercion.py @@ -1,6 +1,7 @@ # Tests the attempted automatic coercion of the C locale to a UTF-8 locale import unittest +import os import sys import shutil import subprocess @@ -29,22 +30,15 @@ def _set_locale_in_subprocess(locale_name, category): result, py_cmd = run_python_until_end("-c", cmd, __isolated=True) return result.rc == 0 -# Details of the shared library warning emitted at runtime -LIBRARY_C_LOCALE_WARNING = ( - "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " - "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " - "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " - "locales is recommended." -) - # Details of the CLI warning emitted at runtime CLI_COERCION_WARNING_FMT = ( "Python detected LC_CTYPE=C: {} coerced to {} (set another locale " "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour)." 
) +# TODO: Make this conditional on the PY_COERCE_C_LOCALE sysconfig var @test.support.cpython_only -class LocaleOverrideTest(unittest.TestCase): +class LocaleOverrideTests(unittest.TestCase): @classmethod def setUpClass(cls): @@ -145,8 +139,57 @@ def test_PYTHONCOERCECLOCALE_set_to_zero(self): self._check_c_locale_coercion("ascii", coerce_c_locale="0") +# Details of the shared library warning emitted at runtime +LIBRARY_C_LOCALE_WARNING = ( + "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " + "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " + "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " + "locales is recommended.\n" +) + +# TODO: Make this conditional on the PY_WARN_ON_C_LOCALE sysconfig var +class EmbeddingTests(unittest.TestCase): + def setUp(self): + here = os.path.abspath(__file__) + basepath = os.path.dirname(os.path.dirname(os.path.dirname(here))) + self.test_exe = exe = os.path.join(basepath, "Programs", "_testembed") + if not os.path.exists(exe): + self.skipTest("%r doesn't exist" % exe) + # This is needed otherwise we get a fatal error: + # "Py_Initialize: Unable to get the locale encoding + # LookupError: no codec search functions registered: can't find encoding" + self.oldcwd = os.getcwd() + os.chdir(basepath) + + def tearDown(self): + os.chdir(self.oldcwd) + + def run_embedded_interpreter(self, *args): + """Runs a test in the embedded interpreter""" + cmd = [self.test_exe] + cmd.extend(args) + p = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True) + (out, err) = p.communicate() + self.assertEqual(p.returncode, 0, + "bad returncode %d, stderr is %r" % + (p.returncode, err)) + return out, err + + def test_library_c_locale_warning(self): + # Checks forced configuration of embedded interpreter IO streams + out, err = self.run_embedded_interpreter("c_locale_warning") + if test.support.verbose > 1: + print() + print(out) + 
print(err) + self.assertEqual(out, "") + self.assertEqual(err, LIBRARY_C_LOCALE_WARNING) + def test_main(): - test.support.run_unittest(LocaleOverrideTest) + test.support.run_unittest(LocaleOverrideTests, EmbeddingTests) test.support.reap_children() if __name__ == "__main__": diff --git a/Programs/_testembed.c b/Programs/_testembed.c index 3f1c96c0763837..7acf989528891b 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -129,6 +129,16 @@ static int test_forced_io_encoding(void) return 0; } +static int test_c_locale_warning(void) +{ + /* Force use of the C locale */ + setenv("LC_ALL", "C", 1); + + _testembed_Py_Initialize(); + Py_Finalize(); + return 0; +} + /* ********************************************************* * List of test cases and the function that implements it. * @@ -150,6 +160,7 @@ struct TestCase static struct TestCase TestCases[] = { { "forced_io_encoding", test_forced_io_encoding }, { "repeated_init_and_subinterpreters", test_repeated_init_and_subinterpreters }, + { "c_locale_warning", test_c_locale_warning }, { NULL, NULL } }; From b4f3a34753df0f9710508b6cda237c6aeab2e4c2 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Mon, 13 Mar 2017 15:59:01 +1000 Subject: [PATCH 08/36] Add C locale coercion and warning build flags * --with(out)-c-locale-coercion for PY_COERCE_C_LOCALE * --with(out)-c-locale-warning for PY_WARN_ON_C_LOCALE --- Lib/test/test_locale_coercion.py | 7 +++-- Programs/_testembed.c | 7 ++++- Programs/python.c | 7 +++-- Python/pylifecycle.c | 4 +++ configure | 54 ++++++++++++++++++++++++++++++++ configure.ac | 34 ++++++++++++++++++++ pyconfig.h.in | 6 ++++ 7 files changed, 114 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_locale_coercion.py b/Lib/test/test_locale_coercion.py index 2752b494ec65bc..06a3753dc95ecc 100644 --- a/Lib/test/test_locale_coercion.py +++ b/Lib/test/test_locale_coercion.py @@ -3,6 +3,7 @@ import unittest import os import sys +import sysconfig import shutil import subprocess import 
test.support @@ -36,8 +37,9 @@ def _set_locale_in_subprocess(locale_name, category): "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour)." ) -# TODO: Make this conditional on the PY_COERCE_C_LOCALE sysconfig var @test.support.cpython_only +@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"), + "C locale coercion disabled at build time") class LocaleOverrideTests(unittest.TestCase): @classmethod @@ -147,7 +149,8 @@ def test_PYTHONCOERCECLOCALE_set_to_zero(self): "locales is recommended.\n" ) -# TODO: Make this conditional on the PY_WARN_ON_C_LOCALE sysconfig var +@unittest.skipUnless(sysconfig.get_config_var("PY_WARN_ON_C_LOCALE"), + "C locale runtime warning disabled at build time") class EmbeddingTests(unittest.TestCase): def setUp(self): here = os.path.abspath(__file__) diff --git a/Programs/_testembed.c b/Programs/_testembed.c index 7acf989528891b..1494452dd7f61e 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -1,4 +1,5 @@ -#include <Python.h> +#include "Python.h" +#include "pyconfig.h" #include <stdio.h> /********************************************************* @@ -131,11 +132,15 @@ static int test_forced_io_encoding(void) static int test_c_locale_warning(void) { +#ifdef PY_WARN_ON_C_LOCALE /* Force use of the C locale */ setenv("LC_ALL", "C", 1); _testembed_Py_Initialize(); Py_Finalize(); +#else + printf("C locale compatibility warning disabled at compile time\n"); +#endif return 0; } diff --git a/Programs/python.c b/Programs/python.c index 7d039c760d83fe..bd91d83e9cc0c0 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -28,6 +28,7 @@ wmain(int argc, wchar_t **argv) * */ +#ifdef PY_COERCE_C_LOCALE static const char *_C_LOCALE_COERCION_WARNING = "Python detected LC_CTYPE=C: %.20s coerced to %.20s (set another locale " "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour).\n"; @@ -116,6 +117,7 @@ _handle_legacy_c_locale(void) } /* No C locale warning here, as Py_Initialize will emit one later */ } +#endif int
main(int argc, char **argv) @@ -125,7 +127,6 @@ main(int argc, char **argv) wchar_t **argv_copy2; int i, res; char *oldloc; - const char *ctype_loc; /* Force malloc() allocator to bootstrap Python */ (void)_PyMem_SetupAllocators("malloc"); @@ -155,12 +156,14 @@ main(int argc, char **argv) /* Reconfigure the locale to the default for this process */ setlocale(LC_ALL, ""); +#ifdef PY_COERCE_C_LOCALE /* When the LC_CTYPE category still claims to be using the C locale, assume configuration error and try for a UTF-8 based locale instead */ - ctype_loc = setlocale(LC_CTYPE, NULL); + const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) { _handle_legacy_c_locale(); } +#endif /* Convert from char to wchar_t based on the locale settings */ for (i = 0; i < argc; i++) { diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index bb3396cc68c001..335e758d6e856a 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -302,6 +302,7 @@ import_init(PyInterpreterState *interp, PyObject *sysmod) } +#ifdef PY_WARN_ON_C_LOCALE static const char *_C_LOCALE_WARNING = "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " @@ -324,6 +325,7 @@ _emit_stderr_warning_for_c_locale(void) } } } +#endif void _Py_InitializeEx_Private(int install_sigs, int install_importlib) @@ -344,7 +346,9 @@ _Py_InitializeEx_Private(int install_sigs, int install_importlib) the locale's charset without having to switch locales. 
*/ setlocale(LC_CTYPE, ""); +#ifdef PY_WARN_ON_C_LOCALE _emit_stderr_warning_for_c_locale(); +#endif #endif if ((p = Py_GETENV("PYTHONDEBUG")) && *p != '\0') diff --git a/configure b/configure index 67cd4f4fa37f29..718bf3ad6b80db 100755 --- a/configure +++ b/configure @@ -834,6 +834,8 @@ with_thread enable_ipv6 with_doc_strings with_pymalloc +with_c_locale_coercion +with_c_locale_warning with_valgrind with_dtrace with_fpectl @@ -1527,6 +1529,12 @@ Optional Packages: deprecated; use --with(out)-threads --with(out)-doc-strings disable/enable documentation strings --with(out)-pymalloc disable/enable specialized mallocs + --with(out)-c-locale-coercion + disable/enable C locale coercion to a UTF-8 based + locale + --with(out)-c-locale-warning + disable/enable locale compatibility warning in the C + locale --with-valgrind Enable Valgrind support --with(out)-dtrace disable/enable DTrace support --with-fpectl enable SIGFPE catching @@ -11055,6 +11063,52 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_pymalloc" >&5 $as_echo "$with_pymalloc" >&6; } +# Check for --with-c-locale-coercion +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-coercion" >&5 +$as_echo_n "checking for --with-c-locale-coercion... " >&6; } + +# Check whether --with-c-locale-coercion was given. +if test "${with_c_locale_coercion+set}" = set; then : + withval=$with_c_locale_coercion; +fi + + +if test -z "$with_c_locale_coercion" +then + with_c_locale_coercion="yes" +fi +if test "$with_c_locale_coercion" != "no" +then + +$as_echo "#define PY_COERCE_C_LOCALE 1" >>confdefs.h + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_coercion" >&5 +$as_echo "$with_c_locale_coercion" >&6; } + +# Check for --with-c-locale-warning +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-c-locale-warning" >&5 +$as_echo_n "checking for --with-c-locale-warning... " >&6; } + +# Check whether --with-c-locale-warning was given. 
+if test "${with_c_locale_warning+set}" = set; then : + withval=$with_c_locale_warning; +fi + + +if test -z "$with_c_locale_warning" +then + with_c_locale_warning="yes" +fi +if test "$with_c_locale_warning" != "no" +then + +$as_echo "#define PY_WARN_ON_C_LOCALE 1" >>confdefs.h + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_c_locale_warning" >&5 +$as_echo "$with_c_locale_warning" >&6; } + # Check for Valgrind support { $as_echo "$as_me:${as_lineno-$LINENO}: checking for --with-valgrind" >&5 $as_echo_n "checking for --with-valgrind... " >&6; } diff --git a/configure.ac b/configure.ac index 6e78bb64d7e951..df0aba4bdb1684 100644 --- a/configure.ac +++ b/configure.ac @@ -3301,6 +3301,40 @@ then fi AC_MSG_RESULT($with_pymalloc) +# Check for --with-c-locale-coercion +AC_MSG_CHECKING(for --with-c-locale-coercion) +AC_ARG_WITH(c-locale-coercion, + AS_HELP_STRING([--with(out)-c-locale-coercion], + [disable/enable C locale coercion to a UTF-8 based locale])) + +if test -z "$with_c_locale_coercion" +then + with_c_locale_coercion="yes" +fi +if test "$with_c_locale_coercion" != "no" +then + AC_DEFINE(PY_COERCE_C_LOCALE, 1, + [Define if you want to coerce the C locale to a UTF-8 based locale]) +fi +AC_MSG_RESULT($with_c_locale_coercion) + +# Check for --with-c-locale-warning +AC_MSG_CHECKING(for --with-c-locale-warning) +AC_ARG_WITH(c-locale-warning, + AS_HELP_STRING([--with(out)-c-locale-warning], + [disable/enable locale compatibility warning in the C locale])) + +if test -z "$with_c_locale_warning" +then + with_c_locale_warning="yes" +fi +if test "$with_c_locale_warning" != "no" +then + AC_DEFINE(PY_WARN_ON_C_LOCALE, 1, + [Define to emit a locale compatibility warning in the C locale]) +fi +AC_MSG_RESULT($with_c_locale_warning) + # Check for Valgrind support AC_MSG_CHECKING([for --with-valgrind]) AC_ARG_WITH([valgrind], diff --git a/pyconfig.h.in b/pyconfig.h.in index 21354a5cb84fe5..37142a1c628253 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -1247,9 
+1247,15 @@ /* Define as the preferred size in bits of long digits */ #undef PYLONG_BITS_IN_DIGIT +/* Define if you want to coerce the C locale to a UTF-8 based locale */ +#undef PY_COERCE_C_LOCALE + /* Define to printf format modifier for Py_ssize_t */ #undef PY_FORMAT_SIZE_T +/* Define to emit a locale compatibility warning in the C locale */ +#undef PY_WARN_ON_C_LOCALE + /* Define if you want to build an interpreter with many run-time checks. */ #undef Py_DEBUG From 4d684a65593ad46d67b1547c4e16f16c6ff09986 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Mon, 13 Mar 2017 17:21:52 +1000 Subject: [PATCH 09/36] Always use C.UTF-8 on Android --- Programs/python.c | 7 +++++++ Python/pylifecycle.c | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Programs/python.c b/Programs/python.c index bd91d83e9cc0c0..b5edebb8fef827 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -153,8 +153,15 @@ main(int argc, char **argv) return 1; } +#ifdef __ANDROID__ + /* Passing "" to setlocale() on Android requests the C locale rather + * than checking environment variables, so request C.UTF-8 explicitly + */ + setlocale(LC_ALL, "C.UTF-8"); +#else /* Reconfigure the locale to the default for this process */ setlocale(LC_ALL, ""); +#endif #ifdef PY_COERCE_C_LOCALE /* When the LC_CTYPE category still claims to be using the C locale, diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 335e758d6e856a..692c0e451fed74 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -341,7 +341,12 @@ _Py_InitializeEx_Private(int install_sigs, int install_importlib) initialized = 1; _Py_Finalizing = NULL; -#ifdef HAVE_SETLOCALE +#ifdef __ANDROID__ + /* Passing "" to setlocale() on Android requests the C locale rather + * than checking environment variables, so request C.UTF-8 explicitly + */ + setlocale(LC_CTYPE, "C.UTF-8"); +#else /* Set up the LC_CTYPE locale, so we can obtain the locale's charset without having to switch locales. 
*/ From 1c3a2706ef276ca66f676218395b9d792ef3ca0e Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Wed, 15 Mar 2017 17:14:51 +1000 Subject: [PATCH 10/36] Fix PYTHONCOERCECLOCALE docs --- Doc/using/cmdline.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index b40f5bc339c648..62b8f70a69f127 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -716,16 +716,17 @@ conflict. .. envvar:: PYTHONCOERCECLOCALE - If set to a non-empty string, causes the main Python command line application + If set to the value ``0``, causes the main Python command line application to skip coercing the legacy ASCII-based C locale to a more capable UTF-8 based alternative. Note that this setting is checked even when the :option:`-E` or :option:`-I` options are used, as it is handled prior to the processing of command line options. - If this variable is *not* set, and the current locale reported for the - ``LC_CTYPE`` category is the default ``C`` locale, then the Python CLI will - attempt to configure one of the following locales for the given locale - categories before loading the interpreter runtime: + If this variable is *not* set, or is set to a value other than ``0``, and + the current locale reported for the ``LC_CTYPE`` category is the default + ``C`` locale, then the Python CLI will attempt to configure one of the + following locales for the given locale categories before loading the + interpreter runtime: * ``C.UTF-8` (``LC_ALL``) * ``C.utf8` (``LC_ALL``) From 7626fcf1e00c05329696c377928cb5e7b398cf99 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Wed, 15 Mar 2017 17:24:44 +1000 Subject: [PATCH 11/36] Use Py_SetStandardStreamEncoding instead of PYTHONIOENCODING - setting PYTHONIOENCODING has unintended side effects on Python 2 instances run in a subprocess (since Python 2 has no `surrogateescape` error handler) - Py_SetStandardStreamEncoding enables surrogateescape for the current process
without any side effects on subprocesses --- Doc/using/cmdline.rst | 5 +++-- Programs/python.c | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 62b8f70a69f127..0f99ee4d620431 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -735,8 +735,9 @@ conflict. If setting one of these locale categories succeeds, then the matching environment variables will be set (both ``LC_ALL` and ``LANG`` for the ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category), - and (if not already set to a non-empty string) :envvar:`PYTHONIOENCODING` - will be set to ``utf-8:surrogateescape``. + and (if :envvar:`PYTHONIOENCODING` is not explicitly set), the text encoding + and error handling for Python's standard streams in the current process + will be set to ``utf-8`` and ``surrogateescape`` respectively. Availability: \*nix diff --git a/Programs/python.c b/Programs/python.c index b5edebb8fef827..6d96b9173cc510 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -80,10 +80,18 @@ _coerce_default_locale_settings(const _LocaleCoercionTarget *target) return; } - /* Set PYTHONIOENCODING if not already set */ - if (setenv("PYTHONIOENCODING", "utf-8:surrogateescape", 0)) { - fprintf(stderr, - "Error setting PYTHONIOENCODING during C locale coercion\n"); + /* Set standard stream encoding if PYTHONIOENCODING is not set + * + * We avoid setting PYTHONIOENCODING, as that can confuse Python 2 + * instances in subprocesses that inherit the environment (as Python + * 2 has no 'surrogateescape' error handler). 
+ * + * If PEP 540 is also implemented, this check will be replaced with + * unconditionally setting PYTHONUTF8=1 + */ + const char *io_encoding = getenv("PYTHONIOENCODING"); + if ((io_encoding == NULL) || (strnlen(io_encoding, 1) == 0)) { + Py_SetStandardStreamEncoding("utf-8", "surrogateescape"); } /* Reconfigure with the overridden environment variables */ From d12b4126779b41c945c210124ba23b1c25ab62f5 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Wed, 15 Mar 2017 17:32:12 +1000 Subject: [PATCH 12/36] Some test cleanups suggested by Barry --- Lib/test/test_locale_coercion.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_locale_coercion.py b/Lib/test/test_locale_coercion.py index 06a3753dc95ecc..27d7623a6f34b9 100644 --- a/Lib/test/test_locale_coercion.py +++ b/Lib/test/test_locale_coercion.py @@ -85,7 +85,7 @@ def test_C_utf8_locale(self): "LC_CTYPE": "", "LC_ALL": "", } - for env_var in ("LC_ALL", "LC_CTYPE", "LANG"): + for env_var in base_var_dict: with self.subTest(env_var=env_var): var_dict = base_var_dict.copy() var_dict[env_var] = "C.UTF-8" @@ -112,7 +112,7 @@ def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): "LC_CTYPE": "", "LC_ALL": "", } - for env_var in ("LC_ALL", "LC_CTYPE", "LANG"): + for env_var in base_var_dict: for locale_to_set in ("", "C", "POSIX", "invalid.ascii"): with self.subTest(env_var=env_var, nominal_locale=locale_to_set, @@ -161,12 +161,9 @@ def setUp(self): # This is needed otherwise we get a fatal error: # "Py_Initialize: Unable to get the locale encoding # LookupError: no codec search functions registered: can't find encoding" - self.oldcwd = os.getcwd() + self.addCleanup(os.chdir, os.getcwd()) os.chdir(basepath) - def tearDown(self): - os.chdir(self.oldcwd) - def run_embedded_interpreter(self, *args): """Runs a test in the embedded interpreter""" cmd = [self.test_exe] From ec4f2eaeef1fb395ed078b8bd8cd24a41e0e9a06 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: 
Wed, 15 Mar 2017 17:34:57 +1000 Subject: [PATCH 13/36] Use more precise name for test file --- Lib/test/{test_locale_coercion.py => test_c_locale_coercion.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Lib/test/{test_locale_coercion.py => test_c_locale_coercion.py} (100%) diff --git a/Lib/test/test_locale_coercion.py b/Lib/test/test_c_locale_coercion.py similarity index 100% rename from Lib/test/test_locale_coercion.py rename to Lib/test/test_c_locale_coercion.py From 4e6d5029601a74b12807f5e776964cd062c2194c Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Wed, 15 Mar 2017 19:24:10 +1000 Subject: [PATCH 14/36] Check standard stream settings in locale coercion tests --- Lib/test/test_c_locale_coercion.py | 121 +++++++++++++++++++---------- 1 file changed, 80 insertions(+), 41 deletions(-) diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index 27d7623a6f34b9..498e3dccd35fa4 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -6,6 +6,8 @@ import sysconfig import shutil import subprocess +from collections import namedtuple + import test.support from test.support.script_helper import ( run_python_until_end, @@ -37,65 +39,99 @@ def _set_locale_in_subprocess(locale_name, category): "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour)." 
) -@test.support.cpython_only -@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"), - "C locale coercion disabled at build time") -class LocaleOverrideTests(unittest.TestCase): +_EncodingDetails = namedtuple("EncodingDetails", + "fsencoding stdin_info stdout_info stderr_info") + +class EncodingDetails(_EncodingDetails): + CHILD_PROCESS_SCRIPT = ";".join([ + "import sys", + "print(sys.getfilesystemencoding())", + "print(sys.stdin.encoding + ':' + sys.stdin.errors)", + "print(sys.stdout.encoding + ':' + sys.stdout.errors)", + "print(sys.stderr.encoding + ':' + sys.stderr.errors)", + ]) @classmethod - def setUpClass(cls): - for target_locale, target_category, env_updates in _C_UTF8_LOCALES: - if _set_locale_in_subprocess(target_locale, target_category): - break - else: - raise unittest.SkipTest("No C-with-UTF-8 locale available") - cls.EXPECTED_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format( - env_updates, target_locale - ) + def get_expected_details(cls, expected_fsencoding): + """Returns expected child process details for a given encoding""" + _stream = expected_fsencoding + ":{}" + # stdin and stdout should use surrogateescape either because the + # coercion triggered, or because the C locale was detected + stream_info = 2*[_stream.format("surrogateescape")] + # stderr should always use backslashreplace + stream_info.append(_stream.format("backslashreplace")) + return dict(cls(expected_fsencoding, *stream_info)._asdict()) + + @staticmethod + def _replace_ascii_alias(data): + """ASCII may be reported as ANSI_X3.4-1968, so replace it in output""" + return data.replace(b"ANSI_X3.4-1968", b"ascii") - def _get_child_fsencoding(self, env_vars): - """Retrieves sys.getfilesystemencoding() from a child process + @classmethod + def get_child_details(cls, env_vars): + """Retrieves fsencoding and standard stream details from a child process - Returns (fsencoding, stderr_lines): + Returns (encoding_details, stderr_lines): - - fsencoding: a lowercase str 
value with the child's fsencoding + - encoding_details: EncodingDetails for eager decoding - stderr_lines: result of calling splitlines() on the stderr output The child is run in isolated mode if the current interpreter supports that. """ - cmd = "import sys; print(sys.getfilesystemencoding().lower())" result, py_cmd = run_python_until_end( - "-c", cmd, + "-c", cls.CHILD_PROCESS_SCRIPT, __isolated=True, **env_vars ) if not result.rc == 0: result.fail(py_cmd) # All subprocess outputs in this test case should be pure ASCII - child_fsencoding = result.out.decode("ascii").rstrip() - child_stderr_lines = result.err.decode("ascii").rstrip().splitlines() - return child_fsencoding, child_stderr_lines + adjusted_output = cls._replace_ascii_alias(result.out) + stdout_lines = adjusted_output.decode("ascii").rstrip().splitlines() + child_encoding_details = dict(cls(*stdout_lines)._asdict()) + stderr_lines = result.err.decode("ascii").rstrip().splitlines() + return child_encoding_details, stderr_lines - def test_C_utf8_locale(self): - # Ensure the C.UTF-8 locale is accepted entirely without complaint - base_var_dict = { - "LANG": "", - "LC_CTYPE": "", - "LC_ALL": "", - } - for env_var in base_var_dict: - with self.subTest(env_var=env_var): - var_dict = base_var_dict.copy() - var_dict[env_var] = "C.UTF-8" - fsencoding, stderr_lines = self._get_child_fsencoding(var_dict) - self.assertEqual(fsencoding, "utf-8") - self.assertFalse(stderr_lines) +@test.support.cpython_only +@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"), + "C locale coercion disabled at build time") +class LocaleOverrideTests(unittest.TestCase): + + @classmethod + def setUpClass(cls): + for target_locale, target_category, env_updates in _C_UTF8_LOCALES: + if _set_locale_in_subprocess(target_locale, target_category): + break + else: + raise unittest.SkipTest("No C-with-UTF-8 locale available") + cls.EXPECTED_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format( + env_updates, target_locale + ) + + 
def _check_child_encoding_details(self, + env_vars, + expected_fsencoding, + expected_warning): + """Check the C locale handling for various configurations + + Parameters: + expected_fsencoding: the encoding the child is expected to report + allow_c_locale: setting to use for PYTHONALLOWCLOCALE + None: don't set the variable at all + str: the value set in the child's environment + """ + result = EncodingDetails.get_child_details(env_vars) + encoding_details, stderr_lines = result + self.assertEqual(encoding_details, + EncodingDetails.get_expected_details( + expected_fsencoding)) + self.assertEqual(stderr_lines, expected_warning) def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): - """Check the handling of the C locale for various configurations + """Check the C locale handling for various configurations Parameters: expected_fsencoding: the encoding the child is expected to report @@ -103,10 +139,13 @@ def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): None: don't set the variable at all str: the value set in the child's environment """ + + # Check for expected warning on stderr if C locale is coerced expected_warning = [] if coerce_c_locale != "0": - # Check C locale is coerced with a warning on stderr expected_warning.append(self.EXPECTED_COERCION_WARNING) + + self.maxDiff = None base_var_dict = { "LANG": "", "LC_CTYPE": "", @@ -121,9 +160,9 @@ def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): var_dict[env_var] = locale_to_set if coerce_c_locale is not None: var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale - fsencoding, stderr_lines = self._get_child_fsencoding(var_dict) - self.assertEqual(fsencoding, expected_fsencoding) - self.assertEqual(stderr_lines, expected_warning) + self._check_child_encoding_details(var_dict, + expected_fsencoding, + expected_warning) def test_test_PYTHONCOERCECLOCALE_not_set(self): From ccfc83f04a3b7d5ac46a82dc4836e2f7f46702ee Mon Sep 17 00:00:00 2001 From: Nick 
Coghlan Date: Wed, 15 Mar 2017 19:28:50 +1000 Subject: [PATCH 15/36] Use US spelling --- Lib/test/test_c_locale_coercion.py | 2 +- Programs/python.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index 498e3dccd35fa4..0ff49cdc48b292 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -36,7 +36,7 @@ def _set_locale_in_subprocess(locale_name, category): # Details of the CLI warning emitted at runtime CLI_COERCION_WARNING_FMT = ( "Python detected LC_CTYPE=C: {} coerced to {} (set another locale " - "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour)." + "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)." ) _EncodingDetails = namedtuple("EncodingDetails", diff --git a/Programs/python.c b/Programs/python.c index 6d96b9173cc510..15bfcb95ae7a37 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -31,7 +31,7 @@ wmain(int argc, wchar_t **argv) #ifdef PY_COERCE_C_LOCALE static const char *_C_LOCALE_COERCION_WARNING = "Python detected LC_CTYPE=C: %.20s coerced to %.20s (set another locale " - "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behaviour).\n"; + "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n"; typedef struct _CandidateLocale { const char *locale_name; From b173af3ac2713bde6ee38018f35043a8254f6466 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Wed, 15 Mar 2017 20:07:43 +1000 Subject: [PATCH 16/36] Helper function to query PYTHONCOERCECLOCALE --- Programs/python.c | 6 ++++-- Python/pylifecycle.c | 33 ++++++++++++++++++++++++++------- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/Programs/python.c b/Programs/python.c index 15bfcb95ae7a37..e8838a2fe0cdc3 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -29,6 +29,9 @@ wmain(int argc, wchar_t **argv) */ #ifdef PY_COERCE_C_LOCALE +/* Access private pylifecycle API to check 
PYTHONCOERCECLOCALE */ +extern int _Py_CLocaleCoercionIsExpected(void); + static const char *_C_LOCALE_COERCION_WARNING = "Python detected LC_CTYPE=C: %.20s coerced to %.20s (set another locale " "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n"; @@ -101,7 +104,6 @@ _coerce_default_locale_settings(const _LocaleCoercionTarget *target) void _handle_legacy_c_locale(void) { - const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); /* We ignore the Python -E and -I flags here, as we need to sort out * the locale settings *before* we try to do anything with the command * line arguments. For cross-platform debugging purposes, we also need @@ -109,7 +111,7 @@ _handle_legacy_c_locale(void) * isolated from their environment to use the legacy ASCII-centric C * locale. */ - if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { + if (_Py_CLocaleCoercionIsExpected()) { /* PYTHONCOERCECLOCALE is not set, or is not set to exactly "0" */ const _LocaleCoercionTarget *target = NULL; for (target = _TARGET_LOCALES; target->locale_name; target++) { diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 692c0e451fed74..bb6411b9bedcbc 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -167,6 +167,7 @@ Py_SetStandardStreamEncoding(const char *encoding, const char *errors) return 0; } + /* Global initializations. Can be undone by Py_FinalizeEx(). Don't call this twice without an intervening Py_FinalizeEx() call. 
When initializations fail, a fatal error is issued and the function does @@ -302,6 +303,30 @@ import_init(PyInterpreterState *interp, PyObject *sysmod) } +/* Private helper to check whether or not Python expects the C locale to be + * coerced to a UTF-8 based locale prior to calling Py_Initialize + * + * Returns 1 if C locale coercion is expected + * Returns 0 if locale coercion is not expected, either due to it being disabled + * at build time, or due to PYTHONCOERCECLOCALE=0 being set in the environment + * + * May be called prior to Py_Initialize and without holding the GIL. + */ +int +_Py_CLocaleCoercionIsExpected(void) +{ +#ifdef PY_COERCE_C_LOCALE + /* This may be called prior to Py_Initialize, so we don't call any other + * Python APIs, and we ignore the -E and -I flags + */ + const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); + if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { + return 1; + } +#endif + return 0; +} + #ifdef PY_WARN_ON_C_LOCALE static const char *_C_LOCALE_WARNING = "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " @@ -312,13 +337,7 @@ static const char *_C_LOCALE_WARNING = static void _emit_stderr_warning_for_c_locale(void) { - const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); - /* We don't emit a warning if locale coercion has been explicitly disabled. - * - * For consistency with the corresponding check in Programs/python.c - * we ignore the Python -E and -I flags here. 
- */ - if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { + if (_Py_CLocaleCoercionIsExpected()) { const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) { fprintf(stderr, "%s", _C_LOCALE_WARNING); From d099a52cd860c816829b1d4e871e5f9b1b30a7d1 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Wed, 15 Mar 2017 20:20:14 +1000 Subject: [PATCH 17/36] Fix ReST markup --- Doc/using/cmdline.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 0f99ee4d620431..11b85bc4976c31 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -728,12 +728,12 @@ conflict. following locales for the given locale categories before loading the interpreter runtime: - * ``C.UTF-8` (``LC_ALL``) - * ``C.utf8` (``LC_ALL``) - * ``UTF-8` (``LC_CTYPE``) + * ``C.UTF-8`` (``LC_ALL``) + * ``C.utf8`` (``LC_ALL``) + * ``UTF-8`` (``LC_CTYPE``) If setting one of these locale categories succeeds, then the matching - environment variables will be set (both ``LC_ALL` and ``LANG`` for the + environment variables will be set (both ``LC_ALL`` and ``LANG`` for the ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category), and (if :envvar:`PYTHONIOENCODING` is not explicitly set), the text encoding and error handling for Python's standard streams in the current process From 762a09b82d37067720a286d190eb2efeebeab97a Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Wed, 15 Mar 2017 21:25:13 +1000 Subject: [PATCH 18/36] Fix Py_DEBUG/Py_SetStandardStreamEncoding compatibility problem --- Programs/python.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Programs/python.c b/Programs/python.c index e8838a2fe0cdc3..1a6c9e145d8320 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -139,7 +139,11 @@ main(int argc, char **argv) char *oldloc; /* Force malloc() allocator to bootstrap Python */ +#ifdef Py_DEBUG + 
(void)_PyMem_SetupAllocators("malloc_debug"); +# else (void)_PyMem_SetupAllocators("malloc"); +# endif argv_copy = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); argv_copy2 = (wchar_t **)PyMem_RawMalloc(sizeof(wchar_t*) * (argc+1)); @@ -203,7 +207,11 @@ main(int argc, char **argv) /* Force again malloc() allocator to release memory blocks allocated before Py_Main() */ +#ifdef Py_DEBUG + (void)_PyMem_SetupAllocators("malloc_debug"); +# else (void)_PyMem_SetupAllocators("malloc"); +# endif for (i = 0; i < argc; i++) { PyMem_RawFree(argv_copy2[i]); From 820bfadcb282244654a22e83e37ee17067245ecb Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 18 Mar 2017 00:16:23 +1000 Subject: [PATCH 19/36] Restore Windows _testembed compatibility Windows doesn't use setenv to set environment variables, so set PYTHONIOENCODING from test_capi instead of _testembed when running the forced_io_encoding test. --- Lib/test/test_capi.py | 9 +++++---- Programs/_testembed.c | 3 --- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py index ece84af5daa30a..391ca15750ce46 100644 --- a/Lib/test/test_capi.py +++ b/Lib/test/test_capi.py @@ -369,14 +369,15 @@ def setUp(self): def tearDown(self): os.chdir(self.oldcwd) - def run_embedded_interpreter(self, *args): + def run_embedded_interpreter(self, *args, env=None): """Runs a test in the embedded interpreter""" cmd = [self.test_exe] cmd.extend(args) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, - universal_newlines=True) + universal_newlines=True, + env=env) (out, err) = p.communicate() self.assertEqual(p.returncode, 0, "bad returncode %d, stderr is %r" % @@ -403,7 +404,8 @@ def _get_default_pipe_encoding(): def test_forced_io_encoding(self): # Checks forced configuration of embedded interpreter IO streams - out, err = self.run_embedded_interpreter("forced_io_encoding") + env = {"PYTHONIOENCODING": "UTF-8:surrogateescape"} + out, err = 
self.run_embedded_interpreter("forced_io_encoding", env=env) if support.verbose > 1: print() print(out) @@ -412,7 +414,6 @@ def test_forced_io_encoding(self): expected_stdin_encoding = "UTF-8" expected_pipe_encoding = self._get_default_pipe_encoding() expected_output = '\n'.join([ - "Setting PYTHONIOENCODING=UTF-8:surrogateescape", "--- Use defaults ---", "Expected encoding: default", "Expected errors: default", diff --git a/Programs/_testembed.c b/Programs/_testembed.c index 1494452dd7f61e..e28de1c7fb1c4a 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -107,9 +107,6 @@ static void check_stdio_details(const char *encoding, const char * errors) static int test_forced_io_encoding(void) { - /* Ensure consistent "defaults" */ - printf("Setting PYTHONIOENCODING=UTF-8:surrogateescape\n"); - setenv("PYTHONIOENCODING", "UTF-8:surrogateescape", 1); /* Check various combinations */ printf("--- Use defaults ---\n"); check_stdio_details(NULL, NULL); From 188e7807b6d9e49377aacbb287c074e5cabf70c5 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 6 May 2017 22:18:20 +1000 Subject: [PATCH 20/36] Update to latest version of PEP 538 - move all required logic inside the shared library - explicitly setting one of the coercion target locales now also automatically enables "surrogateescape" on sys.stdin and sys.stdout --- Doc/using/cmdline.rst | 11 ++- Lib/test/test_c_locale_coercion.py | 57 +++++++++-- Programs/python.c | 114 +-------------------- Python/pylifecycle.c | 154 +++++++++++++++++++++++++---- 4 files changed, 196 insertions(+), 140 deletions(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 11b85bc4976c31..c6ec1470db4eaf 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -734,10 +734,13 @@ conflict. 
If setting one of these locale categories succeeds, then the matching environment variables will be set (both ``LC_ALL`` and ``LANG`` for the - ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category), - and (if :envvar:`PYTHONIOENCODING` is not explicitly set), the text encoding - and error handling for Python's standard streams in the current process - will be set to ``utf-8`` and ``surrogateescape`` respectively. + ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category) in + the current process environment before the Python runtime is initialized. + + Configuring one of these locales (either explicitly or via the above + implicit locale coercion) will automatically set the error handler for + :data:`sys.stdin` and :data:`sys.stdout` to ``surrogateescape``. This + behavior can be overridden using :envvar:`PYTHONIOENCODING` as usual. Availability: \*nix diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index 0ff49cdc48b292..4ac0cb00d5a2ec 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -15,7 +15,7 @@ ) # In order to get the warning messages to match up as expected, the candidate -# order here must much the target locale order in Programs/python.c +# order here must much the target locale order in Python/pylifecycle.c _C_UTF8_LOCALES = ( # Entries: (Target locale, target category, expected env var updates) ("C.UTF-8", "LC_ALL", "LC_ALL & LANG"), @@ -63,9 +63,13 @@ def get_expected_details(cls, expected_fsencoding): return dict(cls(expected_fsencoding, *stream_info)._asdict()) @staticmethod - def _replace_ascii_alias(data): - """ASCII may be reported as ANSI_X3.4-1968, so replace it in output""" - return data.replace(b"ANSI_X3.4-1968", b"ascii") + def _handle_output_variations(data): + """Adjust the output to handle platform specific idiosyncrasies + + * Some platforms report ASCII as ANSI_X3.4-1968 + * Some platforms report UTF-8 instead of utf-8 + """ + return 
data.replace(b"ANSI_X3.4-1968", b"ascii").lower() @classmethod def get_child_details(cls, env_vars): @@ -87,7 +91,7 @@ def get_child_details(cls, env_vars): if not result.rc == 0: result.fail(py_cmd) # All subprocess outputs in this test case should be pure ASCII - adjusted_output = cls._replace_ascii_alias(result.out) + adjusted_output = cls._handle_output_variations(result.out) stdout_lines = adjusted_output.decode("ascii").rstrip().splitlines() child_encoding_details = dict(cls(*stdout_lines)._asdict()) stderr_lines = result.err.decode("ascii").rstrip().splitlines() @@ -99,15 +103,24 @@ def get_child_details(cls, env_vars): "C locale coercion disabled at build time") class LocaleOverrideTests(unittest.TestCase): + available_targets = [] + @classmethod def setUpClass(cls): + first_target_locale = first_env_updates = None + available_targets = cls.available_targets + # Find the target locales available in the current system for target_locale, target_category, env_updates in _C_UTF8_LOCALES: if _set_locale_in_subprocess(target_locale, target_category): - break - else: + available_targets.append(target_locale) + if first_target_locale is None: + first_target_locale = target_locale + first_env_updates = env_updates + if not available_targets: raise unittest.SkipTest("No C-with-UTF-8 locale available") + # Expect coercion to use the first available locale cls.EXPECTED_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format( - env_updates, target_locale + first_env_updates, first_target_locale ) def _check_child_encoding_details(self, @@ -141,11 +154,12 @@ def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): """ # Check for expected warning on stderr if C locale is coerced + self.maxDiff = None + expected_warning = [] if coerce_c_locale != "0": expected_warning.append(self.EXPECTED_COERCION_WARNING) - self.maxDiff = None base_var_dict = { "LANG": "", "LC_CTYPE": "", @@ -166,7 +180,7 @@ def _check_c_locale_coercion(self, expected_fsencoding, 
coerce_c_locale): def test_test_PYTHONCOERCECLOCALE_not_set(self): - # This should coerce to the C.UTF-8 locale by default + # This should coerce to the first available target locale by default self._check_c_locale_coercion("utf-8", coerce_c_locale=None) def test_PYTHONCOERCECLOCALE_not_zero(self): @@ -179,6 +193,29 @@ def test_PYTHONCOERCECLOCALE_set_to_zero(self): # The setting "0" should result in the locale coercion being disabled self._check_c_locale_coercion("ascii", coerce_c_locale="0") + def test_external_target_locale_configuration(self): + # Explicitly setting a target locale should give the same behaviour as + # is seen when implicitly coercing to that target locale + self.maxDiff = None + + expected_warning = [] + expected_fsencoding = "utf-8" + + base_var_dict = { + "LANG": "", + "LC_CTYPE": "", + "LC_ALL": "", + } + for env_var in base_var_dict: + for locale_to_set in self.available_targets: + with self.subTest(env_var=env_var, + configured_locale=locale_to_set): + var_dict = base_var_dict.copy() + var_dict[env_var] = locale_to_set + self._check_child_encoding_details(var_dict, + expected_fsencoding, + expected_warning) + # Details of the shared library warning emitted at runtime LIBRARY_C_LOCALE_WARNING = ( diff --git a/Programs/python.c b/Programs/python.c index 1a6c9e145d8320..03f8295045cfc6 100644 --- a/Programs/python.c +++ b/Programs/python.c @@ -15,7 +15,7 @@ wmain(int argc, wchar_t **argv) } #else -/* Helpers to better handle the legacy C locale +/* Access private pylifecycle helper API to better handle the legacy C locale * * The legacy C locale assumes ASCII as the default text encoding, which * causes problems not only for the CPython runtime, but also other @@ -27,107 +27,8 @@ wmain(int argc, wchar_t **argv) * See the documentation of the PYTHONCOERCECLOCALE setting for more details. 
* */ - -#ifdef PY_COERCE_C_LOCALE -/* Access private pylifecycle API to check PYTHONCOERCECLOCALE */ -extern int _Py_CLocaleCoercionIsExpected(void); - -static const char *_C_LOCALE_COERCION_WARNING = - "Python detected LC_CTYPE=C: %.20s coerced to %.20s (set another locale " - "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n"; - -typedef struct _CandidateLocale { - const char *locale_name; - int category; -} _LocaleCoercionTarget; - -static _LocaleCoercionTarget _TARGET_LOCALES[] = { - { "C.UTF-8", LC_ALL }, - { "C.utf8", LC_ALL }, - { "UTF-8", LC_CTYPE }, - { NULL, 0 } -}; - -void -_coerce_default_locale_settings(const _LocaleCoercionTarget *target) -{ - const char *newloc = target->locale_name; - int category = target->category; - - /* Reset locale back to currently configured defaults */ - setlocale(LC_ALL, ""); - - /* Set the relevant locale environment variables */ - if (category == LC_ALL) { - const char *env_vars_updated = "LC_ALL & LANG"; - if (setenv("LC_ALL", newloc, 1)) { - fprintf(stderr, - "Error setting LC_ALL, skipping C locale coercion\n"); - return; - } - if (setenv("LANG", newloc, 1)) { - fprintf(stderr, - "Error setting LANG during C locale coercion\n"); - env_vars_updated = "LC_ALL"; - } - fprintf(stderr, _C_LOCALE_COERCION_WARNING, env_vars_updated, newloc); - } else if (category == LC_CTYPE) { - if (setenv("LC_CTYPE", newloc, 1)) { - fprintf(stderr, - "Error setting LC_CTYPE, skipping C locale coercion\n"); - return; - } - fprintf(stderr, _C_LOCALE_COERCION_WARNING, "LC_CTYPE", newloc); - } else { - fprintf(stderr, "Locale coercion must target LC_ALL or LC_CTYPE\n"); - return; - } - - /* Set standard stream encoding if PYTHONIOENCODING is not set - * - * We avoid setting PYTHONIOENCODING, as that can confuse Python 2 - * instances in subprocesses that inherit the environment (as Python - * 2 has no 'surrogateescape' error handler). 
- * - * If PEP 540 is also implemented, this check will be replaced with - * unconditionally setting PYTHONUTF8=1 - */ - const char *io_encoding = getenv("PYTHONIOENCODING"); - if ((io_encoding == NULL) || (strnlen(io_encoding, 1) == 0)) { - Py_SetStandardStreamEncoding("utf-8", "surrogateescape"); - } - - /* Reconfigure with the overridden environment variables */ - setlocale(LC_ALL, ""); -} - -void -_handle_legacy_c_locale(void) -{ - /* We ignore the Python -E and -I flags here, as we need to sort out - * the locale settings *before* we try to do anything with the command - * line arguments. For cross-platform debugging purposes, we also need - * to give end users a way to force even scripts that are otherwise - * isolated from their environment to use the legacy ASCII-centric C - * locale. - */ - if (_Py_CLocaleCoercionIsExpected()) { - /* PYTHONCOERCECLOCALE is not set, or is not set to exactly "0" */ - const _LocaleCoercionTarget *target = NULL; - for (target = _TARGET_LOCALES; target->locale_name; target++) { - const char *reconfigured_locale = setlocale(target->category, - target->locale_name); - if (reconfigured_locale != NULL) { - /* Successfully configured locale, so make it the default */ - _coerce_default_locale_settings(target); - return; - } - } - - } - /* No C locale warning here, as Py_Initialize will emit one later */ -} -#endif +extern int _Py_LegacyLocaleDetected(void); +extern void _Py_CoerceLegacyLocale(void); int main(int argc, char **argv) @@ -177,14 +78,9 @@ main(int argc, char **argv) setlocale(LC_ALL, ""); #endif -#ifdef PY_COERCE_C_LOCALE - /* When the LC_CTYPE category still claims to be using the C locale, - assume configuration error and try for a UTF-8 based locale instead */ - const char *ctype_loc = setlocale(LC_CTYPE, NULL); - if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) { - _handle_legacy_c_locale(); + if (_Py_LegacyLocaleDetected()) { + _Py_CoerceLegacyLocale(); } -#endif /* Convert from char to wchar_t based on the 
locale settings */ for (i = 0; i < argc; i++) { diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index aa10a01f67ad26..ee11b5fdc96ed3 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -303,19 +303,115 @@ import_init(PyInterpreterState *interp, PyObject *sysmod) } -/* Private helper to check whether or not Python expects the C locale to be - * coerced to a UTF-8 based locale prior to calling Py_Initialize +/* Helper functions to better handle the legacy C locale * - * Returns 1 if C locale coercion is expected - * Returns 0 if locale coercion is not expected, either due to it being disabled - * at build time, or due to PYTHONCOERCECLOCALE=0 being set in the environment + * The legacy C locale assumes ASCII as the default text encoding, which + * causes problems not only for the CPython runtime, but also other + * components like GNU readline. * - * May be called prior to Py_Initialize and without holding the GIL. + * Accordingly, when the CLI detects it, it attempts to coerce it to a + * more capable UTF-8 based alternative as follows: + * + * if (_Py_LegacyLocaleDetected()) { + * _Py_CoerceLegacyLocale(); + * } + * + * See the documentation of the PYTHONCOERCECLOCALE setting for more details. + * + * Locale coercion also impacts the default error handler for the standard + * streams: while the usual default is "strict", the default for the legacy + * C locale and for any of the coercion target locales is "surrogateescape". 
*/ + int -_Py_CLocaleCoercionIsExpected(void) +_Py_LegacyLocaleDetected(void) { + const char *ctype_loc = setlocale(LC_CTYPE, NULL); + return ctype_loc != NULL && strcmp(ctype_loc, "C") == 0; +} + +typedef struct _CandidateLocale { + const char *locale_name; + int category; +} _LocaleCoercionTarget; + +static _LocaleCoercionTarget _TARGET_LOCALES[] = { + { "C.UTF-8", LC_ALL }, + { "C.utf8", LC_ALL }, + { "UTF-8", LC_CTYPE }, + { NULL, 0 } +}; + +static char * +get_default_standard_stream_error_handler(void) +{ + const char *ctype_loc = setlocale(LC_CTYPE, NULL); + if (ctype_loc != NULL) { + /* "surrogateescape" is the default in the legacy C locale */ + if (strcmp(ctype_loc, "C") == 0) { + return "surrogateescape"; + } + + /* "surrogateescape" is the default in locale coercion target locales */ + const _LocaleCoercionTarget *target = NULL; + for (target = _TARGET_LOCALES; target->locale_name; target++) { + if (strcmp(ctype_loc, target->locale_name) == 0) { + return "surrogateescape"; + } + } + } + + /* Otherwise return NULL to request the typical default error handler */ + return NULL; +} + #ifdef PY_COERCE_C_LOCALE +static const char *_C_LOCALE_COERCION_WARNING = + "Python detected LC_CTYPE=C: %.20s coerced to %.20s (set another locale " + "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior).\n"; + +static void +_coerce_default_locale_settings(const _LocaleCoercionTarget *target) +{ + const char *newloc = target->locale_name; + int category = target->category; + + /* Reset locale back to currently configured defaults */ + setlocale(LC_ALL, ""); + + /* Set the relevant locale environment variables */ + if (category == LC_ALL) { + const char *env_vars_updated = "LC_ALL & LANG"; + if (setenv("LC_ALL", newloc, 1)) { + fprintf(stderr, + "Error setting LC_ALL, skipping C locale coercion\n"); + return; + } + if (setenv("LANG", newloc, 1)) { + fprintf(stderr, + "Error setting LANG during C locale coercion\n"); + env_vars_updated = "LC_ALL"; + } + 
fprintf(stderr, _C_LOCALE_COERCION_WARNING, env_vars_updated, newloc); + } else if (category == LC_CTYPE) { + if (setenv("LC_CTYPE", newloc, 1)) { + fprintf(stderr, + "Error setting LC_CTYPE, skipping C locale coercion\n"); + return; + } + fprintf(stderr, _C_LOCALE_COERCION_WARNING, "LC_CTYPE", newloc); + } else { + fprintf(stderr, "Locale coercion must target LC_ALL or LC_CTYPE\n"); + return; + } + + /* Reconfigure with the overridden environment variables */ + setlocale(LC_ALL, ""); +} + +static int +c_locale_coercion_is_expected(void) +{ /* This may be called prior to Py_Initialize, so we don't call any other * Python APIs, and we ignore the -E and -I flags */ @@ -323,9 +419,38 @@ _Py_CLocaleCoercionIsExpected(void) if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { return 1; } -#endif return 0; } +#endif + +void +_Py_CoerceLegacyLocale(void) +{ +#ifdef PY_COERCE_C_LOCALE + /* We ignore the Python -E and -I flags here, as the CLI needs to sort out + * the locale settings *before* we try to do anything with the command + * line arguments. For cross-platform debugging purposes, we also need + * to give end users a way to force even scripts that are otherwise + * isolated from their environment to use the legacy ASCII-centric C + * locale. 
+ */ + if (c_locale_coercion_is_expected()) { + /* PYTHONCOERCECLOCALE is not set, or is not set to exactly "0" */ + const _LocaleCoercionTarget *target = NULL; + for (target = _TARGET_LOCALES; target->locale_name; target++) { + const char *reconfigured_locale = setlocale(target->category, + target->locale_name); + if (reconfigured_locale != NULL) { + /* Successfully configured locale, so make it the default */ + _coerce_default_locale_settings(target); + return; + } + } + } + /* No C locale warning here, as Py_Initialize will emit one later */ +#endif +} + #ifdef PY_WARN_ON_C_LOCALE static const char *_C_LOCALE_WARNING = @@ -337,9 +462,8 @@ static const char *_C_LOCALE_WARNING = static void _emit_stderr_warning_for_c_locale(void) { - if (_Py_CLocaleCoercionIsExpected()) { - const char *ctype_loc = setlocale(LC_CTYPE, NULL); - if (ctype_loc != NULL && strcmp(ctype_loc, "C") == 0) { + if (c_locale_coercion_is_expected()) { + if (_Py_LegacyLocaleDetected()) { fprintf(stderr, "%s", _C_LOCALE_WARNING); } } @@ -1304,12 +1428,8 @@ initstdio(void) } } if (!errors && !(pythonioencoding && *pythonioencoding)) { - /* When the LC_CTYPE locale is the POSIX locale ("C locale"), - stdin and stdout use the surrogateescape error handler by - default, instead of the strict error handler. */ - char *loc = setlocale(LC_CTYPE, NULL); - if (loc != NULL && strcmp(loc, "C") == 0) - errors = "surrogateescape"; + /* Choose the default error handler based on the current locale */ + errors = get_default_standard_stream_error_handler(); } } From 476a78133c94d82e19b89f50036cecd9b4214e7a Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Tue, 9 May 2017 17:05:59 +1000 Subject: [PATCH 21/36] Change locale coercion to always respect LC_ALL Locale coercion no longer has any effect if LC_ALL is explicitly set in the environment. When locale coercion triggers, it sets either both LC_CTYPE & LANG (for full locales) or only LC_CTYPE (for partial locales). 
This change also eliminated the need for a custom test case for the locale coercion warning - instead, the test suite is able to check for that just by setting LC_ALL in the child process environment. --- Lib/test/test_c_locale_coercion.py | 219 ++++++++++++++--------------- Programs/_testembed.c | 15 -- Python/pylifecycle.c | 62 ++++---- 3 files changed, 136 insertions(+), 160 deletions(-) diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index 4ac0cb00d5a2ec..ad6ecac422e8dd 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -17,28 +17,22 @@ # In order to get the warning messages to match up as expected, the candidate # order here must much the target locale order in Python/pylifecycle.c _C_UTF8_LOCALES = ( - # Entries: (Target locale, target category, expected env var updates) - ("C.UTF-8", "LC_ALL", "LC_ALL & LANG"), - ("C.utf8", "LC_ALL", "LC_ALL & LANG"), - ("UTF-8", "LC_CTYPE", "LC_CTYPE"), + # Entries: (Target locale, expected env var updates) + ("C.UTF-8", "LC_CTYPE & LANG"), + ("C.utf8", "LC_CTYPE & LANG"), + ("UTF-8", "LC_CTYPE"), ) # There's no reliable cross-platform way of checking locale alias # lists, so the only way of knowing which of these locales will work # is to try them with locale.setlocale(). We do that in a subprocess # to avoid altering the locale of the test runner. 
-def _set_locale_in_subprocess(locale_name, category): - cmd_fmt = "import locale; print(locale.setlocale(locale.{}, '{}'))" - cmd = cmd_fmt.format(category, locale_name) +def _set_locale_in_subprocess(locale_name): + cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))" + cmd = cmd_fmt.format(locale_name) result, py_cmd = run_python_until_end("-c", cmd, __isolated=True) return result.rc == 0 -# Details of the CLI warning emitted at runtime -CLI_COERCION_WARNING_FMT = ( - "Python detected LC_CTYPE=C: {} coerced to {} (set another locale " - "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)." -) - _EncodingDetails = namedtuple("EncodingDetails", "fsencoding stdin_info stdout_info stderr_info") @@ -67,9 +61,13 @@ def _handle_output_variations(data): """Adjust the output to handle platform specific idiosyncrasies * Some platforms report ASCII as ANSI_X3.4-1968 + * Some platforms report ASCII as US-ASCII * Some platforms report UTF-8 instead of utf-8 """ - return data.replace(b"ANSI_X3.4-1968", b"ascii").lower() + data = data.replace(b"ANSI_X3.4-1968", b"ascii") + data = data.replace(b"US-ASCII", b"ascii") + data = data.lower() + return data @classmethod def get_child_details(cls, env_vars): @@ -98,50 +96,116 @@ def get_child_details(cls, env_vars): return child_encoding_details, stderr_lines -@test.support.cpython_only -@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"), - "C locale coercion disabled at build time") -class LocaleOverrideTests(unittest.TestCase): +class _ChildProcessEncodingTestCase(unittest.TestCase): + # Base class to check for expected encoding details in a child process + + def _check_child_encoding_details(self, + env_vars, + expected_fsencoding, + expected_warning): + """Check the C locale handling for the given process environment + + Parameters: + expected_fsencoding: the encoding the child is expected to report + allow_c_locale: setting to use for PYTHONALLOWCLOCALE + None: don't set the 
variable at all + str: the value set in the child's environment + """ + result = EncodingDetails.get_child_details(env_vars) + encoding_details, stderr_lines = result + self.assertEqual(encoding_details, + EncodingDetails.get_expected_details( + expected_fsencoding)) + self.assertEqual(stderr_lines, expected_warning) + +# Details of the shared library warning emitted at runtime +LIBRARY_C_LOCALE_WARNING = ( + "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " + "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " + "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " + "locales is recommended." +) + +@unittest.skipUnless(sysconfig.get_config_var("PY_WARN_ON_C_LOCALE"), + "C locale runtime warning disabled at build time") +class LocaleWarningTests(_ChildProcessEncodingTestCase): + # Test warning emitted when running in the C locale + + def test_library_c_locale_warning(self): + self.maxDiff = None + for locale_to_set in ("C", "POSIX", "invalid.ascii"): + var_dict = { + "LC_ALL": locale_to_set + } + with self.subTest(forced_locale=locale_to_set): + self._check_child_encoding_details(var_dict, + "ascii", + [LIBRARY_C_LOCALE_WARNING]) + +# Details of the CLI locale coercion warning emitted at runtime +CLI_COERCION_WARNING_FMT = ( + "Python detected LC_CTYPE=C: {} coerced to {} (set another locale " + "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)." 
+) + +class _LocaleCoercionTargetsTestCase(_ChildProcessEncodingTestCase): + # Base class for test cases that rely on coercion targets being defined available_targets = [] + targets_required = True @classmethod def setUpClass(cls): first_target_locale = first_env_updates = None available_targets = cls.available_targets # Find the target locales available in the current system - for target_locale, target_category, env_updates in _C_UTF8_LOCALES: - if _set_locale_in_subprocess(target_locale, target_category): + for target_locale, env_updates in _C_UTF8_LOCALES: + if _set_locale_in_subprocess(target_locale): available_targets.append(target_locale) if first_target_locale is None: first_target_locale = target_locale first_env_updates = env_updates - if not available_targets: + if cls.targets_required and not available_targets: raise unittest.SkipTest("No C-with-UTF-8 locale available") # Expect coercion to use the first available locale cls.EXPECTED_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format( first_env_updates, first_target_locale ) - def _check_child_encoding_details(self, - env_vars, - expected_fsencoding, - expected_warning): - """Check the C locale handling for various configurations - Parameters: - expected_fsencoding: the encoding the child is expected to report - allow_c_locale: setting to use for PYTHONALLOWCLOCALE - None: don't set the variable at all - str: the value set in the child's environment - """ - result = EncodingDetails.get_child_details(env_vars) - encoding_details, stderr_lines = result - self.assertEqual(encoding_details, - EncodingDetails.get_expected_details( - expected_fsencoding)) - self.assertEqual(stderr_lines, expected_warning) +class LocaleConfigurationTests(_LocaleCoercionTargetsTestCase): + # Test explicit external configuration via the process environment + def test_external_target_locale_configuration(self): + # Explicitly setting a target locale should give the same behaviour as + # is seen when implicitly coercing to that 
target locale + self.maxDiff = None + + expected_warning = [] + expected_fsencoding = "utf-8" + + base_var_dict = { + "LANG": "", + "LC_CTYPE": "", + "LC_ALL": "", + } + for env_var in ("LANG", "LC_CTYPE"): + for locale_to_set in self.available_targets: + with self.subTest(env_var=env_var, + configured_locale=locale_to_set): + var_dict = base_var_dict.copy() + var_dict[env_var] = locale_to_set + self._check_child_encoding_details(var_dict, + expected_fsencoding, + expected_warning) + + + +@test.support.cpython_only +@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"), + "C locale coercion disabled at build time") +class LocaleCoercionTests(_LocaleCoercionTargetsTestCase): + # Test implicit reconfiguration of the environment during CLI startup def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): """Check the C locale handling for various configurations @@ -165,7 +229,7 @@ def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): "LC_CTYPE": "", "LC_ALL": "", } - for env_var in base_var_dict: + for env_var in ("LANG", "LC_CTYPE"): for locale_to_set in ("", "C", "POSIX", "invalid.ascii"): with self.subTest(env_var=env_var, nominal_locale=locale_to_set, @@ -178,7 +242,6 @@ def _check_c_locale_coercion(self, expected_fsencoding, coerce_c_locale): expected_fsencoding, expected_warning) - def test_test_PYTHONCOERCECLOCALE_not_set(self): # This should coerce to the first available target locale by default self._check_c_locale_coercion("utf-8", coerce_c_locale=None) @@ -193,79 +256,13 @@ def test_PYTHONCOERCECLOCALE_set_to_zero(self): # The setting "0" should result in the locale coercion being disabled self._check_c_locale_coercion("ascii", coerce_c_locale="0") - def test_external_target_locale_configuration(self): - # Explicitly setting a target locale should give the same behaviour as - # is seen when implicitly coercing to that target locale - self.maxDiff = None - - expected_warning = [] - expected_fsencoding = 
"utf-8" - - base_var_dict = { - "LANG": "", - "LC_CTYPE": "", - "LC_ALL": "", - } - for env_var in base_var_dict: - for locale_to_set in self.available_targets: - with self.subTest(env_var=env_var, - configured_locale=locale_to_set): - var_dict = base_var_dict.copy() - var_dict[env_var] = locale_to_set - self._check_child_encoding_details(var_dict, - expected_fsencoding, - expected_warning) - - -# Details of the shared library warning emitted at runtime -LIBRARY_C_LOCALE_WARNING = ( - "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII " - "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, " - "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible " - "locales is recommended.\n" -) - -@unittest.skipUnless(sysconfig.get_config_var("PY_WARN_ON_C_LOCALE"), - "C locale runtime warning disabled at build time") -class EmbeddingTests(unittest.TestCase): - def setUp(self): - here = os.path.abspath(__file__) - basepath = os.path.dirname(os.path.dirname(os.path.dirname(here))) - self.test_exe = exe = os.path.join(basepath, "Programs", "_testembed") - if not os.path.exists(exe): - self.skipTest("%r doesn't exist" % exe) - # This is needed otherwise we get a fatal error: - # "Py_Initialize: Unable to get the locale encoding - # LookupError: no codec search functions registered: can't find encoding" - self.addCleanup(os.chdir, os.getcwd()) - os.chdir(basepath) - - def run_embedded_interpreter(self, *args): - """Runs a test in the embedded interpreter""" - cmd = [self.test_exe] - cmd.extend(args) - p = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - universal_newlines=True) - (out, err) = p.communicate() - self.assertEqual(p.returncode, 0, - "bad returncode %d, stderr is %r" % - (p.returncode, err)) - return out, err - - def test_library_c_locale_warning(self): - # Checks forced configuration of embedded interpreter IO streams - out, err = self.run_embedded_interpreter("c_locale_warning") - if 
test.support.verbose > 1: - print() - print(out) - print(err) - self.assertEqual(out, "") - self.assertEqual(err, LIBRARY_C_LOCALE_WARNING) def test_main(): - test.support.run_unittest(LocaleOverrideTests, EmbeddingTests) + test.support.run_unittest( + LocaleConfigurationTests, + LocaleCoercionTests, + LocaleWarningTests + ) test.support.reap_children() if __name__ == "__main__": diff --git a/Programs/_testembed.c b/Programs/_testembed.c index e28de1c7fb1c4a..280bf501e34699 100644 --- a/Programs/_testembed.c +++ b/Programs/_testembed.c @@ -127,20 +127,6 @@ static int test_forced_io_encoding(void) return 0; } -static int test_c_locale_warning(void) -{ -#ifdef PY_WARN_ON_C_LOCALE - /* Force use of the C locale */ - setenv("LC_ALL", "C", 1); - - _testembed_Py_Initialize(); - Py_Finalize(); -#else - printf("C locale compatibility warning disabled at compile time\n"); -#endif - return 0; -} - /* ********************************************************* * List of test cases and the function that implements it. 
* @@ -162,7 +148,6 @@ struct TestCase static struct TestCase TestCases[] = { { "forced_io_encoding", test_forced_io_encoding }, { "repeated_init_and_subinterpreters", test_repeated_init_and_subinterpreters }, - { "c_locale_warning", test_c_locale_warning }, { NULL, NULL } }; diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index ee11b5fdc96ed3..278a5afba3c586 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -331,14 +331,14 @@ _Py_LegacyLocaleDetected(void) } typedef struct _CandidateLocale { - const char *locale_name; - int category; + const char *locale_name; /* The locale to try as a coercion target */ + int set_LANG; /* Whether to set LANG in addition to LC_CTYPE */ } _LocaleCoercionTarget; static _LocaleCoercionTarget _TARGET_LOCALES[] = { - { "C.UTF-8", LC_ALL }, - { "C.utf8", LC_ALL }, - { "UTF-8", LC_CTYPE }, + { "C.UTF-8", 1 }, + { "C.utf8", 1}, + { "UTF-8", 0 }, { NULL, 0 } }; @@ -373,37 +373,27 @@ static const char *_C_LOCALE_COERCION_WARNING = static void _coerce_default_locale_settings(const _LocaleCoercionTarget *target) { + const char *env_vars_updated = "LC_CTYPE"; const char *newloc = target->locale_name; - int category = target->category; /* Reset locale back to currently configured defaults */ setlocale(LC_ALL, ""); /* Set the relevant locale environment variables */ - if (category == LC_ALL) { - const char *env_vars_updated = "LC_ALL & LANG"; - if (setenv("LC_ALL", newloc, 1)) { - fprintf(stderr, - "Error setting LC_ALL, skipping C locale coercion\n"); - return; - } - if (setenv("LANG", newloc, 1)) { + if (setenv("LC_CTYPE", newloc, 1)) { + fprintf(stderr, + "Error setting LC_CTYPE, skipping C locale coercion\n"); + return; + } + if (target->set_LANG) { + if (setenv("LANG", newloc, 1) == 0) { + env_vars_updated = "LC_CTYPE & LANG"; + } else { fprintf(stderr, "Error setting LANG during C locale coercion\n"); - env_vars_updated = "LC_ALL"; - } - fprintf(stderr, _C_LOCALE_COERCION_WARNING, env_vars_updated, newloc); - } else 
if (category == LC_CTYPE) { - if (setenv("LC_CTYPE", newloc, 1)) { - fprintf(stderr, - "Error setting LC_CTYPE, skipping C locale coercion\n"); - return; } - fprintf(stderr, _C_LOCALE_COERCION_WARNING, "LC_CTYPE", newloc); - } else { - fprintf(stderr, "Locale coercion must target LC_ALL or LC_CTYPE\n"); - return; } + fprintf(stderr, _C_LOCALE_COERCION_WARNING, env_vars_updated, newloc); /* Reconfigure with the overridden environment variables */ setlocale(LC_ALL, ""); @@ -436,14 +426,18 @@ _Py_CoerceLegacyLocale(void) */ if (c_locale_coercion_is_expected()) { /* PYTHONCOERCECLOCALE is not set, or is not set to exactly "0" */ - const _LocaleCoercionTarget *target = NULL; - for (target = _TARGET_LOCALES; target->locale_name; target++) { - const char *reconfigured_locale = setlocale(target->category, - target->locale_name); - if (reconfigured_locale != NULL) { - /* Successfully configured locale, so make it the default */ - _coerce_default_locale_settings(target); - return; + const char *locale_override = getenv("LC_ALL"); + if (locale_override == NULL || *locale_override == '\0') { + /* LC_ALL is also not set (or is set to an empty string) */ + const _LocaleCoercionTarget *target = NULL; + for (target = _TARGET_LOCALES; target->locale_name; target++) { + const char *new_locale = setlocale(LC_CTYPE, + target->locale_name); + if (new_locale != NULL) { + /* Successfully configured locale, so make it the default */ + _coerce_default_locale_settings(target); + return; + } } } } From 939ba0a77d4b52a04315c129f9db89b90c0532cd Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 27 May 2017 16:37:52 +1000 Subject: [PATCH 22/36] Don't set LANG during locale coercion It turns out setting only LC_CTYPE is sufficient, which improves cross-platform consistency and reduces the chance of unintended side effects.
--- Lib/test/test_c_locale_coercion.py | 19 ++++++------------- Python/pylifecycle.c | 24 +++++++----------------- 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/Lib/test/test_c_locale_coercion.py b/Lib/test/test_c_locale_coercion.py index ad6ecac422e8dd..c14d820a2d7931 100644 --- a/Lib/test/test_c_locale_coercion.py +++ b/Lib/test/test_c_locale_coercion.py @@ -16,12 +16,7 @@ # In order to get the warning messages to match up as expected, the candidate # order here must much the target locale order in Python/pylifecycle.c -_C_UTF8_LOCALES = ( - # Entries: (Target locale, expected env var updates) - ("C.UTF-8", "LC_CTYPE & LANG"), - ("C.utf8", "LC_CTYPE & LANG"), - ("UTF-8", "LC_CTYPE"), -) +_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8") # There's no reliable cross-platform way of checking locale alias # lists, so the only way of knowing which of these locales will work @@ -144,7 +139,7 @@ def test_library_c_locale_warning(self): # Details of the CLI locale coercion warning emitted at runtime CLI_COERCION_WARNING_FMT = ( - "Python detected LC_CTYPE=C: {} coerced to {} (set another locale " + "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale " "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)." 
) @@ -156,21 +151,19 @@ class _LocaleCoercionTargetsTestCase(_ChildProcessEncodingTestCase): @classmethod def setUpClass(cls): - first_target_locale = first_env_updates = None + first_target_locale = None available_targets = cls.available_targets # Find the target locales available in the current system - for target_locale, env_updates in _C_UTF8_LOCALES: + for target_locale in _C_UTF8_LOCALES: if _set_locale_in_subprocess(target_locale): available_targets.append(target_locale) if first_target_locale is None: first_target_locale = target_locale - first_env_updates = env_updates if cls.targets_required and not available_targets: raise unittest.SkipTest("No C-with-UTF-8 locale available") # Expect coercion to use the first available locale - cls.EXPECTED_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format( - first_env_updates, first_target_locale - ) + warning_msg = CLI_COERCION_WARNING_FMT.format(first_target_locale) + cls.EXPECTED_COERCION_WARNING = warning_msg class LocaleConfigurationTests(_LocaleCoercionTargetsTestCase): diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index ea076a9967227d..e377254ef5dd54 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -357,14 +357,13 @@ _Py_LegacyLocaleDetected(void) typedef struct _CandidateLocale { const char *locale_name; /* The locale to try as a coercion target */ - int set_LANG; /* Whether to set LANG in addition to LC_CTYPE */ } _LocaleCoercionTarget; static _LocaleCoercionTarget _TARGET_LOCALES[] = { - { "C.UTF-8", 1 }, - { "C.utf8", 1}, - { "UTF-8", 0 }, - { NULL, 0 } + {"C.UTF-8"}, + {"C.utf8"}, + {"UTF-8"}, + {NULL} }; static char * @@ -392,33 +391,24 @@ get_default_standard_stream_error_handler(void) #ifdef PY_COERCE_C_LOCALE static const char *_C_LOCALE_COERCION_WARNING = - "Python detected LC_CTYPE=C: %.20s coerced to %.20s (set another locale " + "Python detected LC_CTYPE=C: LC_CTYPE coerced to %.20s (set another locale " "or PYTHONCOERCECLOCALE=0 to disable this locale coercion 
behavior).\n"; static void _coerce_default_locale_settings(const _LocaleCoercionTarget *target) { - const char *env_vars_updated = "LC_CTYPE"; const char *newloc = target->locale_name; /* Reset locale back to currently configured defaults */ setlocale(LC_ALL, ""); - /* Set the relevant locale environment variables */ + /* Set the relevant locale environment variable */ if (setenv("LC_CTYPE", newloc, 1)) { fprintf(stderr, "Error setting LC_CTYPE, skipping C locale coercion\n"); return; } - if (target->set_LANG) { - if (setenv("LANG", newloc, 1) == 0) { - env_vars_updated = "LC_CTYPE & LANG"; - } else { - fprintf(stderr, - "Error setting LANG during C locale coercion\n"); - } - } - fprintf(stderr, _C_LOCALE_COERCION_WARNING, env_vars_updated, newloc); + fprintf(stderr, _C_LOCALE_COERCION_WARNING, newloc); /* Reconfigure with the overridden environment variables */ setlocale(LC_ALL, ""); From 6d564c9e9627a82ca5df5edc49630fd3f4eee55c Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 18:35:43 +1000 Subject: [PATCH 23/36] Update docs to match current behaviour --- Doc/using/cmdline.rst | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 272c124d571671..838e8c546fc40f 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -724,18 +724,20 @@ conflict. 
If this variable is *not* set, or is set to a value other than ``0``, and the current locale reported for the ``LC_CTYPE`` category is the default - ``C`` locale, then the Python CLI will attempt to configure one of the - following locales for the given locale categories before loading the + ``C`` locale, then the Python CLI will attempt to configure the following + locales for the ``LC_CTYPE`` category in the order listed before loading the interpreter runtime: - * ``C.UTF-8`` (``LC_ALL``) - * ``C.utf8`` (``LC_ALL``) - * ``UTF-8`` (``LC_CTYPE``) + * ``C.UTF-8`` + * ``C.utf8`` + * ``UTF-8`` - If setting one of these locale categories succeeds, then the matching - environment variables will be set (both ``LC_ALL`` and ``LANG`` for the - ``LC_ALL`` category, and ``LC_CTYPE`` for the ``LC_CTYPE`` category) in - the current process environment before the Python runtime is initialized. + If setting one of these locale categories succeeds, then the ``LC_CTYPE`` + environment variable will also be set accordingly in the current process + environment before the Python runtime is initialized. This ensures the + updated setting is seen in subprocesses, as well as in operations that + query the environment rather than the current C locale (such as Python's + own :ref:`locale.getdefaultlocale`). 
Configuring one of these locales (either explicitly or via the above implicit locale coercion) will automatically set the error handler for From 53bd6da20d55126548e96a6320aff845874fd7ff Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 18:52:37 +1000 Subject: [PATCH 24/36] Address CI failure and review comments - avoid unintended side effects on Windows behaviour - remove a single-use function that made the code harder to follow - clarify the security considerations around ignoring -E and -I when checking PYTHONCOERCECLOCALE --- Python/pylifecycle.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index e377254ef5dd54..3dae29b0c0060e 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -351,8 +351,14 @@ initexternalimport(PyInterpreterState *interp) int _Py_LegacyLocaleDetected(void) { +#ifndef MS_WINDOWS + /* On non-Windows systems, the C locale is considered a legacy locale */ const char *ctype_loc = setlocale(LC_CTYPE, NULL); return ctype_loc != NULL && strcmp(ctype_loc, "C") == 0; +#else + /* Windows uses code pages instead of locales, so no locale is legacy */ + return 0; +#endif } typedef struct _CandidateLocale { @@ -369,6 +375,8 @@ static _LocaleCoercionTarget _TARGET_LOCALES[] = { static char * get_default_standard_stream_error_handler(void) { +#ifndef MS_WINDOWS + /* On non-Windows systems, the locale can affect the default error handler */ const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (ctype_loc != NULL) { /* "surrogateescape" is the default in the legacy C locale */ @@ -384,6 +392,7 @@ get_default_standard_stream_error_handler(void) } } } +#endif /* Otherwise return NULL to request the typical default error handler */ return NULL; @@ -413,19 +422,6 @@ _coerce_default_locale_settings(const _LocaleCoercionTarget *target) /* Reconfigure with the overridden environment variables */ setlocale(LC_ALL, ""); } - -static int 
-c_locale_coercion_is_expected(void) -{ - /* This may be called prior to Py_Initialize, so we don't call any other - * Python APIs, and we ignore the -E and -I flags - */ - const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); - if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { - return 1; - } - return 0; -} #endif void @@ -438,9 +434,15 @@ _Py_CoerceLegacyLocale(void) * to give end users a way to force even scripts that are otherwise * isolated from their environment to use the legacy ASCII-centric C * locale. - */ - if (c_locale_coercion_is_expected()) { - /* PYTHONCOERCECLOCALE is not set, or is not set to exactly "0" */ + * + * Ignoring -E and -I is safe from a security perspective, as we only use + * the setting to turn *off* the implicit locale coercion, and anyone with + * access to the process environment already has the ability to set + * `LC_ALL=C` to override the C level locale settings anyway. + */ + const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); + if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { + /* PYTHONCOERCECLOCALE is not set, or is set to something other than "0" */ const char *locale_override = getenv("LC_ALL"); if (locale_override == NULL || *locale_override == '\0') { /* LC_ALL is also not set (or is set to an empty string) */ From cad06695615e1b0b29914bc9d29f98aa79497bb1 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 18:57:39 +1000 Subject: [PATCH 25/36] OK, two-use function :) The inline check for "Is this env var exactly zero?" is still more self-explanatory than factoring out the helper function. 
--- Python/pylifecycle.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 3dae29b0c0060e..eb42bff29f282b 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -473,7 +473,8 @@ static const char *_C_LOCALE_WARNING = static void _emit_stderr_warning_for_c_locale(void) { - if (c_locale_coercion_is_expected()) { + const char *coerce_c_locale = getenv("PYTHONCOERCECLOCALE"); + if (coerce_c_locale == NULL || strncmp(coerce_c_locale, "0", 2) != 0) { if (_Py_LegacyLocaleDetected()) { fprintf(stderr, "%s", _C_LOCALE_WARNING); } From 421516f2b278c3ef8b5d3f11cd05d1c4b31ac46e Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 20:10:31 +1000 Subject: [PATCH 26/36] Still check for the C locale in Windows --- Python/pylifecycle.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index eb42bff29f282b..94f1dcf510102b 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -375,8 +375,6 @@ static _LocaleCoercionTarget _TARGET_LOCALES[] = { static char * get_default_standard_stream_error_handler(void) { -#ifndef MS_WINDOWS - /* On non-Windows systems, the locale can affect the default error handler */ const char *ctype_loc = setlocale(LC_CTYPE, NULL); if (ctype_loc != NULL) { /* "surrogateescape" is the default in the legacy C locale */ @@ -384,6 +382,7 @@ get_default_standard_stream_error_handler(void) return "surrogateescape"; } +#ifdef PY_COERCE_C_LOCALE /* "surrogateescape" is the default in locale coercion target locales */ const _LocaleCoercionTarget *target = NULL; for (target = _TARGET_LOCALES; target->locale_name; target++) { @@ -391,8 +390,8 @@ get_default_standard_stream_error_handler(void) return "surrogateescape"; } } - } #endif + } /* Otherwise return NULL to request the typical default error handler */ return NULL; From e48a3787812a89797376988230599df44d92814c Mon Sep 17 00:00:00 2001 From: Nick Coghlan 
Date: Sat, 3 Jun 2017 20:59:38 +1000 Subject: [PATCH 27/36] Check actual control flow on Appveyor --- Python/pylifecycle.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 94f1dcf510102b..9fd4bfa7d91901 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1617,6 +1617,9 @@ initstdio(void) } if (!errors && !(pythonioencoding && *pythonioencoding)) { /* Choose the default error handler based on the current locale */ +#ifdef MS_WINDOWS + printf("Is this code even being reached under Windows?"); +#endif errors = get_default_standard_stream_error_handler(); } } From d181b92d6a6ae61dd7b3fe5d53db24c1ce1591fa Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 21:25:16 +1000 Subject: [PATCH 28/36] Use correct reference type in docs --- Doc/using/cmdline.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 838e8c546fc40f..920d5c01e4bef4 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -737,7 +737,7 @@ conflict. environment before the Python runtime is initialized. This ensures the updated setting is seen in subprocesses, as well as in operations that query the environment rather than the current C locale (such as Python's - own :ref:`locale.getdefaultlocale`). + own :func:`locale.getdefaultlocale`). 
Configuring one of these locales (either explicitly or via the above implicit locale coercion) will automatically set the error handler for From 8cf0590dd4e4ce733994ee503f5c3a9c1758bc70 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 21:25:43 +1000 Subject: [PATCH 29/36] More Appveyor debugging --- Python/pylifecycle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 9fd4bfa7d91901..f43215e5d34036 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1617,10 +1617,10 @@ initstdio(void) } if (!errors && !(pythonioencoding && *pythonioencoding)) { /* Choose the default error handler based on the current locale */ + errors = get_default_standard_stream_error_handler(); #ifdef MS_WINDOWS - printf("Is this code even being reached under Windows?"); + printf("Stream error handler: %s\n", errors); #endif - errors = get_default_standard_stream_error_handler(); } } From cea79702b62becd2b60a164af8107896e3a14030 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 21:36:53 +1000 Subject: [PATCH 30/36] New theory regarding the Windows problem A HAVE_SETLOCALE guard was removed when adding a check for __ANDROID__, and that may be affecting the default locale reported on Windows. --- Python/pylifecycle.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index f43215e5d34036..ee92009b6527f6 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -553,6 +553,7 @@ void _Py_InitializeCore(const _PyCoreConfig *config) */ setlocale(LC_CTYPE, "C.UTF-8"); #else +#ifndef MS_WINDOWS /* Set up the LC_CTYPE locale, so we can obtain the locale's charset without having to switch locales. 
*/ @@ -560,6 +561,7 @@ void _Py_InitializeCore(const _PyCoreConfig *config) #ifdef PY_WARN_ON_C_LOCALE _emit_stderr_warning_for_c_locale(); #endif +#endif #endif if ((p = Py_GETENV("PYTHONDEBUG")) && *p != '\0') @@ -1618,9 +1620,6 @@ initstdio(void) if (!errors && !(pythonioencoding && *pythonioencoding)) { /* Choose the default error handler based on the current locale */ errors = get_default_standard_stream_error_handler(); -#ifdef MS_WINDOWS - printf("Stream error handler: %s\n", errors); -#endif } } From c63d5fae545e95f3fe9638167acfa2b5d2777be2 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 21:41:13 +1000 Subject: [PATCH 31/36] Locale coercion may inject LC_CTYPE into environment --- Lib/test/test_subprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/test/test_subprocess.py b/Lib/test/test_subprocess.py index 7fabe6ad765332..52b05c12b641da 100644 --- a/Lib/test/test_subprocess.py +++ b/Lib/test/test_subprocess.py @@ -642,7 +642,8 @@ def is_env_var_to_ignore(n): # on adding even when the environment in exec is empty. # Gentoo sandboxes also force LD_PRELOAD and SANDBOX_* to exist. 
return ('VERSIONER' in n or '__CF' in n or # MacOS - n == 'LD_PRELOAD' or n.startswith('SANDBOX')) # Gentoo + n == 'LD_PRELOAD' or n.startswith('SANDBOX') or # Gentoo + n == 'LC_CTYPE') # Locale coercion triggered with subprocess.Popen([sys.executable, "-c", 'import os; print(list(os.environ.keys()))'], From 8e0e1ca3bdb4836abc1e31e32b8a861d541d9326 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 22:35:11 +1000 Subject: [PATCH 32/36] Ensure SYSTEMROOT is set in Windows embedding tests --- Lib/test/test_capi.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py index aa84b929123af5..67e430d5577f78 100644 --- a/Lib/test/test_capi.py +++ b/Lib/test/test_capi.py @@ -375,6 +375,12 @@ def run_embedded_interpreter(self, *args, env=None): """Runs a test in the embedded interpreter""" cmd = [self.test_exe] cmd.extend(args) + if env is not None and sys.platform == 'win32': + # Windows requires at least the SYSTEMROOT environment variable to + # start Python. 
+ env = env.copy() + env['SYSTEMROOT'] = os.environ['SYSTEMROOT'] + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, From 737939831bb576f22a8e53d559e1733e07e4a0bb Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 3 Jun 2017 22:55:30 +1000 Subject: [PATCH 33/36] Don't use the default pipe encoding in test_capi --- Lib/test/test_capi.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py index 67e430d5577f78..e25590c70e45a2 100644 --- a/Lib/test/test_capi.py +++ b/Lib/test/test_capi.py @@ -478,16 +478,6 @@ def test_subinterps_distinct_state(self): self.assertNotEqual(sub.tstate, main.tstate) self.assertNotEqual(sub.modules, main.modules) - @staticmethod - def _get_default_pipe_encoding(): - rp, wp = os.pipe() - try: - with os.fdopen(wp, 'w') as w: - default_pipe_encoding = w.encoding - finally: - os.close(rp) - return default_pipe_encoding - def test_forced_io_encoding(self): # Checks forced configuration of embedded interpreter IO streams env = {"PYTHONIOENCODING": "UTF-8:surrogateescape"} @@ -496,9 +486,8 @@ def test_forced_io_encoding(self): print() print(out) print(err) + expected_stream_encoding = "UTF-8" expected_errors = "surrogateescape" - expected_stdin_encoding = "UTF-8" - expected_pipe_encoding = self._get_default_pipe_encoding() expected_output = '\n'.join([ "--- Use defaults ---", "Expected encoding: default", @@ -525,8 +514,8 @@ def test_forced_io_encoding(self): "stdout: latin-1:replace", "stderr: latin-1:backslashreplace"]) expected_output = expected_output.format( - in_encoding=expected_stdin_encoding, - out_encoding=expected_pipe_encoding, + in_encoding=expected_stream_encoding, + out_encoding=expected_stream_encoding, errors=expected_errors) # This is useful if we ever trip over odd platform behaviour self.maxDiff = None From 89759b59bfe037c5e0186b3d1d65c273d59a1905 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sun, 4 Jun 2017 00:23:33 
+1000 Subject: [PATCH 34/36] stdin encoding ends up normalised on Windows --- Lib/test/test_capi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_capi.py b/Lib/test/test_capi.py index e25590c70e45a2..c4a97664290949 100644 --- a/Lib/test/test_capi.py +++ b/Lib/test/test_capi.py @@ -480,13 +480,13 @@ def test_subinterps_distinct_state(self): def test_forced_io_encoding(self): # Checks forced configuration of embedded interpreter IO streams - env = {"PYTHONIOENCODING": "UTF-8:surrogateescape"} + env = {"PYTHONIOENCODING": "utf-8:surrogateescape"} out, err = self.run_embedded_interpreter("forced_io_encoding", env=env) if support.verbose > 1: print() print(out) print(err) - expected_stream_encoding = "UTF-8" + expected_stream_encoding = "utf-8" expected_errors = "surrogateescape" expected_output = '\n'.join([ "--- Use defaults ---", From 5a56a3f3ac5f5583cf2083f54b99146cbb418a5f Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sun, 4 Jun 2017 20:13:59 +1000 Subject: [PATCH 35/36] PEP 538: Add What's New entry --- Doc/whatsnew/3.7.rst | 45 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 761c85fd22084b..f019f944974dd3 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -70,6 +70,51 @@ Summary -- Release highlights New Features ============ +.. _whatsnew37-pep538: + +PEP 538: Legacy C Locale Coercion +--------------------------------- + +An ongoing challenge within the Python 3 series has been determining a sensible +default strategy for handling the "7-bit ASCII" text encoding assumption +currently implied by the use of the default C locale on non-Windows platforms. + +:pep:`538` updates the default interpreter command line interface to +automatically coerce that locale to an available UTF-8 based locale as +described in the documentation of the new :envvar:`PYTHONCOERCECLOCALE` +environment variable. 
Automatically setting ``LC_CTYPE`` this way means that +both the core interpreter and locale-aware C extensions (such as +:mod:`readline`) will assume the use of UTF-8 as the default text encoding, +rather than ASCII. + +The platform support definition in :pep:`11` has also been updated to limit +full text handling support to suitably configured non-ASCII based locales. + +As part of this change, the default error handler for ``stdin`` and ``stdout`` +is now ``surrogateescape`` (rather than ``strict``) when using any of the +defined coercion target locales (currently ``C.UTF-8``, ``C.utf8``, and +``UTF-8``). The default error handler for ``stderr`` continues to be +``backslashreplace``, regardless of locale. + +.. note:: + + In the current implementation, a warning message is printed directly to + ``stderr`` even for successful implicit locale coercion. This gives + redistributors and system integrators the opportunity to determine if they + should be making an environmental change to avoid the need for implicit + coercion at the Python interpreter level. + + However, it's not clear that this is going to be the best approach for + the final 3.7.0 release, and we may end up deciding to disable the warning + by default and provide some way of opting into it at runtime or build time. + + Concrete examples of use cases where it would be preferable to disable the + warning by default can be noted on :issue:`30565`. + +.. seealso:: + + :pep:`538` -- Coercing the legacy C locale to a UTF-8 based locale + PEP written and implemented by Nick Coghlan. Other Language Changes From 5288662d9e82b4b1fb0db89184d7e4bf8a947d7b Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sun, 11 Jun 2017 12:25:25 +1000 Subject: [PATCH 36/36] Add NEWS entry --- Misc/NEWS | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Misc/NEWS b/Misc/NEWS index e58de824566254..8cbd4632889f7f 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,11 @@ What's New in Python 3.7.0 alpha 1?
Core and Builtins ----------------- +- bpo-28180: Implement PEP 538 (legacy C locale coercion). This means that when + a suitable coercion target locale is available, both the core interpreter and + locale-aware C extensions will assume the use of UTF-8 as the default text + encoding, rather than ASCII. + - bpo-30486: Allows setting cell values for __closure__. Patch by Lisa Roach. - bpo-30537: itertools.islice now accepts integer-like objects (having