Skip to content

Commit c5989cd

Browse files
authored
bpo-34523: Py_DecodeLocale() use UTF-8 on Windows (pythonGH-8998)
Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding on Windows if Py_LegacyWindowsFSEncodingFlag is zero. pymain_read_conf() now sets Py_LegacyWindowsFSEncodingFlag in its loop, but restores its value at exit.
1 parent 70fead2 commit c5989cd

File tree

5 files changed

+55
-30
lines changed

5 files changed

+55
-30
lines changed

Doc/c-api/sys.rst

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ Operating System Utilities
109109
Encoding, highest priority to lowest priority:
110110
111111
* ``UTF-8`` on macOS and Android;
112+
* ``UTF-8`` on Windows if :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero;
112113
* ``UTF-8`` if the Python UTF-8 mode is enabled;
113114
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
114115
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
@@ -140,6 +141,10 @@ Operating System Utilities
140141
.. versionchanged:: 3.7
141142
The function now uses the UTF-8 encoding in the UTF-8 mode.
142143
144+
.. versionchanged:: 3.8
145+
The function now uses the UTF-8 encoding on Windows if
146+
:c:data:`Py_LegacyWindowsFSEncodingFlag` is zero.
147+
143148
144149
.. c:function:: char* Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
145150
@@ -150,6 +155,7 @@ Operating System Utilities
150155
Encoding, highest priority to lowest priority:
151156
152157
* ``UTF-8`` on macOS and Android;
158+
* ``UTF-8`` on Windows if :c:data:`Py_LegacyWindowsFSEncodingFlag` is zero;
153159
* ``UTF-8`` if the Python UTF-8 mode is enabled;
154160
* ``ASCII`` if the ``LC_CTYPE`` locale is ``"C"``,
155161
``nl_langinfo(CODESET)`` returns the ``ASCII`` encoding (or an alias),
@@ -169,9 +175,6 @@ Operating System Utilities
169175
Use the :c:func:`Py_DecodeLocale` function to decode the bytes string back
170176
to a wide character string.
171177
172-
.. versionchanged:: 3.7
173-
The function now uses the UTF-8 encoding in the UTF-8 mode.
174-
175178
.. seealso::
176179
177180
The :c:func:`PyUnicode_EncodeFSDefault` and
@@ -180,7 +183,11 @@ Operating System Utilities
180183
.. versionadded:: 3.5
181184
182185
.. versionchanged:: 3.7
183-
The function now supports the UTF-8 mode.
186+
The function now uses the UTF-8 encoding in the UTF-8 mode.
187+
188+
.. versionchanged:: 3.8
189+
The function now uses the UTF-8 encoding on Windows if
190+
:c:data:`Py_LegacyWindowsFSEncodingFlag` is zero.
184191
185192
186193
.. _systemfunctions:

Lib/test/test_embed.py

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -268,10 +268,10 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
268268
'dump_refs': 0,
269269
'malloc_stats': 0,
270270

271-
# None means that the default encoding is read at runtime:
272-
# see get_locale_encoding().
271+
# None means that the value is obtained by get_filesystem_encoding()
273272
'filesystem_encoding': None,
274-
'filesystem_errors': sys.getfilesystemencodeerrors(),
273+
'filesystem_errors': None,
274+
275275
'utf8_mode': 0,
276276
'coerce_c_locale': 0,
277277
'coerce_c_locale_warn': 0,
@@ -294,7 +294,8 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
294294
'quiet': 0,
295295
'user_site_directory': 1,
296296
'buffered_stdio': 1,
297-
# None means that check_config() gets the expected encoding at runtime
297+
298+
# None means that the value is obtained by get_stdio_encoding()
298299
'stdio_encoding': None,
299300
'stdio_errors': None,
300301

@@ -303,7 +304,6 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase):
303304
'_frozen': 0,
304305
}
305306

306-
307307
def get_stdio_encoding(self, env):
308308
code = 'import sys; print(sys.stdout.encoding, sys.stdout.errors)'
309309
args = (sys.executable, '-c', code)
@@ -315,18 +315,12 @@ def get_stdio_encoding(self, env):
315315
out = proc.stdout.rstrip()
316316
return out.split()
317317

318-
def get_locale_encoding(self, isolated):
319-
if sys.platform in ('win32', 'darwin') or support.is_android:
320-
# Windows, macOS and Android use UTF-8
321-
return "utf-8"
322-
323-
code = ('import codecs, locale, sys',
324-
'locale.setlocale(locale.LC_CTYPE, "")',
325-
'enc = locale.nl_langinfo(locale.CODESET)',
326-
'enc = codecs.lookup(enc).name',
327-
'print(enc)')
328-
args = (sys.executable, '-c', '; '.join(code))
329-
env = dict(os.environ)
318+
def get_filesystem_encoding(self, isolated, env):
319+
code = ('import codecs, locale, sys; '
320+
'print(sys.getfilesystemencoding(), '
321+
'sys.getfilesystemencodeerrors())')
322+
args = (sys.executable, '-c', code)
323+
env = dict(env)
330324
if not isolated:
331325
env['PYTHONCOERCECLOCALE'] = '0'
332326
env['PYTHONUTF8'] = '0'
@@ -336,7 +330,8 @@ def get_locale_encoding(self, isolated):
336330
if proc.returncode:
337331
raise Exception(f"failed to get the locale encoding: "
338332
f"stdout={proc.stdout!r} stderr={proc.stderr!r}")
339-
return proc.stdout.rstrip()
333+
out = proc.stdout.rstrip()
334+
return out.split()
340335

341336
def check_config(self, testname, expected):
342337
expected = dict(self.DEFAULT_CONFIG, **expected)
@@ -356,8 +351,12 @@ def check_config(self, testname, expected):
356351
expected['stdio_encoding'] = res[0]
357352
if expected['stdio_errors'] is None:
358353
expected['stdio_errors'] = res[1]
359-
if expected['filesystem_encoding'] is None:
360-
expected['filesystem_encoding'] = self.get_locale_encoding(expected['isolated'])
354+
if expected['filesystem_encoding'] is None or expected['filesystem_errors'] is None:
355+
res = self.get_filesystem_encoding(expected['isolated'], env)
356+
if expected['filesystem_encoding'] is None:
357+
expected['filesystem_encoding'] = res[0]
358+
if expected['filesystem_errors'] is None:
359+
expected['filesystem_errors'] = res[1]
361360
for key, value in expected.items():
362361
expected[key] = str(value)
363362

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Py_DecodeLocale() and Py_EncodeLocale() now use the UTF-8 encoding on
2+
Windows if Py_LegacyWindowsFSEncodingFlag is zero.

Modules/main.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1287,6 +1287,9 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config,
12871287
_PyCmdline *cmdline)
12881288
{
12891289
int init_utf8_mode = Py_UTF8Mode;
1290+
#ifdef MS_WINDOWS
1291+
int init_legacy_encoding = Py_LegacyWindowsFSEncodingFlag;
1292+
#endif
12901293
_PyCoreConfig save_config = _PyCoreConfig_INIT;
12911294
int res = -1;
12921295

@@ -1313,9 +1316,12 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config,
13131316
goto done;
13141317
}
13151318

1316-
/* bpo-34207: Py_DecodeLocale(), Py_EncodeLocale() and similar
1317-
functions depend on Py_UTF8Mode. */
1319+
/* bpo-34207: Py_DecodeLocale() and Py_EncodeLocale() depend
1320+
on Py_UTF8Mode and Py_LegacyWindowsFSEncodingFlag. */
13181321
Py_UTF8Mode = config->utf8_mode;
1322+
#ifdef MS_WINDOWS
1323+
Py_LegacyWindowsFSEncodingFlag = config->legacy_windows_fs_encoding;
1324+
#endif
13191325

13201326
if (pymain_init_cmdline_argv(pymain, config, cmdline) < 0) {
13211327
goto done;
@@ -1380,6 +1386,9 @@ pymain_read_conf(_PyMain *pymain, _PyCoreConfig *config,
13801386
done:
13811387
_PyCoreConfig_Clear(&save_config);
13821388
Py_UTF8Mode = init_utf8_mode ;
1389+
#ifdef MS_WINDOWS
1390+
Py_LegacyWindowsFSEncodingFlag = init_legacy_encoding;
1391+
#endif
13831392
return res;
13841393
}
13851394

Python/fileutils.c

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -499,9 +499,13 @@ _Py_DecodeLocaleEx(const char* arg, wchar_t **wstr, size_t *wlen,
499499
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
500500
surrogateescape);
501501
#else
502-
if (Py_UTF8Mode == 1) {
503-
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen, reason,
504-
surrogateescape);
502+
int use_utf8 = (Py_UTF8Mode == 1);
503+
#ifdef MS_WINDOWS
504+
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
505+
#endif
506+
if (use_utf8) {
507+
return _Py_DecodeUTF8Ex(arg, strlen(arg), wstr, wlen,
508+
reason, surrogateescape);
505509
}
506510

507511
#ifdef USE_FORCE_ASCII
@@ -661,7 +665,11 @@ encode_locale_ex(const wchar_t *text, char **str, size_t *error_pos,
661665
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
662666
raw_malloc, surrogateescape);
663667
#else /* __APPLE__ */
664-
if (Py_UTF8Mode == 1) {
668+
int use_utf8 = (Py_UTF8Mode == 1);
669+
#ifdef MS_WINDOWS
670+
use_utf8 |= !Py_LegacyWindowsFSEncodingFlag;
671+
#endif
672+
if (use_utf8) {
665673
return _Py_EncodeUTF8Ex(text, str, error_pos, reason,
666674
raw_malloc, surrogateescape);
667675
}

0 commit comments

Comments
 (0)