Skip to content

gh-53203: Fix strptime() for %c, %x and %X formats on many locales #125406

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 43 additions & 25 deletions Lib/_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import locale
import calendar
from re import compile as re_compile
from re import sub as re_sub
from re import IGNORECASE
from re import escape as re_escape
from datetime import (date as datetime_date,
Expand Down Expand Up @@ -129,11 +130,23 @@ def __calc_date_time(self):
time_tuple = time.struct_time((1999,3,17,22,44,55,2,76,0))
time_tuple2 = time.struct_time((1999,1,3,1,1,1,6,3,0))
replacement_pairs = [
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
('44', '%M'), ('55', '%S'), ('76', '%j'),
('17', '%d'), ('03', '%m'), ('3', '%m'),
# '3' needed for when no leading zero.
('2', '%w'), ('10', '%I')]
('1999', '%Y'), ('99', '%y'), ('22', '%H'),
('44', '%M'), ('55', '%S'), ('76', '%j'),
('17', '%d'), ('03', '%m'), ('3', '%m'),
# '3' needed for when no leading zero.
('2', '%w'), ('10', '%I'),
# Non-ASCII digits
('\u0661\u0669\u0669\u0669', '%Y'),
('\u0669\u0669', '%Oy'),
('\u0662\u0662', '%OH'),
('\u0664\u0664', '%OM'),
('\u0665\u0665', '%OS'),
('\u0661\u0667', '%Od'),
('\u0660\u0663', '%Om'),
('\u0663', '%Om'),
('\u0662', '%Ow'),
('\u0661\u0660', '%OI'),
]
date_time = []
for directive in ('%c', '%x', '%X'):
current_format = time.strftime(directive, time_tuple).lower()
Expand All @@ -158,6 +171,10 @@ def __calc_date_time(self):
for tz in tz_values:
if tz:
current_format = current_format.replace(tz, "%Z")
# Transform all non-ASCII digits to digits in range U+0660 to U+0669.
current_format = re_sub(r'\d(?<![0-9])',
lambda m: chr(0x0660 + int(m[0])),
current_format)
for old, new in replacement_pairs:
current_format = current_format.replace(old, new)
# If %W is used, then Sunday, 2005-01-03 will fall on week 0 since
Expand Down Expand Up @@ -267,7 +284,7 @@ def __init__(self, locale_time=None):
else:
self.locale_time = LocaleTime()
base = super()
base.__init__({
mapping = {
# The " [1-9]" part of the regex is to make %c from ANSI C work
'd': r"(?P<d>3[0-1]|[1-2]\d|0[1-9]|[1-9]| [1-9])",
'f': r"(?P<f>[0-9]{1,6})",
Expand Down Expand Up @@ -296,11 +313,15 @@ def __init__(self, locale_time=None):
'Z': self.__seqToRE((tz for tz_names in self.locale_time.timezone
for tz in tz_names),
'Z'),
'%': '%'})
base.__setitem__('W', base.__getitem__('U').replace('U', 'W'))
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
'%': '%'}
for d in 'dmyHIMS':
mapping['O' + d] = r'(?P<%s>\d\d|\d| \d)' % d
mapping['Ow'] = r'(?P<w>\d)'
mapping['W'] = mapping['U'].replace('U', 'W')
base.__init__(mapping)
base.__setitem__('X', self.pattern(self.locale_time.LC_time))
base.__setitem__('x', self.pattern(self.locale_time.LC_date))
base.__setitem__('c', self.pattern(self.locale_time.LC_date_time))

def __seqToRE(self, to_convert, directive):
"""Convert a list to a regex string for matching a directive.
Expand Down Expand Up @@ -328,28 +349,25 @@ def pattern(self, format):
regex syntax are escaped.

"""
processed_format = ''
# The sub() call escapes all characters that might be misconstrued
# as regex syntax. Cannot use re.escape since we have to deal with
# format directives (%m, etc.).
regex_chars = re_compile(r"([\\.^$*+?\(\){}\[\]|])")
format = regex_chars.sub(r"\\\1", format)
whitespace_replacement = re_compile(r'\s+')
format = whitespace_replacement.sub(r'\\s+', format)
format = re_sub(r"([\\.^$*+?\(\){}\[\]|])", r"\\\1", format)
format = re_sub(r'\s+', r'\\s+', format)
format = re_sub(r"'", "['\u02bc]", format) # needed for br_FR
year_in_format = False
day_of_month_in_format = False
while '%' in format:
directive_index = format.index('%')+1
format_char = format[directive_index]
processed_format = "%s%s%s" % (processed_format,
format[:directive_index-1],
self[format_char])
format = format[directive_index+1:]
def repl(m):
format_char = m[1]
match format_char:
case 'Y' | 'y' | 'G':
nonlocal year_in_format
year_in_format = True
case 'd':
nonlocal day_of_month_in_format
day_of_month_in_format = True
return self[format_char]
format = re_sub(r'%(O?.)', repl, format)
if day_of_month_in_format and not year_in_format:
import warnings
warnings.warn("""\
Expand All @@ -360,7 +378,7 @@ def pattern(self, format):
See https://github.com/python/cpython/issues/70647.""",
DeprecationWarning,
skip_file_prefixes=(os.path.dirname(__file__),))
return "%s%s" % (processed_format, format)
return format

def compile(self, format):
"""Return a compiled re object for the format string."""
Expand Down Expand Up @@ -434,8 +452,8 @@ def _strptime(data_string, format="%a %b %d %H:%M:%S %Y"):
_regex_cache[format] = format_regex
found = format_regex.match(data_string)
if not found:
raise ValueError("time data %r does not match format %r :: /%s/" %
(data_string, format, format_regex.pattern))
raise ValueError("time data %r does not match format %r" %
(data_string, format))
if len(data_string) != found.end():
raise ValueError("unconverted data remains: %s" %
data_string[found.end():])
Expand Down
34 changes: 20 additions & 14 deletions Lib/test/test_strptime.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ def test_strptime_exception_context(self):
# additional check for IndexError branch (issue #19545)
with self.assertRaises(ValueError) as e:
_strptime._strptime_time('19', '%Y %')
self.assertIs(e.exception.__suppress_context__, True)
self.assertIsNone(e.exception.__context__)

def test_unconverteddata(self):
# Check ValueError is raised when there is unconverted data
Expand Down Expand Up @@ -485,12 +485,14 @@ def test_bad_timezone(self):
# id_ID, ms_MY.
# * Year is not included: ha_NG.
# * Use non-Gregorian calendar: lo_LA, thai, th_TH.
# On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
#
# BUG: Generates regexp that does not match the current date and time
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
# for lzh_TW.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'eu_ES', 'ar_AE', 'mfe_MU', 'yo_NG',
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
'my_MM', 'or_IN', 'shn_MM', 'az_IR')
def test_date_time_locale(self):
# Test %c directive
loc = locale.getlocale(locale.LC_TIME)[0]
Expand All @@ -512,20 +514,23 @@ def test_date_time_locale(self):
self.roundtrip('%c', slice(0, 6), time.localtime(now - 366*24*3600))

# NB: Dates before 1969 do not roundtrip on some locales:
# bo_CN, bo_IN, dz_BT, eu_ES, eu_FR.
# az_IR, bo_CN, bo_IN, dz_BT, eu_ES, eu_FR, fa_IR, or_IN.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'ar_AE', 'mfe_MU', 'yo_NG',
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN')
'csb_PL', 'br_FR', 'gez_ET', 'brx_IN',
'my_MM', 'shn_MM')
def test_date_time_locale2(self):
# Test %c directive
self.roundtrip('%c', slice(0, 6), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
self.roundtrip('%c', slice(0, 6), (1800, 1, 1, 0, 0, 0, 0, 1, 0))

# NB: Does not roundtrip because use non-Gregorian calendar:
# lo_LA, thai, th_TH.
# lo_LA, thai, th_TH. On Windows: ar_IN, ar_SA, fa_IR, ps_AF.
# BUG: Generates regexp that does not match the current date
# for az_IR, fa_IR, lzh_TW, my_MM, or_IN, shn_MM.
# for lzh_TW.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'he_IL', 'eu_ES', 'ar_AE')
'he_IL', 'eu_ES', 'ar_AE',
'az_IR', 'my_MM', 'or_IN', 'shn_MM')
def test_date_locale(self):
# Test %x directive
now = time.time()
Expand All @@ -545,22 +550,23 @@ def test_date_locale(self):
"musl libc issue on Emscripten, bpo-46390"
)
@run_with_locales('LC_TIME', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'eu_ES', 'ar_AE')
'eu_ES', 'ar_AE', 'my_MM', 'shn_MM')
def test_date_locale2(self):
# Test %x directive
self.roundtrip('%x', slice(0, 3), (1900, 1, 1, 0, 0, 0, 0, 1, 0))
self.roundtrip('%x', slice(0, 3), (1800, 1, 1, 0, 0, 0, 0, 1, 0))

# NB: Does not roundtrip in some locales due to the ambiguity of
# the time representation (bugs in locales?):
# * Seconds are not included: bokmal, ff_SN, nb_NO, nn_NO, no_NO,
# norwegian, nynorsk.
# * Hours are in 12-hour notation without AM/PM indication: hy_AM,
# ms_MY, sm_WS.
# BUG: Generates regexp that does not match the current time for
# aa_DJ, aa_ER, aa_ET, am_ET, az_IR, byn_ER, fa_IR, gez_ER, gez_ET,
# lzh_TW, my_MM, om_ET, om_KE, or_IN, shn_MM, sid_ET, so_DJ, so_ET,
# so_SO, ti_ER, ti_ET, tig_ER, wal_ET.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP')
# BUG: Generates regexp that does not match the current time for lzh_TW.
@run_with_locales('LC_TIME', 'C', 'en_US', 'fr_FR', 'de_DE', 'ja_JP',
'aa_ET', 'am_ET', 'az_IR', 'byn_ER', 'fa_IR', 'gez_ET',
'my_MM', 'om_ET', 'or_IN', 'shn_MM', 'sid_ET', 'so_SO',
'ti_ET', 'tig_ER', 'wal_ET')
def test_time_locale(self):
# Test %X directive
now = time.time()
Expand Down
2 changes: 1 addition & 1 deletion Lib/test/test_time.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ def test_strptime_exception_context(self):
# additional check for IndexError branch (issue #19545)
with self.assertRaises(ValueError) as e:
time.strptime('19', '%Y %')
self.assertIs(e.exception.__suppress_context__, True)
self.assertIsNone(e.exception.__context__)

def test_strptime_leap_year(self):
# GH-70647: warns if parsing a format with a day and no year.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Fix :func:`time.strptime` for ``%c``, ``%x`` and ``%X`` formats in many
locales that use non-ASCII digits, like Persian, Burmese, Odia and Shan.
Loading