Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Lib/encodings/aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@
'ms932' : 'cp932',
'mskanji' : 'cp932',
'ms_kanji' : 'cp932',
'windows_31j' : 'cp932',

# cp949 codec
'949' : 'cp949',
Expand Down
166 changes: 118 additions & 48 deletions Lib/encodings/idna.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
sace_prefix = "xn--"

# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
def nameprep(label): # type: (str) -> str
# Map
newlabel = []
for c in label:
Expand All @@ -25,7 +25,7 @@ def nameprep(label):
label = unicodedata.normalize("NFKC", label)

# Prohibit
for c in label:
for i, c in enumerate(label):
if stringprep.in_table_c12(c) or \
stringprep.in_table_c22(c) or \
stringprep.in_table_c3(c) or \
Expand All @@ -35,7 +35,7 @@ def nameprep(label):
stringprep.in_table_c7(c) or \
stringprep.in_table_c8(c) or \
stringprep.in_table_c9(c):
raise UnicodeError("Invalid character %r" % c)
raise UnicodeEncodeError("idna", label, i, i+1, f"Invalid character {c!r}")

# Check bidi
RandAL = [stringprep.in_table_d1(x) for x in label]
Expand All @@ -46,59 +46,73 @@ def nameprep(label):
# This is table C.8, which was already checked
# 2) If a string contains any RandALCat character, the string
# MUST NOT contain any LCat character.
if any(stringprep.in_table_d2(x) for x in label):
raise UnicodeError("Violation of BIDI requirement 2")
for i, x in enumerate(label):
if stringprep.in_table_d2(x):
raise UnicodeEncodeError("idna", label, i, i+1,
"Violation of BIDI requirement 2")
# 3) If a string contains any RandALCat character, a
# RandALCat character MUST be the first character of the
# string, and a RandALCat character MUST be the last
# character of the string.
if not RandAL[0] or not RandAL[-1]:
raise UnicodeError("Violation of BIDI requirement 3")
if not RandAL[0]:
raise UnicodeEncodeError("idna", label, 0, 1,
"Violation of BIDI requirement 3")
if not RandAL[-1]:
raise UnicodeEncodeError("idna", label, len(label)-1, len(label),
"Violation of BIDI requirement 3")

return label

def ToASCII(label):  # type: (str) -> bytes
    """Convert one IDNA label to its ASCII-compatible form.

    The numbered "Step" comments refer to the ToASCII operation of the
    IDNA specification; UseSTD3ASCIIRules is treated as false.

    Returns the label as bytes: unchanged when it is already ASCII
    (before or after nameprep), otherwise ``ace_prefix`` followed by the
    Punycode encoding of the nameprep'd label.

    Raises UnicodeEncodeError with encoding "idna" when the label is
    empty, when the result would be 64 bytes or longer, or when a
    non-ASCII label already starts with the ACE prefix.  Whatever
    nameprep() raises propagates unchanged.
    """
    try:
        # Step 1: try ASCII
        label_ascii = label.encode("ascii")
    except UnicodeEncodeError:
        pass
    else:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if 0 < len(label_ascii) < 64:
            return label_ascii
        if len(label) == 0:
            raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
        raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

    # Step 2: nameprep
    label = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        label_ascii = label.encode("ascii")
    except UnicodeEncodeError:
        pass
    else:
        # Skip to step 8.
        # An ASCII-encodable str has one byte per character, so checking
        # len(label) here is equivalent to checking len(label_ascii).
        if 0 < len(label) < 64:
            return label_ascii
        if len(label) == 0:
            raise UnicodeEncodeError("idna", label, 0, 1, "label empty")
        raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

    # Step 5: Check ACE prefix
    # The comparison is case-insensitive, hence the lower().
    if label.lower().startswith(sace_prefix):
        raise UnicodeEncodeError(
            "idna", label, 0, len(sace_prefix), "Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    label_ascii = label.encode("punycode")

    # Step 7: Prepend ACE prefix
    label_ascii = ace_prefix + label_ascii

    # Step 8: Check size
    # do not check for empty as we prepend ace_prefix.
    if len(label_ascii) < 64:
        return label_ascii
    raise UnicodeEncodeError("idna", label, 0, len(label), "label too long")

def ToUnicode(label):
if len(label) > 1024:
Expand All @@ -110,41 +124,51 @@ def ToUnicode(label):
# per https://www.rfc-editor.org/rfc/rfc3454#section-3.1 while still
# preventing us from wasting time decoding a big thing that'll just
# hit the actual <= 63 length limit in Step 6.
raise UnicodeError("label way too long")
if isinstance(label, str):
label = label.encode("utf-8", errors="backslashreplace")
raise UnicodeDecodeError("idna", label, 0, len(label), "label way too long")
# Step 1: Check for ASCII
if isinstance(label, bytes):
pure_ascii = True
else:
try:
label = label.encode("ascii")
pure_ascii = True
except UnicodeError:
except UnicodeEncodeError:
pure_ascii = False
if not pure_ascii:
assert isinstance(label, str)
# Step 2: Perform nameprep
label = nameprep(label)
# It doesn't say this, but apparently, it should be ASCII now
try:
label = label.encode("ascii")
except UnicodeError:
raise UnicodeError("Invalid character in IDN label")
except UnicodeEncodeError as exc:
raise UnicodeEncodeError("idna", label, exc.start, exc.end,
"Invalid character in IDN label")
# Step 3: Check for ACE prefix
if not label.startswith(ace_prefix):
assert isinstance(label, bytes)
if not label.lower().startswith(ace_prefix):
return str(label, "ascii")

# Step 4: Remove ACE prefix
label1 = label[len(ace_prefix):]

# Step 5: Decode using PUNYCODE
result = label1.decode("punycode")
try:
result = label1.decode("punycode")
except UnicodeDecodeError as exc:
offset = len(ace_prefix)
raise UnicodeDecodeError("idna", label, offset+exc.start, offset+exc.end, exc.reason)

# Step 6: Apply ToASCII
label2 = ToASCII(result)

# Step 7: Compare the result of step 6 with the one of step 3
# label2 will already be in lower case.
if str(label, "ascii").lower() != str(label2, "ascii"):
raise UnicodeError("IDNA does not round-trip", label, label2)
raise UnicodeDecodeError("idna", label, 0, len(label),
f"IDNA does not round-trip, '{label!r}' != '{label2!r}'")

# Step 8: return the result of step 5
return result
Expand All @@ -156,7 +180,7 @@ def encode(self, input, errors='strict'):

if errors != 'strict':
# IDNA is quite clear that implementations must be strict
raise UnicodeError("unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return b'', 0
Expand All @@ -168,11 +192,16 @@ def encode(self, input, errors='strict'):
else:
# ASCII name: fast path
labels = result.split(b'.')
for label in labels[:-1]:
if not (0 < len(label) < 64):
raise UnicodeError("label empty or too long")
if len(labels[-1]) >= 64:
raise UnicodeError("label too long")
for i, label in enumerate(labels[:-1]):
if len(label) == 0:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError("idna", input, offset, offset+1,
"label empty")
for i, label in enumerate(labels):
if len(label) >= 64:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError("idna", input, offset, offset+len(label),
"label too long")
return result, len(input)

result = bytearray()
Expand All @@ -182,17 +211,27 @@ def encode(self, input, errors='strict'):
del labels[-1]
else:
trailing_dot = b''
for label in labels:
for i, label in enumerate(labels):
if result:
# Join with U+002E
result.extend(b'.')
result.extend(ToASCII(label))
try:
result.extend(ToASCII(label))
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
offset = sum(len(l) for l in labels[:i]) + i
raise UnicodeEncodeError(
"idna",
input,
offset + exc.start,
offset + exc.end,
exc.reason,
)
return bytes(result+trailing_dot), len(input)

def decode(self, input, errors='strict'):

if errors != 'strict':
raise UnicodeError("Unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return "", 0
Expand All @@ -202,7 +241,7 @@ def decode(self, input, errors='strict'):
# XXX obviously wrong, see #3232
input = bytes(input)

if ace_prefix not in input:
if ace_prefix not in input.lower():
# Fast path
try:
return input.decode('ascii'), len(input)
Expand All @@ -218,16 +257,23 @@ def decode(self, input, errors='strict'):
trailing_dot = ''

result = []
for label in labels:
result.append(ToUnicode(label))
for i, label in enumerate(labels):
try:
u_label = ToUnicode(label)
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
offset = sum(len(x) for x in labels[:i]) + len(labels[:i])
raise UnicodeDecodeError(
"idna", input, offset+exc.start, offset+exc.end, exc.reason)
else:
result.append(u_label)

return ".".join(result)+trailing_dot, len(input)

class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
def _buffer_encode(self, input, errors, final):
if errors != 'strict':
# IDNA is quite clear that implementations must be strict
raise UnicodeError("unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return (b'', 0)
Expand All @@ -251,7 +297,16 @@ def _buffer_encode(self, input, errors, final):
# Join with U+002E
result.extend(b'.')
size += 1
result.extend(ToASCII(label))
try:
result.extend(ToASCII(label))
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeEncodeError(
"idna",
input,
size + exc.start,
size + exc.end,
exc.reason,
)
size += len(label)

result += trailing_dot
Expand All @@ -261,7 +316,7 @@ def _buffer_encode(self, input, errors, final):
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
def _buffer_decode(self, input, errors, final):
if errors != 'strict':
raise UnicodeError("Unsupported error handling "+errors)
raise UnicodeError(f"Unsupported error handling: {errors}")

if not input:
return ("", 0)
Expand All @@ -271,7 +326,11 @@ def _buffer_decode(self, input, errors, final):
labels = dots.split(input)
else:
# Must be ASCII string
input = str(input, "ascii")
try:
input = str(input, "ascii")
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeDecodeError("idna", input,
exc.start, exc.end, exc.reason)
labels = input.split(".")

trailing_dot = ''
Expand All @@ -288,7 +347,18 @@ def _buffer_decode(self, input, errors, final):
result = []
size = 0
for label in labels:
result.append(ToUnicode(label))
try:
u_label = ToUnicode(label)
except (UnicodeEncodeError, UnicodeDecodeError) as exc:
raise UnicodeDecodeError(
"idna",
input.encode("ascii", errors="backslashreplace"),
size + exc.start,
size + exc.end,
exc.reason,
)
else:
result.append(u_label)
if size:
size += 1
size += len(label)
Expand Down
2 changes: 1 addition & 1 deletion Lib/encodings/palmos.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def getregentry():
'\u02dc' # 0x98 -> SMALL TILDE
'\u2122' # 0x99 -> TRADE MARK SIGN
'\u0161' # 0x9A -> LATIN SMALL LETTER S WITH CARON
'\x9b' # 0x9B -> <control>
'\u203a' # 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
'\u0153' # 0x9C -> LATIN SMALL LIGATURE OE
'\x9d' # 0x9D -> <control>
'\x9e' # 0x9E -> <control>
Expand Down
Loading
Loading