Skip to content

Commit 786defc

Browse files
committed
Improve some code comments, refactor some code, mention length checks in the README
1 parent 814b488 commit 786defc

File tree

4 files changed

+111
-77
lines changed

4 files changed

+111
-77
lines changed

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,14 @@ they are unnecessary. For IPv6 domain literals, the IPv6 address is
315315
normalized to condensed form. [RFC 2142](https://datatracker.ietf.org/doc/html/rfc2142)
316316
also requires lowercase normalization for some specific mailbox names like `postmaster@`.
317317

318+
### Length checks
319+
320+
This library checks that the length of the email address is not longer than
321+
the maximum length. The check is performed on the normalized form of the
322+
address, which might be different from a string provided by a user. If you
323+
send email to the original string and not the normalized address, the email
324+
might be rejected because the original address could be too long.
325+
318326
Examples
319327
--------
320328

email_validator/syntax.py

Lines changed: 76 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from .exceptions_types import EmailSyntaxError
22
from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
33
DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \
4-
DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS
4+
DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS, \
5+
QUOTED_LOCAL_PART_ADDR
56

67
import re
78
import unicodedata
@@ -10,6 +11,35 @@
1011
from typing import Optional
1112

1213

14+
def split_email(email):
    """Split an email address into its local part and domain part.

    Typical email addresses have a single @-sign, but the awkward
    "quoted string" local part form (RFC 5321 4.1.2) allows @-signs
    (and escaped quotes) to appear in the local part if the local part
    is quoted.

    Returns a three-tuple ``(local_part, domain_part, is_quoted)`` where
    ``is_quoted`` is True when the address matched QUOTED_LOCAL_PART_ADDR
    and False otherwise. For a quoted address, the returned local part has
    its backslash-escaping removed.

    Raises EmailSyntaxError if the address is not in the quoted form and
    does not contain exactly one @-sign.
    """
    if m := QUOTED_LOCAL_PART_ADDR.match(email):
        # The address is in the quoted form: the pattern's two groups
        # are the local part and the domain part.
        local_part, domain_part = m.groups()

        # Since backslash-escaping is no longer needed because
        # the quotes are removed, remove backslash-escaping
        # to return in the normalized form. (The module-level
        # "import re" is used; no function-scope import is needed.)
        local_part = re.sub(r"\\(.)", r"\1", local_part)

        return local_part, domain_part, True

    # Not quoted: split at the one and only at-sign.
    parts = email.split('@')
    if len(parts) != 2:
        raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.")
    local_part, domain_part = parts
    return local_part, domain_part, False
41+
42+
1343
def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH):
1444
"""Helper function to return an error message related to invalid length."""
1545
diff = len(addr) - limit
@@ -367,7 +397,7 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera
367397
raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")
368398

369399
if globally_deliverable:
370-
# All publicly deliverable addresses have domain named with at least
400+
# All publicly deliverable addresses have domain names with at least
371401
# one period, at least for gTLDs created since 2013 (per the ICANN Board
372402
# New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
373403
# We'll consider the lack of a period a syntax error
@@ -428,7 +458,48 @@ def validate_email_domain_name(domain, test_environment=False, globally_delivera
428458
}
429459

430460

431-
def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
461+
def validate_email_length(addrinfo):
    """Check the overall length of a parsed email address.

    Reads ``addrinfo.ascii_email`` and ``addrinfo.normalized`` and raises
    EmailSyntaxError if either the ASCII form or the UTF-8 encoding of the
    normalized form is longer than EMAIL_MAX_LENGTH. Returns nothing when
    the address passes both checks.
    """
    # First check: the ASCII form, if there is one. We assume the address
    # may be transmitted in ASCII (SMTPUTF8 can't be assumed on every hop
    # to the destination), in which case the limit counts ASCII characters,
    # which is the same as octets. The internationalized form may have far
    # fewer characters (IDNA ASCII is verbose), and the number of octets
    # over the limit need not equal the number of characters over it, so
    # for an internationalized address no simple explanation of the excess
    # can be given.
    ascii_form = addrinfo.ascii_email
    if ascii_form and len(ascii_form) > EMAIL_MAX_LENGTH:
        unicode_form = addrinfo.normalized
        if ascii_form == unicode_form:
            reason = get_length_reason(ascii_form)
        elif len(unicode_form) > EMAIL_MAX_LENGTH:
            # With more than the limit in characters, the ASCII
            # form is definitely going to be too long as well.
            reason = get_length_reason(unicode_form, utf8=True)
        else:
            reason = "(when converted to IDNA ASCII)"
        raise EmailSyntaxError(f"The email address is too long {reason}.")

    # Second check: the UTF-8 encoding of the normalized form (i.e. not
    # IDNA ASCII and not a count of Unicode characters). If the address is
    # transmitted using SMTPUTF8, the limit probably applies to the UTF-8
    # encoded octets. When an ASCII form exists and differs from the
    # internationalized form, the internationalized form is not expected
    # to be longer, so the check above would suffice; when there is no
    # ASCII form, the UTF-8 encoding must be checked. It can be up to
    # about four times longer than the number of characters.
    #
    # See the length checks on the local part and the domain.
    unicode_form = addrinfo.normalized
    if len(unicode_form.encode("utf8")) > EMAIL_MAX_LENGTH:
        # With more than the limit in characters, the UTF-8 encoding is
        # definitely too long; otherwise only the byte encoding is.
        reason = (
            get_length_reason(unicode_form, utf8=True)
            if len(unicode_form) > EMAIL_MAX_LENGTH
            else "(when encoded in bytes)"
        )
        raise EmailSyntaxError(f"The email address is too long {reason}.")
500+
501+
502+
def validate_email_domain_literal(domain_literal):
432503
# This is obscure domain-literal syntax. Parse it and return
433504
# a compressed/normalized address.
434505
# RFC 5321 4.1.3 and RFC 5322 3.4.1.
@@ -441,8 +512,6 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
441512
addr = ipaddress.IPv4Address(domain_literal)
442513
except ValueError as e:
443514
raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.")
444-
if not allow_domain_literal:
445-
raise EmailSyntaxError("A bracketed IPv4 address after the @-sign is not allowed here.")
446515

447516
# Return the IPv4Address object and the domain back unchanged.
448517
return {
@@ -456,8 +525,6 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
456525
addr = ipaddress.IPv6Address(domain_literal[5:])
457526
except ValueError as e:
458527
raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).")
459-
if not allow_domain_literal:
460-
raise EmailSyntaxError("A bracketed IPv6 address after the @-sign is not allowed here.")
461528

462529
# Return the IPv6Address object and construct a normalized
463530
# domain literal.
@@ -466,6 +533,8 @@ def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
466533
"domain": f"[IPv6:{addr.compressed}]",
467534
}
468535

536+
# Nothing else is valid.
537+
469538
if ":" not in domain_literal:
470539
raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")
471540

email_validator/validate_email.py

Lines changed: 25 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from typing import Optional, Union
22

33
from .exceptions_types import EmailSyntaxError, ValidatedEmail
4-
from .syntax import validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, get_length_reason
5-
from .rfc_constants import EMAIL_MAX_LENGTH, QUOTED_LOCAL_PART_ADDR, CASE_INSENSITIVE_MAILBOX_NAMES
4+
from .syntax import split_email, validate_email_local_part, validate_email_domain_name, validate_email_domain_literal, validate_email_length
5+
from .rfc_constants import CASE_INSENSITIVE_MAILBOX_NAMES
66

77

88
def validate_email(
@@ -20,9 +20,9 @@ def validate_email(
2020
dns_resolver: Optional[object] = None
2121
) -> ValidatedEmail:
2222
"""
23-
Validates an email address, raising an EmailNotValidError if the address is not valid or returning a dict of
24-
information when the address is valid. The email argument can be a str or a bytes instance,
25-
but if bytes it must be ASCII-only. This is the main method of this library.
23+
Given an email address, and some options, returns a ValidatedEmail instance
24+
with information about the address if it is valid or, if the address is not
25+
valid, raises an EmailNotValidError. This is the main function of the module.
2626
"""
2727

2828
# Fill in default values of arguments.
@@ -52,26 +52,13 @@ def validate_email(
5252
except ValueError:
5353
raise EmailSyntaxError("The email address is not valid ASCII.")
5454

55-
# Typical email addresses have a single @-sign, but the
56-
# awkward "quoted string" local part form (RFC 5321 4.1.2)
57-
# allows @-signs (and escaped quotes) to appear in the local
58-
# part if the local part is quoted. If the address is quoted,
59-
# split it at a non-escaped @-sign and unescape the escaping.
60-
quoted_local_part = False
61-
if m := QUOTED_LOCAL_PART_ADDR.match(email):
62-
quoted_local_part = True
63-
local_part, domain_part = m.groups()
64-
65-
# Remove backslashes.
66-
import re
67-
local_part = re.sub(r"\\(.)", "\\1", local_part)
68-
69-
else:
70-
# Split at the one and only at-sign.
71-
parts = email.split('@')
72-
if len(parts) != 2:
73-
raise EmailSyntaxError("The email address is not valid. It must have exactly one @-sign.")
74-
local_part, domain_part = parts
55+
# Split the address into the local part (before the @-sign)
56+
# and the domain part (after the @-sign). Normally, there
57+
# is only one @-sign. But the awkward "quoted string" local
58+
# part form (RFC 5321 4.1.2) allows @-signs in the local
59+
# part if the local part is quoted.
60+
local_part, domain_part, is_quoted_local_part \
61+
= split_email(email)
7562

7663
# Collect return values in this instance.
7764
ret = ValidatedEmail()
@@ -84,13 +71,17 @@ def validate_email(
8471
local_part_info = validate_email_local_part(local_part,
8572
allow_smtputf8=allow_smtputf8,
8673
allow_empty_local=allow_empty_local,
87-
quoted_local_part=quoted_local_part)
88-
if quoted_local_part and not allow_quoted_local:
89-
raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.")
74+
quoted_local_part=is_quoted_local_part)
9075
ret.local_part = local_part_info["local_part"]
9176
ret.ascii_local_part = local_part_info["ascii_local_part"]
9277
ret.smtputf8 = local_part_info["smtputf8"]
9378

79+
# If a quoted local part isn't allowed but is present, now raise an exception.
80+
# This is done after any exceptions raised by validate_email_local_part so
81+
# that mandatory checks have highest precedence.
82+
if is_quoted_local_part and not allow_quoted_local:
83+
raise EmailSyntaxError("Quoting the part before the @-sign is not allowed here.")
84+
9485
# Some local parts are required to be case-insensitive, so we should normalize
9586
# to lowercase.
9687
# RFC 2142
@@ -107,7 +98,9 @@ def validate_email(
10798

10899
elif domain_part.startswith("[") and domain_part.endswith("]"):
109100
# Parse the address in the domain literal and get back a normalized domain.
110-
domain_part_info = validate_email_domain_literal(domain_part[1:-1], allow_domain_literal=allow_domain_literal)
101+
domain_part_info = validate_email_domain_literal(domain_part[1:-1])
102+
if not allow_domain_literal:
103+
raise EmailSyntaxError("A bracketed IP address after the @-sign is not allowed here.")
111104
ret.domain = domain_part_info["domain"]
112105
ret.ascii_domain = domain_part_info["domain"] # Domain literals are always ASCII.
113106
ret.domain_address = domain_part_info["domain_address"]
@@ -131,48 +124,12 @@ def validate_email(
131124
else:
132125
ret.ascii_email = None
133126

134-
# If the email address has an ASCII representation, then we assume it may be
135-
# transmitted in ASCII (we can't assume SMTPUTF8 will be used on all hops to
136-
# the destination) and the length limit applies to ASCII characters (which is
137-
# the same as octets). The number of characters in the internationalized form
138-
# may be many fewer (because IDNA ASCII is verbose) and could be less than 254
139-
# Unicode characters, and of course the number of octets over the limit may
140-
# not be the number of characters over the limit, so if the email address is
141-
# internationalized, we can't give any simple information about why the address
142-
# is too long.
143-
#
144-
# In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not
145-
# Unicode characters) is at most 254 octets. If the addres is transmitted using
146-
# SMTPUTF8, then the length limit probably applies to the UTF-8 encoded octets.
147-
# If the email address has an ASCII form that differs from its internationalized
148-
# form, I don't think the internationalized form can be longer, and so the ASCII
149-
# form length check would be sufficient. If there is no ASCII form, then we have
150-
# to check the UTF-8 encoding. The UTF-8 encoding could be up to about four times
151-
# longer than the number of characters.
152-
#
153-
# See the length checks on the local part and the domain.
154-
if ret.ascii_email and len(ret.ascii_email) > EMAIL_MAX_LENGTH:
155-
if ret.ascii_email == ret.normalized:
156-
reason = get_length_reason(ret.ascii_email)
157-
elif len(ret.normalized) > EMAIL_MAX_LENGTH:
158-
# If there are more than 254 characters, then the ASCII
159-
# form is definitely going to be too long.
160-
reason = get_length_reason(ret.normalized, utf8=True)
161-
else:
162-
reason = "(when converted to IDNA ASCII)"
163-
raise EmailSyntaxError(f"The email address is too long {reason}.")
164-
if len(ret.normalized.encode("utf8")) > EMAIL_MAX_LENGTH:
165-
if len(ret.normalized) > EMAIL_MAX_LENGTH:
166-
# If there are more than 254 characters, then the UTF-8
167-
# encoding is definitely going to be too long.
168-
reason = get_length_reason(ret.normalized, utf8=True)
169-
else:
170-
reason = "(when encoded in bytes)"
171-
raise EmailSyntaxError(f"The email address is too long {reason}.")
127+
# Check the length of the address.
128+
validate_email_length(ret)
172129

173130
if check_deliverability and not test_environment:
174131
# Validate the email address's deliverability using DNS
175-
# and update the return dict with metadata.
132+
# and update the returned ValidatedEmail object with metadata.
176133

177134
if is_domain_literal:
178135
# There is nothing to check --- skip deliverability checks.

tests/test_syntax.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,9 +330,9 @@ def test_domain_literal():
330330
('me@xn--0.tld', 'The part after the @-sign is not valid IDNA (Invalid A-label).'),
331331
('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'),
332332
('me@yy--0.tld', 'An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.'),
333-
('me@[127.0.0.1]', 'A bracketed IPv4 address after the @-sign is not allowed here.'),
333+
('me@[127.0.0.1]', 'A bracketed IP address after the @-sign is not allowed here.'),
334334
('me@[127.0.0.999]', 'The address in brackets after the @-sign is not valid: It is not an IPv4 address (Octet 999 (> 255) not permitted in \'127.0.0.999\') or is missing an address literal tag.'),
335-
('me@[IPv6:::1]', 'A bracketed IPv6 address after the @-sign is not allowed here.'),
335+
('me@[IPv6:::1]', 'A bracketed IP address after the @-sign is not allowed here.'),
336336
('me@[IPv6:::G]', 'The IPv6 address in brackets after the @-sign is not valid (Only hex digits permitted in \'G\' in \'::G\').'),
337337
('me@[tag:text]', 'The part after the @-sign contains an invalid address literal tag in brackets.'),
338338
('me@[untaggedtext]', 'The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.'),

0 commit comments

Comments
 (0)