Commit c84d4d1

[3.12] gh-105390: Correctly raise TokenError instead of SyntaxError for tokenize errors (GH-105399) (#105439)
1 parent c607551 commit c84d4d1

6 files changed (+35, -24 lines)

Doc/library/tokenize.rst (-5)

@@ -139,11 +139,6 @@ function it uses to do this is available:
     2,
     3
 
-Note that unclosed single-quoted strings do not cause an error to be
-raised. They are tokenized as :data:`~token.ERRORTOKEN`, followed by the
-tokenization of their contents.
-
-
 .. _tokenize-cli:
 
 Command-Line Usage
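
The removed paragraph is obsolete after this commit: an unclosed single-quoted string no longer comes back as :data:`~token.ERRORTOKEN` followed by its contents; tokenizing it now fails. A minimal sketch of the new behaviour (the exact error message is not taken from this diff and may vary):

    import io
    import tokenize

    source = "s = 'unterminated\n"   # unclosed single-quoted string
    try:
        list(tokenize.generate_tokens(io.StringIO(source).readline))
    except tokenize.TokenError as exc:
        # Raised instead of yielding ERRORTOKEN tokens for the broken string.
        print(exc.args)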

Doc/whatsnew/3.12.rst (+6, -5)

@@ -1489,14 +1489,15 @@ Changes in the Python API
 Additionally, there may be some minor behavioral changes as a consecuence of the
 changes required to support :pep:`701`. Some of these changes include:
 
-* Some final ``DEDENT`` tokens are now emitted within the bounds of the
-  input. This means that for a file containing 3 lines, the old version of the
-  tokenizer returned a ``DEDENT`` token in line 4 whilst the new version returns
-  the token in line 3.
-
 * The ``type`` attribute of the tokens emitted when tokenizing some invalid Python
   characters such as ``!`` has changed from ``ERRORTOKEN`` to ``OP``.
 
+* Incomplete single-line strings now also raise :exc:`tokenize.TokenError` as incomplete
+  multiline strings do.
+
+* Some incomplete or invalid Python code now raises :exc:`tokenize.TokenError` instead of
+  returning arbitrary ``ERRORTOKEN`` tokens when tokenizing it.
+
 Build Changes
 =============
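
A short, hedged illustration of the remaining bullet about ``!``; the surrounding code is a sketch, and only the ERRORTOKEN-to-OP claim comes from the entry above:

    import io
    import tokenize

    # '!' on its own is not valid Python, but the 3.12 tokenizer now types it
    # as OP rather than ERRORTOKEN.
    for tok in tokenize.generate_tokens(io.StringIO("spam !\n").readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))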

Lib/test/test_tokenize.py (+6, -5)

@@ -3,7 +3,8 @@
 from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
-                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
+                     NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
+                     TokenError)
 from io import BytesIO, StringIO
 import unittest
 from textwrap import dedent
@@ -286,7 +287,7 @@ def number_token(s):
         for lit in INVALID_UNDERSCORE_LITERALS:
             try:
                 number_token(lit)
-            except SyntaxError:
+            except TokenError:
                 continue
             self.assertNotEqual(number_token(lit), lit)
 
@@ -1379,7 +1380,7 @@ def test_latin1_normalization(self):
         self.assertEqual(found, "iso-8859-1")
 
     def test_syntaxerror_latin1(self):
-        # Issue 14629: need to raise SyntaxError if the first
+        # Issue 14629: need to raise TokenError if the first
         # line(s) have non-UTF-8 characters
         lines = (
             b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
@@ -2754,7 +2755,7 @@ def get_tokens(string):
             "]",
         ]:
             with self.subTest(case=case):
-                self.assertRaises(SyntaxError, get_tokens, case)
+                self.assertRaises(TokenError, get_tokens, case)
 
     def test_max_indent(self):
         MAXINDENT = 100
@@ -2773,7 +2774,7 @@ def generate_source(indents):
 
         invalid = generate_source(MAXINDENT)
         the_input = StringIO(invalid)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
+        self.assertRaises(IndentationError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
         self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
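
A self-contained sketch of the assertion style the updated tests use; it is not part of the CPython suite, and get_tokens here is an assumed stand-in for the helper defined earlier in test_tokenize.py:

    import unittest
    from io import StringIO
    from tokenize import TokenError, _generate_tokens_from_c_tokenizer

    class TokenErrorAssertions(unittest.TestCase):
        def test_stray_closing_bracket(self):
            def get_tokens(string):
                # Assumed stand-in for the helper used by the real test.
                return list(_generate_tokens_from_c_tokenizer(StringIO(string).readline))
            # Mirrors the updated assertion: invalid input such as a stray "]"
            # now raises TokenError rather than SyntaxError.
            self.assertRaises(TokenError, get_tokens, "]")

    if __name__ == "__main__":
        unittest.main()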

Lib/tokenize.py (+18, -2)

@@ -517,14 +517,30 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise
 
+def _transform_msg(msg):
+    """Transform error messages from the C tokenizer into the Python tokenize
+
+    The C tokenizer is more picky than the Python one, so we need to massage
+    the error messages a bit for backwards compatibility.
+    """
+    if "unterminated triple-quoted string literal" in msg:
+        return "EOF in multi-line string"
+    return msg
+
 def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     if encoding is None:
         it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
     else:
         it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
-    for info in it:
-        yield TokenInfo._make(info)
+    try:
+        for info in it:
+            yield TokenInfo._make(info)
+    except SyntaxError as e:
+        if type(e) != SyntaxError:
+            raise e from None
+        msg = _transform_msg(e.msg)
+        raise TokenError(msg, (e.lineno, e.offset)) from None
 
 
 if __name__ == "__main__":
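
A brief sketch of what the new wrapper produces for the one message it rewrites: an unterminated triple-quoted string surfaces as the historical "EOF in multi-line string" TokenError, with a (lineno, offset) pair as the second argument:

    import io
    import tokenize

    try:
        list(tokenize.generate_tokens(io.StringIO('"""never closed').readline))
    except tokenize.TokenError as exc:
        message, location = exc.args
        print(message)    # expected: EOF in multi-line string (via _transform_msg)
        print(location)   # (lineno, offset) taken from the underlying SyntaxError
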
Misc/NEWS.d entry (new file, +3)

@@ -0,0 +1,3 @@
+Correctly raise :exc:`tokenize.TokenError` exceptions instead of
+:exc:`SyntaxError` for tokenize errors such as incomplete input. Patch by
+Pablo Galindo

Python/Python-tokenize.c (+2, -7)

@@ -84,13 +84,8 @@ _tokenizer_error(struct tok_state *tok)
             msg = "invalid token";
             break;
         case E_EOF:
-            if (tok->level > 0) {
-                PyErr_Format(PyExc_SyntaxError,
-                             "parenthesis '%c' was never closed",
-                             tok->parenstack[tok->level-1]);
-            } else {
-                PyErr_SetString(PyExc_SyntaxError, "unexpected EOF while parsing");
-            }
+            PyErr_SetString(PyExc_SyntaxError, "unexpected EOF in multi-line statement");
+            PyErr_SyntaxLocationObject(tok->filename, tok->lineno, tok->inp - tok->buf < 0 ? 0 : tok->inp - tok->buf);
             return -1;
         case E_DEDENT:
             msg = "unindent does not match any outer indentation level";
