Commit c687946

[3.12] gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (GH-105070) (#105119)
gh-105069: Add a readline-like callable to the tokenizer to consume input iteratively (GH-105070)

(cherry picked from commit 9216e69)

Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
1 parent 2f8c22f commit c687946
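
For context, this change drives the C tokenizer through a readline-like callable that is invoked once per line, instead of materialising the whole source first. A minimal sketch of the public entry points the change keeps working (plain stdlib usage, not code from the commit):

    import io
    import tokenize

    source = "x = 1\nprint(x)\n"

    # generate_tokens() takes a callable returning str lines ...
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        print(tok)

    # ... while tokenize() takes a callable returning bytes lines.
    for tok in tokenize.tokenize(io.BytesIO(source.encode("utf-8")).readline):
        print(tok)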

7 files changed: +274, -96 lines changed

Lib/inspect.py
Lines changed: 1 addition & 1 deletion

@@ -2203,7 +2203,7 @@ def _signature_strip_non_python_syntax(signature):
         add(string)
         if (string == ','):
             add(' ')
-    clean_signature = ''.join(text).strip()
+    clean_signature = ''.join(text).strip().replace("\n", "")
     return clean_signature, self_parameter
 
 
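The inspect.py tweak strips embedded newlines from the reconstructed signature text, presumably because the joined token text can now carry newlines in the middle of a signature that .strip() alone would not remove. A string-level illustration with made-up input (not from the commit):

    # hypothetical token texts accumulated by _signature_strip_non_python_syntax
    text = ['(', 'a', ',', '\n', ' ', 'b', ')']
    print(repr(''.join(text).strip()))                    # '(a,\n b)'
    print(repr(''.join(text).strip().replace("\n", "")))  # '(a, b)'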

Lib/test/test_tokenize.py
Lines changed: 96 additions & 49 deletions

@@ -1,6 +1,6 @@
 from test import support
 from test.support import os_helper
-from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
+from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo)
@@ -51,6 +51,25 @@ def check_tokenize(self, s, expected):
                          [" ENCODING 'utf-8' (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
 
+    def test_invalid_readline(self):
+        def gen():
+            yield "sdfosdg"
+            yield "sdfosdg"
+        with self.assertRaises(TypeError):
+            list(tokenize(gen().__next__))
+
+        def gen():
+            yield b"sdfosdg"
+            yield b"sdfosdg"
+        with self.assertRaises(TypeError):
+            list(generate_tokens(gen().__next__))
+
+        def gen():
+            yield "sdfosdg"
+            1/0
+        with self.assertRaises(ZeroDivisionError):
+            list(generate_tokens(gen().__next__))
+
     def test_implicit_newline(self):
         # Make sure that the tokenizer puts in an implicit NEWLINE
         # when the input lacks a trailing new line.
@@ -1161,7 +1180,8 @@ class TestTokenizerAdheresToPep0263(TestCase):
 
     def _testFile(self, filename):
         path = os.path.join(os.path.dirname(__file__), filename)
-        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
+        with open(path, 'rb') as f:
+            TestRoundtrip.check_roundtrip(self, f)
 
     def test_utf8_coding_cookie_and_no_utf8_bom(self):
         f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
@@ -1206,7 +1226,8 @@ def readline():
             yield b''
 
         # skip the initial encoding token and the end tokens
-        tokens = list(_tokenize(readline(), encoding='utf-8'))[:-2]
+        tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
+                                                        extra_tokens=True))[:-2]
         expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
@@ -1475,13 +1496,13 @@ def test_tokenize(self):
         def mock_detect_encoding(readline):
             return encoding, [b'first', b'second']
 
-        def mock__tokenize(readline, encoding):
+        def mock__tokenize(readline, encoding, **kwargs):
             nonlocal encoding_used
             encoding_used = encoding
             out = []
             while True:
                 try:
-                    next_line = next(readline)
+                    next_line = readline()
                 except StopIteration:
                     return out
                 if next_line:
@@ -1498,16 +1519,16 @@ def mock_readline():
             return str(counter).encode()
 
         orig_detect_encoding = tokenize_module.detect_encoding
-        orig__tokenize = tokenize_module._tokenize
+        orig_c_token = tokenize_module._generate_tokens_from_c_tokenizer
         tokenize_module.detect_encoding = mock_detect_encoding
-        tokenize_module._tokenize = mock__tokenize
+        tokenize_module._generate_tokens_from_c_tokenizer = mock__tokenize
         try:
             results = tokenize(mock_readline)
             self.assertEqual(list(results)[1:],
                              [b'first', b'second', b'1', b'2', b'3', b'4'])
         finally:
             tokenize_module.detect_encoding = orig_detect_encoding
-            tokenize_module._tokenize = orig__tokenize
+            tokenize_module._generate_tokens_from_c_tokenizer = orig_c_token
 
         self.assertEqual(encoding_used, encoding)
 
@@ -1834,12 +1855,33 @@ class CTokenizeTest(TestCase):
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
         # The ENDMARKER and final NEWLINE are omitted.
+        f = StringIO(s)
         with self.subTest(source=s):
             result = stringify_tokens_from_source(
-                _generate_tokens_from_c_tokenizer(s), s
+                _generate_tokens_from_c_tokenizer(f.readline), s
             )
             self.assertEqual(result, expected.rstrip().splitlines())
 
+    def test_encoding(self):
+        def readline(encoding):
+            yield "1+1".encode(encoding)
+
+        expected = [
+            TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
+            TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
+            TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
+            TokenInfo(type=NEWLINE, string='\n', start=(1, 3), end=(1, 4), line='1+1\n'),
+            TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
+        ]
+        for encoding in ["utf-8", "latin-1", "utf-16"]:
+            with self.subTest(encoding=encoding):
+                tokens = list(_generate_tokens_from_c_tokenizer(
+                    readline(encoding).__next__,
+                    extra_tokens=True,
+                    encoding=encoding,
+                ))
+                self.assertEqual(tokens, expected)
+
     def test_int(self):
 
         self.check_tokenize('0xff <= 255', """\
@@ -2675,43 +2717,44 @@ def test_unicode(self):
 
     def test_invalid_syntax(self):
         def get_tokens(string):
-            return list(_generate_tokens_from_c_tokenizer(string))
-
-        self.assertRaises(SyntaxError, get_tokens, "(1+2]")
-        self.assertRaises(SyntaxError, get_tokens, "(1+2}")
-        self.assertRaises(SyntaxError, get_tokens, "{1+2]")
-
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "\xa0")
-        self.assertRaises(SyntaxError, get_tokens, "€")
-
-        self.assertRaises(SyntaxError, get_tokens, "0b12")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_2")
-        self.assertRaises(SyntaxError, get_tokens, "0b2")
-        self.assertRaises(SyntaxError, get_tokens, "0b1_")
-        self.assertRaises(SyntaxError, get_tokens, "0b")
-        self.assertRaises(SyntaxError, get_tokens, "0o18")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_8")
-        self.assertRaises(SyntaxError, get_tokens, "0o8")
-        self.assertRaises(SyntaxError, get_tokens, "0o1_")
-        self.assertRaises(SyntaxError, get_tokens, "0o")
-        self.assertRaises(SyntaxError, get_tokens, "0x1_")
-        self.assertRaises(SyntaxError, get_tokens, "0x")
-        self.assertRaises(SyntaxError, get_tokens, "1_")
-        self.assertRaises(SyntaxError, get_tokens, "012")
-        self.assertRaises(SyntaxError, get_tokens, "1.2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e2_")
-        self.assertRaises(SyntaxError, get_tokens, "1e+")
-
-        self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
-        self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
-
-        self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
-        self.assertRaises(SyntaxError, get_tokens, "]")
+            the_string = StringIO(string)
+            return list(_generate_tokens_from_c_tokenizer(the_string.readline))
+
+        for case in [
+            "(1+2]",
+            "(1+2}",
+            "{1+2]",
+            "1_",
+            "1.2_",
+            "1e2_",
+            "1e+",
+
+            "\xa0",
+            "€",
+            "0b12",
+            "0b1_2",
+            "0b2",
+            "0b1_",
+            "0b",
+            "0o18",
+            "0o1_8",
+            "0o8",
+            "0o1_",
+            "0o",
+            "0x1_",
+            "0x",
+            "1_",
+            "012",
+            "1.2_",
+            "1e2_",
+            "1e+",
+            "'sdfsdf",
+            "'''sdfsdf''",
+            "("*1000+"a"+")"*1000,
+            "]",
+        ]:
+            with self.subTest(case=case):
+                self.assertRaises(SyntaxError, get_tokens, case)
 
     def test_max_indent(self):
         MAXINDENT = 100
@@ -2722,20 +2765,24 @@ def generate_source(indents):
             return source
 
         valid = generate_source(MAXINDENT - 1)
-        tokens = list(_generate_tokens_from_c_tokenizer(valid))
+        the_input = StringIO(valid)
+        tokens = list(_generate_tokens_from_c_tokenizer(the_input.readline))
         self.assertEqual(tokens[-2].type, DEDENT)
         self.assertEqual(tokens[-1].type, ENDMARKER)
         compile(valid, "<string>", "exec")
 
         invalid = generate_source(MAXINDENT)
-        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(invalid)))
+        the_input = StringIO(invalid)
+        self.assertRaises(SyntaxError, lambda: list(_generate_tokens_from_c_tokenizer(the_input.readline)))
        self.assertRaises(
             IndentationError, compile, invalid, "<string>", "exec"
         )
 
     def test_continuation_lines_indentation(self):
         def get_tokens(string):
-            return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
+            the_string = StringIO(string)
+            return [(kind, string) for (kind, string, *_)
+                    in _generate_tokens_from_c_tokenizer(the_string.readline)]
 
         code = dedent("""
         def fib(n):
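
Taken together, the new test_invalid_readline and test_encoding cases above pin down the contract for the readline callable: its return type must match the API (str for generate_tokens(), bytes for tokenize()), and any exception it raises propagates to the caller. A short sketch of that behaviour, assuming Python 3.12 with this change applied:

    import tokenize

    def str_lines():
        yield "a = 1\n"          # str lines, but tokenize() expects bytes lines

    try:
        list(tokenize.tokenize(str_lines().__next__))
    except TypeError as exc:
        print("rejected:", exc)

    def failing_lines():
        yield "a = 1\n"
        1 / 0                    # error raised while the tokenizer is pulling lines

    try:
        list(tokenize.generate_tokens(failing_lines().__next__))
    except ZeroDivisionError:
        print("the callable's exception reached the caller")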

Lib/tokenize.py
Lines changed: 11 additions & 21 deletions

@@ -34,6 +34,7 @@
 import sys
 from token import *
 from token import EXACT_TOKEN_TYPES
+import _tokenize
 
 cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@@ -443,29 +444,15 @@ def tokenize(readline):
             # BOM will already have been stripped.
             encoding = "utf-8"
         yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    yield from _tokenize(rl_gen, encoding)
-
-def _tokenize(rl_gen, encoding):
-    source = b"".join(rl_gen).decode(encoding)
-    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
-        yield token
+    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
 
 def generate_tokens(readline):
     """Tokenize a source reading Python code as unicode strings.
 
     This has the same API as tokenize(), except that it expects the *readline*
     callable to return str objects instead of bytes.
     """
-    def _gen():
-        while True:
-            try:
-                line = readline()
-            except StopIteration:
-                return
-            if not line:
-                return
-            yield line.encode()
-    return _tokenize(_gen(), 'utf-8')
+    return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
 
 def main():
     import argparse
@@ -502,9 +489,9 @@ def error(message, filename=None, location=None):
             tokens = list(tokenize(f.readline))
         else:
             filename = "<stdin>"
-            tokens = _tokenize(
+            tokens = _generate_tokens_from_c_tokenizer(
                 (x.encode('utf-8') for x in iter(sys.stdin.readline, "")
-                 ), "utf-8")
+                 ), "utf-8", extra_tokens=True)
 
 
         # Output the tokenization
@@ -531,10 +518,13 @@ def error(message, filename=None, location=None):
         perror("unexpected error: %s" % err)
         raise
 
-def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
+def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
-    import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
+    if encoding is None:
+        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
+    else:
+        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
+    for info in it:
         yield TokenInfo._make(info)
 
 
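The last hunk gives the private helper its new signature, _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False), where source is now a readline-style callable rather than a string. A rough usage sketch mirroring the tests above (private API, subject to change):

    import io
    from tokenize import _generate_tokens_from_c_tokenizer

    # Without an encoding the callable must return str lines, which is how
    # generate_tokens() now delegates to the C tokenizer.
    src = io.StringIO("1+1\n")
    print(list(_generate_tokens_from_c_tokenizer(src.readline)))

    # With an explicit encoding the callable must return bytes in that
    # encoding; the C tokenizer decodes them itself (the tokenize() path).
    def lines():
        yield "1+1\n".encode("latin-1")

    print(list(_generate_tokens_from_c_tokenizer(lines().__next__,
                                                 encoding="latin-1",
                                                 extra_tokens=True)))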
