Skip to content

Commit 7ad793e

Browse files
authored
gh-125553: Fix backslash continuation in untokenize (#126010)
1 parent a4760ef commit 7ad793e

File tree

3 files changed

+49
-6
lines changed

3 files changed

+49
-6
lines changed

Lib/test/test_tokenize.py

+27
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import re
23
import token
34
import tokenize
45
import unittest
@@ -1819,6 +1820,22 @@ def test_iter_compat(self):
18191820
self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
18201821

18211822

1823+
def contains_ambiguous_backslash(source):
    """Report whether *source* (bytes) has a backslash alone on a line.

    Example of such code::

        a = (1
            \\
        )

    Sources like this cannot round-trip through untokenize exactly:
    the tokenizer emits no token for the backslash-only line, so its
    indentation cannot be recovered.
    """
    return re.search(br'\n\s*\\\r?\n', source) is not None
1837+
1838+
18221839
class TestRoundtrip(TestCase):
18231840

18241841
def check_roundtrip(self, f):
@@ -1829,6 +1846,9 @@ def check_roundtrip(self, f):
18291846
tokenize.untokenize(), and the latter tokenized again to 2-tuples.
18301847
The test fails if the 3 pair tokenizations do not match.
18311848
1849+
If the source code can be untokenized unambiguously, the
1850+
untokenized code must match the original code exactly.
1851+
18321852
When untokenize bugs are fixed, untokenize with 5-tuples should
18331853
reproduce code that does not contain a backslash continuation
18341854
following spaces. A proper test should test this.
@@ -1852,6 +1872,13 @@ def check_roundtrip(self, f):
18521872
tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
18531873
self.assertEqual(tokens2_from5, tokens2)
18541874

1875+
if not contains_ambiguous_backslash(code):
1876+
# The BOM does not produce a token so there is no way to preserve it.
1877+
code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
1878+
readline = iter(code_without_bom.splitlines(keepends=True)).__next__
1879+
untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
1880+
self.assertEqual(code_without_bom, untokenized_code)
1881+
18551882
def check_line_extraction(self, f):
18561883
if isinstance(f, str):
18571884
code = f.encode('utf-8')

Lib/tokenize.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -169,21 +169,36 @@ def __init__(self):
169169
self.prev_row = 1
170170
self.prev_col = 0
171171
self.prev_type = None
172+
self.prev_line = ""
172173
self.encoding = None
173174

174175
def add_whitespace(self, start):
175176
row, col = start
176177
if row < self.prev_row or row == self.prev_row and col < self.prev_col:
177178
raise ValueError("start ({},{}) precedes previous end ({},{})"
178179
.format(row, col, self.prev_row, self.prev_col))
179-
row_offset = row - self.prev_row
180-
if row_offset:
181-
self.tokens.append("\\\n" * row_offset)
182-
self.prev_col = 0
180+
self.add_backslash_continuation(start)
183181
col_offset = col - self.prev_col
184182
if col_offset:
185183
self.tokens.append(" " * col_offset)
186184

185+
def add_backslash_continuation(self, start):
    """Emit backslash-continuation text when the row advanced without
    an intervening newline token.

    The indentation that preceded the previous line's backslash is
    re-inserted ahead of the emitted continuation(s), and the line
    ending style (``\\r\\n`` vs ``\\n``) of the previous line is kept.
    """
    rows_skipped = start[0] - self.prev_row
    if not rows_skipped:
        # Still on the same row: nothing to emit.
        return

    # Preserve the previous line's end-of-line convention.
    eol = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
    # Drop the trailing backslash/newline chars, then collect the
    # whitespace run that sat just before the backslash.
    body = self.prev_line.rstrip('\\\r\n')
    indent = []
    for ch in reversed(body):
        if not ch.isspace():
            break
        indent.append(ch)
    self.tokens.append(''.join(indent) + f"\\{eol}" * rows_skipped)
    self.prev_col = 0
201+
187202
def escape_brackets(self, token):
188203
characters = []
189204
consume_until_next_bracket = False
@@ -243,8 +258,6 @@ def untokenize(self, iterable):
243258
end_line, end_col = end
244259
extra_chars = last_line.count("{{") + last_line.count("}}")
245260
end = (end_line, end_col + extra_chars)
246-
elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
247-
self.tokens.append(" ")
248261

249262
self.add_whitespace(start)
250263
self.tokens.append(token)
@@ -253,6 +266,7 @@ def untokenize(self, iterable):
253266
self.prev_row += 1
254267
self.prev_col = 0
255268
self.prev_type = tok_type
269+
self.prev_line = line
256270
return "".join(self.tokens)
257271

258272
def compat(self, token, iterable):
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Fix round-trip invariance for backslash continuations in
2+
:func:`tokenize.untokenize`.

0 commit comments

Comments
 (0)