Commit a135a6d

gh-112943: Correctly compute end offsets for multiline tokens in the tokenize module (#112949)
1 parent: 4c5b9c1

5 files changed (+25 -6)

Lib/test/test_tokenize.py (+10)
@@ -615,6 +615,16 @@ def test_string(self):
     OP '}' (3, 0) (3, 1)
     FSTRING_MIDDLE '__' (3, 1) (3, 3)
     FSTRING_END "'" (3, 3) (3, 4)
+    """)
+
+        self.check_tokenize("""\
+    '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli
+    aktualni pracownicy, obecni pracownicy'''
+""", """\
+    INDENT '    ' (1, 0) (1, 4)
+    STRING "'''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\\n    aktualni pracownicy, obecni pracownicy'''" (1, 4) (2, 45)
+    NEWLINE '\\n' (2, 45) (2, 46)
+    DEDENT '' (3, 0) (3, 0)
     """)
 
     def test_function(self):
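The new test exercises the reported case: a multiline triple-quoted string whose first line contains non-ASCII (multi-byte UTF-8) characters. As a quick illustration outside the test suite, the sketch below (not part of the commit; it uses only the documented tokenize and io APIs) tokenizes similar input and prints the STRING token's start and end positions, which with this fix come out as (1, 4) and (2, 45):

    import io
    import tokenize

    source = (
        "    '''Autorzy, którzy tą jednostkę mają wpisani jako AKTUALNA -- czyli\n"
        "    aktualni pracownicy, obecni pracownicy'''\n"
    )

    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type == tokenize.STRING:
            # The end column is counted in characters on the line where the token ends.
            print(tok.start, tok.end)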
Misc/NEWS.d entry (new file, +2)

@@ -0,0 +1,2 @@
+Correctly compute end column offsets for multiline tokens in the
+:mod:`tokenize` module. Patch by Pablo Galindo
Parser/pegen.c (+11 -5)
@@ -19,12 +19,8 @@ _PyPegen_interactive_exit(Parser *p)
 }
 
 Py_ssize_t
-_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
 {
-    const char *str = PyUnicode_AsUTF8(line);
-    if (!str) {
-        return -1;
-    }
     Py_ssize_t len = strlen(str);
     if (col_offset > len + 1) {
         col_offset = len + 1;
@@ -39,6 +35,16 @@ _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
     return size;
 }
 
+Py_ssize_t
+_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
+{
+    const char *str = PyUnicode_AsUTF8(line);
+    if (!str) {
+        return -1;
+    }
+    return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
+}
+
 // Here, mark is the start of the node, while p->mark is the end.
 // If node==NULL, they should be the same.
 int
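The refactoring above splits the conversion in two: the new `_raw` variant works on a plain `const char *` buffer, while the original function keeps its `PyObject *` signature and merely extracts the UTF-8 buffer before delegating. The conversion itself counts how many characters fall within the first `col_offset` bytes of a UTF-8 line. A rough Python model of that idea (an illustration only, assuming "replace"-style error handling; it is not the C implementation):

    def byte_offset_to_character_offset(line: bytes, col_offset: int) -> int:
        """Rough model: characters contained in the first col_offset bytes of a UTF-8 line."""
        col_offset = min(col_offset, len(line) + 1)  # clamp, as the C code does via strlen()
        return len(line[:col_offset].decode("utf-8", errors="replace"))

    # 'ą' and 'ę' are one character but two UTF-8 bytes each, so byte columns and
    # character columns diverge: the first 17 bytes here are only 15 characters.
    print(byte_offset_to_character_offset("tą jednostkę mają".encode("utf-8"), 17))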

Parser/pegen.h (+1)
@@ -149,6 +149,7 @@ expr_ty _PyPegen_name_token(Parser *p);
 expr_ty _PyPegen_number_token(Parser *p);
 void *_PyPegen_string_token(Parser *p);
 Py_ssize_t _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset);
+Py_ssize_t _PyPegen_byte_offset_to_character_offset_raw(const char*, Py_ssize_t col_offset);
 
 // Error handling functions and APIs
 typedef enum {

Python/Python-tokenize.c (+1 -1)
@@ -225,7 +225,7 @@ tokenizeriter_next(tokenizeriterobject *it)
         col_offset = _PyPegen_byte_offset_to_character_offset(line, token.start - line_start);
     }
     if (token.end != NULL && token.end >= it->tok->line_start) {
-        end_col_offset = _PyPegen_byte_offset_to_character_offset(line, token.end - it->tok->line_start);
+        end_col_offset = _PyPegen_byte_offset_to_character_offset_raw(it->tok->line_start, token.end - it->tok->line_start);
     }
 
     if (it->tok->tok_extra_tokens) {
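This one-line change is the actual fix. The byte offset `token.end - it->tok->line_start` is measured from the start of the line on which the token ends, but the old code converted it against `line`, which for multiline tokens is not guaranteed to start at that same position; once the earlier lines contain multi-byte UTF-8 characters, the character count comes out wrong. Passing the raw `it->tok->line_start` buffer keeps the offset and the text it is applied to on the same base. A small Python model of the mismatch (the variable names are hypothetical stand-ins for the C state, not the interpreter's data structures):

    def chars_in_prefix(buf: bytes, nbytes: int) -> int:
        """Characters contained in the first nbytes bytes of a UTF-8 buffer."""
        return len(buf[:nbytes].decode("utf-8", errors="replace"))

    # A two-line token whose first line holds multi-byte characters.
    token = "'''tą jednostkę\naktualni pracownicy'''".encode("utf-8")
    last_line_start = token.index(b"\n") + 1        # stand-in for it->tok->line_start
    end_byte_offset = len(token) - last_line_start  # stand-in for token.end - it->tok->line_start

    # Old behaviour: offset relative to the last line, applied to a buffer that starts earlier.
    print(chars_in_prefix(token, end_byte_offset))                    # wrong end column
    # Fixed behaviour: buffer and offset share the same base.
    print(chars_in_prefix(token[last_line_start:], end_byte_offset))  # correct end column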
