@@ -32,6 +32,11 @@ typedef struct
32
32
{
33
33
PyObject_HEAD struct tok_state * tok ;
34
34
int done ;
35
+
36
+ /* Needed to cache line for performance */
37
+ PyObject * last_line ;
38
+ Py_ssize_t last_lineno ;
39
+ Py_ssize_t byte_col_offset_diff ;
35
40
} tokenizeriterobject ;
36
41
37
42
/*[clinic input]
@@ -68,6 +73,11 @@ tokenizeriter_new_impl(PyTypeObject *type, PyObject *readline,
68
73
self -> tok -> tok_extra_tokens = 1 ;
69
74
}
70
75
self -> done = 0 ;
76
+
77
+ self -> last_line = NULL ;
78
+ self -> byte_col_offset_diff = 0 ;
79
+ self -> last_lineno = 0 ;
80
+
71
81
return (PyObject * )self ;
72
82
}
73
83
@@ -210,7 +220,18 @@ tokenizeriter_next(tokenizeriterobject *it)
210
220
if (size >= 1 && it -> tok -> implicit_newline ) {
211
221
size -= 1 ;
212
222
}
213
- line = PyUnicode_DecodeUTF8 (line_start , size , "replace" );
223
+
224
+ if (it -> tok -> lineno != it -> last_lineno ) {
225
+ // Line has changed since last token, so we fetch the new line and cache it
226
+ // in the iter object.
227
+ Py_XDECREF (it -> last_line );
228
+ line = PyUnicode_DecodeUTF8 (line_start , size , "replace" );
229
+ it -> last_line = line ;
230
+ it -> byte_col_offset_diff = 0 ;
231
+ } else {
232
+ // Line hasn't changed so we reuse the cached one.
233
+ line = it -> last_line ;
234
+ }
214
235
}
215
236
if (line == NULL ) {
216
237
Py_DECREF (str );
@@ -219,13 +240,28 @@ tokenizeriter_next(tokenizeriterobject *it)
219
240
220
241
Py_ssize_t lineno = ISSTRINGLIT (type ) ? it -> tok -> first_lineno : it -> tok -> lineno ;
221
242
Py_ssize_t end_lineno = it -> tok -> lineno ;
243
+ it -> last_lineno = lineno ;
244
+
222
245
Py_ssize_t col_offset = -1 ;
223
246
Py_ssize_t end_col_offset = -1 ;
247
+ Py_ssize_t byte_offset = -1 ;
224
248
if (token .start != NULL && token .start >= line_start ) {
225
- col_offset = _PyPegen_byte_offset_to_character_offset (line , token .start - line_start );
249
+ byte_offset = token .start - line_start ;
250
+ col_offset = byte_offset - it -> byte_col_offset_diff ;
226
251
}
227
252
if (token .end != NULL && token .end >= it -> tok -> line_start ) {
228
- end_col_offset = _PyPegen_byte_offset_to_character_offset_raw (it -> tok -> line_start , token .end - it -> tok -> line_start );
253
+ Py_ssize_t end_byte_offset = token .end - it -> tok -> line_start ;
254
+ if (lineno == end_lineno ) {
255
+ // If the whole token is at the same line, we can just use the token.start
256
+ // buffer for figuring out the new column offset, since using line is not
257
+ // performant for very long lines.
258
+ Py_ssize_t token_col_offset = _PyPegen_byte_offset_to_character_offset_line (line , byte_offset , end_byte_offset );
259
+ end_col_offset = col_offset + token_col_offset ;
260
+ it -> byte_col_offset_diff += token .end - token .start - token_col_offset ;
261
+ } else {
262
+ end_col_offset = _PyPegen_byte_offset_to_character_offset_raw (it -> tok -> line_start , end_byte_offset );
263
+ it -> byte_col_offset_diff += end_byte_offset - end_col_offset ;
264
+ }
229
265
}
230
266
231
267
if (it -> tok -> tok_extra_tokens ) {
@@ -262,7 +298,7 @@ tokenizeriter_next(tokenizeriterobject *it)
262
298
}
263
299
}
264
300
265
- result = Py_BuildValue ("(iN(nn)(nn)N )" , type , str , lineno , col_offset , end_lineno , end_col_offset , line );
301
+ result = Py_BuildValue ("(iN(nn)(nn)O )" , type , str , lineno , col_offset , end_lineno , end_col_offset , line );
266
302
exit :
267
303
_PyToken_Free (& token );
268
304
if (type == ENDMARKER ) {
0 commit comments