@@ -48,11 +48,10 @@ typedef struct _PyEncoderObject {
48
48
PyObject * indent ;
49
49
PyObject * key_separator ;
50
50
PyObject * item_separator ;
51
+ int (* fast_encode )(PyUnicodeWriter * , PyObject * );
51
52
bool sort_keys ;
52
53
bool skipkeys ;
53
54
bool allow_nan ;
54
- bool fast_encode ;
55
- bool ensure_ascii ; /* used only when fast_encode == true */
56
55
} PyEncoderObject ;
57
56
58
57
#define PyEncoderObject_CAST (op ) ((PyEncoderObject *)(op))
@@ -304,18 +303,20 @@ escape_unicode(PyObject *pystr)
304
303
return rval ;
305
304
}
306
305
307
- // Take a PyUnicode pystr and write an escaped string to writer.
306
+ #define ESCAPE_BUF_SIZE 200
307
+
308
+ // Take a PyUnicode pystr and write an escaped string to writer (ensure_ascii variant).
308
309
static int
309
- write_escaped_unicode (PyUnicodeWriter * writer , PyObject * pystr , bool ascii_only )
310
+ write_escaped_ascii (PyUnicodeWriter * writer , PyObject * pystr )
310
311
{
311
312
Py_ssize_t i ;
312
313
Py_ssize_t input_chars ;
313
- Py_ssize_t chars ;
314
- Py_ssize_t copy_len = 0 ;
314
+ Py_ssize_t buf_len ;
315
315
const void * input ;
316
+ Py_UCS4 c = 0 ;
316
317
int kind ;
317
318
int ret ;
318
- unsigned char buf [12 ];
319
+ char buf [ESCAPE_BUF_SIZE ]; // avoid overhead of PyUnicodeWriter APIs
319
320
320
321
input_chars = PyUnicode_GET_LENGTH (pystr );
321
322
input = PyUnicode_DATA (pystr );
@@ -324,27 +325,102 @@ write_escaped_unicode(PyUnicodeWriter *writer, PyObject *pystr, bool ascii_only)
324
325
ret = PyUnicodeWriter_WriteChar (writer , '"' );
325
326
if (ret ) return ret ;
326
327
328
+ // Fast path for strings that need no escaping at all, e.g. "id", "name"
327
329
for (i = 0 ; i < input_chars ; i ++ ) {
330
+ c = PyUnicode_READ (kind , input , i );
331
+ if (!S_CHAR (c )) {
332
+ break ;
333
+ }
334
+ }
335
+ if (i > 0 ) {
336
+ ret = PyUnicodeWriter_WriteSubstring (writer , pystr , 0 , i );
337
+ if (ret ) return ret ;
338
+ }
339
+ if (i == input_chars ) {
340
+ return PyUnicodeWriter_WriteChar (writer , '"' );
341
+ }
342
+
343
+ buf_len = ascii_escape_unichar (c , (unsigned char * )buf , 0 );
344
+
345
+ for (i ++ ; i < input_chars ; i ++ ) {
328
346
Py_UCS4 c = PyUnicode_READ (kind , input , i );
329
- if (c <= 0x1f || c == '\\' || c == '"' || (ascii_only && c >= 0x7f )) {
330
- ret = PyUnicodeWriter_WriteSubstring (writer , pystr , i - copy_len , i );
331
- if (ret ) return ret ;
332
- copy_len = 0 ;
347
+ if (S_CHAR (c )) {
348
+ buf [buf_len ++ ] = c ;
349
+ }
350
+ else {
351
+ buf_len = ascii_escape_unichar (c , (unsigned char * )buf , buf_len );
352
+ }
333
353
334
- chars = ascii_escape_unichar ( c , buf , 0 );
335
- ret = PyUnicodeWriter_WriteUTF8 (writer , ( const char * ) buf , chars );
354
+ if ( buf_len + 12 > ESCAPE_BUF_SIZE ) {
355
+ ret = PyUnicodeWriter_WriteUTF8 (writer , buf , buf_len );
336
356
if (ret ) return ret ;
357
+ buf_len = 0 ;
358
+ }
359
+ }
360
+
361
+ assert (buf_len < ESCAPE_BUF_SIZE );
362
+ buf [buf_len ++ ] = '"' ;
363
+ return PyUnicodeWriter_WriteUTF8 (writer , buf , buf_len );
364
+ }
365
+
366
+ static int
367
+ write_escaped_unicode (PyUnicodeWriter * writer , PyObject * pystr )
368
+ {
369
+ Py_ssize_t i ;
370
+ Py_ssize_t input_size ;
371
+ Py_ssize_t buf_len ;
372
+ const unsigned char * input ;
373
+ int ret ;
374
+ unsigned char c ;
375
+ char buf [ESCAPE_BUF_SIZE ];
376
+
377
+ // We don't need to escape non-ASCII chars.
378
+ // So we just copy UTF-8 from pystr to buf.
379
+ input = (const unsigned char * ) PyUnicode_AsUTF8AndSize (pystr , & input_size );
380
+
381
+ ret = PyUnicodeWriter_WriteChar (writer , '"' );
382
+ if (ret ) return ret ;
383
+
384
+ // Fast path for string doesn't need escape at all: e.g. "id", "name"
385
+ for (i = 0 ; i < input_size ; i ++ ) {
386
+ c = input [i ];
387
+ if (c <= 0x1f || c == '\\' || c == '"' ) {
388
+ break ;
389
+ }
390
+ }
391
+ if (i > 0 ) {
392
+ ret = PyUnicodeWriter_WriteUTF8 (writer , (const char * )input , i );
393
+ if (ret ) return ret ;
394
+ }
395
+ if (i == input_size ) {
396
+ return PyUnicodeWriter_WriteChar (writer , '"' );
397
+ }
398
+
399
+ buf_len = ascii_escape_unichar (c , (unsigned char * )buf , 0 );
400
+
401
+ for (i ++ ; i < input_size ; i ++ ) {
402
+ c = input [i ];
403
+ if (c <= 0x1f || c == '\\' || c == '"' ) {
404
+ buf_len = ascii_escape_unichar (c , (unsigned char * )buf , buf_len );
337
405
}
338
406
else {
339
- copy_len ++ ;
407
+ buf [buf_len ++ ] = c ;
408
+ }
409
+
410
+ if (buf_len + 6 > ESCAPE_BUF_SIZE ) {
411
+ ret = PyUnicodeWriter_WriteUTF8 (writer , buf , buf_len );
412
+ if (ret ) return ret ;
413
+ buf_len = 0 ;
340
414
}
341
415
}
342
416
343
- ret = PyUnicodeWriter_WriteSubstring ( writer , pystr , i - copy_len , i );
344
- if ( ret ) return ret ;
345
- return PyUnicodeWriter_WriteChar (writer , '"' );
417
+ assert ( buf_len < ESCAPE_BUF_SIZE );
418
+ buf [ buf_len ++ ] = '"' ;
419
+ return PyUnicodeWriter_WriteUTF8 (writer , buf , buf_len );
346
420
}
347
421
422
+ #undef ESCAPE_BUF_SIZE
423
+
348
424
static void
349
425
raise_errmsg (const char * msg , PyObject * s , Py_ssize_t end )
350
426
{
@@ -1293,17 +1369,15 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
1293
1369
s -> sort_keys = sort_keys ;
1294
1370
s -> skipkeys = skipkeys ;
1295
1371
s -> allow_nan = allow_nan ;
1296
- s -> fast_encode = false;
1297
- s -> ensure_ascii = false;
1372
+ s -> fast_encode = NULL ;
1298
1373
1299
1374
if (PyCFunction_Check (s -> encoder )) {
1300
1375
PyCFunction f = PyCFunction_GetFunction (s -> encoder );
1301
1376
if (f == py_encode_basestring_ascii ){
1302
- s -> fast_encode = true;
1303
- s -> ensure_ascii = true;
1377
+ s -> fast_encode = write_escaped_ascii ;
1304
1378
}
1305
1379
else if (f == py_encode_basestring ) {
1306
- s -> fast_encode = true ;
1380
+ s -> fast_encode = write_escaped_unicode ;
1307
1381
}
1308
1382
}
1309
1383
@@ -1497,7 +1571,7 @@ static int
1497
1571
encoder_write_string (PyEncoderObject * s , PyUnicodeWriter * writer , PyObject * obj )
1498
1572
{
1499
1573
if (s -> fast_encode ) {
1500
- return write_escaped_unicode (writer , obj , s -> ensure_ascii );
1574
+ return s -> fast_encode (writer , obj );
1501
1575
}
1502
1576
1503
1577
/* Return the JSON representation of a string */
0 commit comments