diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b9574d30..27a40bdd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,26 +14,30 @@ jobs: host: needs: ruby-versions - name: ${{ matrix.os }} ${{ matrix.ruby }} + name: ${{ matrix.os }} ${{ matrix.ruby }} ${{ matrix.env }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: - - ubuntu-latest - - macos-14 - - windows-latest + - ubuntu-latest + - macos-14 + - windows-latest ruby: ${{ fromJson(needs.ruby-versions.outputs.versions) }} + env: + - "" include: - - { os: ubuntu-24.04-arm, ruby: 3.4 } - - { os: macos-13, ruby: 3.4 } - - { os: windows-latest , ruby: mswin } # ruby/ruby windows CI - - { os: ubuntu-latest , ruby: jruby-9.4 } # Ruby 3.1 - - { os: macos-latest , ruby: truffleruby-head } - - { os: ubuntu-latest , ruby: truffleruby-head } + - { os: ubuntu-24.04-arm, ruby: 3.4 } + - { os: ubuntu-latest , ruby: 3.4, env: "JSON_DISABLE_SIMD=1" } + - { os: ubuntu-latest , ruby: 3.4, env: "JSON_DEBUG=1" } + - { os: macos-13, ruby: 3.4 } + - { os: windows-latest , ruby: mswin } # ruby/ruby windows CI + - { os: ubuntu-latest , ruby: jruby-9.4 } # Ruby 3.1 + - { os: macos-latest , ruby: truffleruby-head } + - { os: ubuntu-latest , ruby: truffleruby-head } exclude: - - { os: windows-latest, ruby: jruby } - - { os: windows-latest, ruby: jruby-head } + - { os: windows-latest, ruby: jruby } + - { os: windows-latest, ruby: jruby-head } steps: - uses: actions/checkout@v4 @@ -49,9 +53,9 @@ jobs: bundle config --without benchmark bundle install - - run: rake compile + - run: rake compile ${{ matrix.env }} - - run: rake test JSON_COMPACT=1 + - run: rake test JSON_COMPACT=1 ${{ matrix.env }} - run: rake build diff --git a/CHANGES.md b/CHANGES.md index 26fed9e4..2208bd7d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,35 @@ # Changes +### Unreleased + +* Add a new `allow_duplicate_key` parsing option. 
By default a warning is now emitted when a duplicated key is encountered. + In `json 3.0` an error will be raised. + +### 2025-05-23 (2.12.2) + +* Fix compiler optimization level. + +### 2025-05-23 (2.12.1) + +* Fix a potential crash in large negative floating point number generation. +* Fix for JSON.pretty_generate to use passed state object's generate instead of state class as the required parameters aren't available. + +### 2025-05-12 (2.12.0) + +* Improve floating point generation to not use scientific notation as much. +* Include line and column in parser errors. Both in the message and as exception attributes. +* Handle non-string hash keys with broken `to_s` implementations. +* `JSON.generate` now uses SSE2 (x86) or NEON (arm64) instructions when available to escape strings. + +### 2025-04-25 (2.11.3) + +* Fix a regression in `JSON.pretty_generate` that could cause indentation to be off once some `#to_json` has been called. + +### 2025-04-24 (2.11.2) + +* Add back `JSON::PRETTY_STATE_PROTOTYPE`. This constant was private API but is used by popular gems like `multi_json`. + It now emits a deprecation warning. + ### 2025-04-24 (2.11.1) * Add back `JSON.restore`, `JSON.unparse`, `JSON.fast_unparse` and `JSON.pretty_unparse`. diff --git a/README.md b/README.md index d327f74a..11932721 100644 --- a/README.md +++ b/README.md @@ -233,6 +233,19 @@ the `pp` library's `pp` methods. ## Development +### Prerequisites + +1. Clone the repository +2. Install dependencies with `bundle install` + +### Testing + +The full test suite can be run with: + +```bash +bundle exec rake test +``` + ### Release Update the `lib/json/version.rb` file. 
diff --git a/ext/json/ext/fbuffer/fbuffer.h b/ext/json/ext/fbuffer/fbuffer.h index b8a4e983..d3237147 100644 --- a/ext/json/ext/fbuffer/fbuffer.h +++ b/ext/json/ext/fbuffer/fbuffer.h @@ -36,6 +36,12 @@ typedef unsigned char _Bool; # define MAYBE_UNUSED(x) x #endif +#ifdef RUBY_DEBUG +#ifndef JSON_DEBUG +#define JSON_DEBUG RUBY_DEBUG +#endif +#endif + enum fbuffer_type { FBUFFER_HEAP_ALLOCATED = 0, FBUFFER_STACK_ALLOCATED = 1, @@ -46,6 +52,9 @@ typedef struct FBufferStruct { unsigned long initial_length; unsigned long len; unsigned long capa; +#ifdef JSON_DEBUG + unsigned long requested; +#endif char *ptr; VALUE io; } FBuffer; @@ -74,6 +83,20 @@ static void fbuffer_stack_init(FBuffer *fb, unsigned long initial_length, char * fb->ptr = stack_buffer; fb->capa = stack_buffer_size; } +#ifdef JSON_DEBUG + fb->requested = 0; +#endif +} + +static inline void fbuffer_consumed(FBuffer *fb, unsigned long consumed) +{ +#ifdef JSON_DEBUG + if (consumed > fb->requested) { + rb_bug("fbuffer: Out of bound write"); + } + fb->requested = 0; +#endif + fb->len += consumed; } static void fbuffer_free(FBuffer *fb) @@ -137,6 +160,10 @@ static void fbuffer_do_inc_capa(FBuffer *fb, unsigned long requested) static inline void fbuffer_inc_capa(FBuffer *fb, unsigned long requested) { +#ifdef JSON_DEBUG + fb->requested = requested; +#endif + if (RB_UNLIKELY(requested > fb->capa - fb->len)) { fbuffer_do_inc_capa(fb, requested); } @@ -147,15 +174,22 @@ static void fbuffer_append(FBuffer *fb, const char *newstr, unsigned long len) if (len > 0) { fbuffer_inc_capa(fb, len); MEMCPY(fb->ptr + fb->len, newstr, char, len); - fb->len += len; + fbuffer_consumed(fb, len); } } /* Appends a character into a buffer. The buffer needs to have sufficient capacity, via fbuffer_inc_capa(...). 
*/ static inline void fbuffer_append_reserved_char(FBuffer *fb, char chr) { +#ifdef JSON_DEBUG + if (fb->requested < 1) { + rb_bug("fbuffer: unreserved write"); + } + fb->requested--; +#endif + fb->ptr[fb->len] = chr; - fb->len += 1; + fb->len++; } static void fbuffer_append_str(FBuffer *fb, VALUE str) @@ -172,7 +206,7 @@ static inline void fbuffer_append_char(FBuffer *fb, char newchr) { fbuffer_inc_capa(fb, 1); *(fb->ptr + fb->len) = newchr; - fb->len++; + fbuffer_consumed(fb, 1); } static inline char *fbuffer_cursor(FBuffer *fb) @@ -182,7 +216,7 @@ static inline char *fbuffer_cursor(FBuffer *fb) static inline void fbuffer_advance_to(FBuffer *fb, char *end) { - fb->len = end - fb->ptr; + fbuffer_consumed(fb, (end - fb->ptr) - fb->len); } /* diff --git a/ext/json/ext/generator/depend b/ext/json/ext/generator/depend index 60697f2c..14241d7f 100644 --- a/ext/json/ext/generator/depend +++ b/ext/json/ext/generator/depend @@ -1,3 +1,4 @@ generator.o: generator.c $(srcdir)/../fbuffer/fbuffer.h generator.o: generator.c $(srcdir)/../vendor/fpconv.c generator.o: generator.c $(srcdir)/../vendor/jeaiii-ltoa.h +generator.o: generator.c $(srcdir)/../simd/simd.h diff --git a/ext/json/ext/generator/extconf.rb b/ext/json/ext/generator/extconf.rb index 078068cf..fb9afd07 100644 --- a/ext/json/ext/generator/extconf.rb +++ b/ext/json/ext/generator/extconf.rb @@ -6,5 +6,11 @@ else append_cflags("-std=c99") $defs << "-DJSON_GENERATOR" + $defs << "-DJSON_DEBUG" if ENV["JSON_DEBUG"] + + if enable_config('generator-use-simd', default=!ENV["JSON_DISABLE_SIMD"]) + load __dir__ + "/../simd/conf.rb" + end + create_makefile 'json/ext/generator' end diff --git a/ext/json/ext/generator/generator.c b/ext/json/ext/generator/generator.c index 428f5e21..01e8badc 100644 --- a/ext/json/ext/generator/generator.c +++ b/ext/json/ext/generator/generator.c @@ -5,6 +5,8 @@ #include #include +#include "../simd/simd.h" + /* ruby api and some helpers */ typedef struct JSON_Generator_StateStruct { @@ -45,7 
+47,7 @@ static VALUE sym_indent, sym_space, sym_space_before, sym_object_nl, sym_array_n struct generate_json_data; -typedef void (*generator_func)(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); +typedef void (*generator_func)(FBuffer *buffer, struct generate_json_data *data, VALUE obj); struct generate_json_data { FBuffer *buffer; @@ -57,20 +59,20 @@ struct generate_json_data { static VALUE cState_from_state_s(VALUE self, VALUE opts); static VALUE cState_partial_generate(VALUE self, VALUE obj, generator_func, VALUE io); -static void generate_json(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); -static void generate_json_object(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); -static void generate_json_array(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); -static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); -static void generate_json_null(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); -static void generate_json_false(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); -static void generate_json_true(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); +static void generate_json(FBuffer *buffer, struct generate_json_data *data, VALUE obj); +static void generate_json_object(FBuffer *buffer, struct generate_json_data *data, VALUE obj); +static void generate_json_array(FBuffer *buffer, struct generate_json_data *data, VALUE obj); +static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj); +static void generate_json_null(FBuffer *buffer, struct generate_json_data *data, VALUE obj); +static void generate_json_false(FBuffer *buffer, struct generate_json_data *data, VALUE obj); 
+static void generate_json_true(FBuffer *buffer, struct generate_json_data *data, VALUE obj); #ifdef RUBY_INTEGER_UNIFICATION -static void generate_json_integer(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); +static void generate_json_integer(FBuffer *buffer, struct generate_json_data *data, VALUE obj); #endif -static void generate_json_fixnum(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); -static void generate_json_bignum(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); -static void generate_json_float(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); -static void generate_json_fragment(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj); +static void generate_json_fixnum(FBuffer *buffer, struct generate_json_data *data, VALUE obj); +static void generate_json_bignum(FBuffer *buffer, struct generate_json_data *data, VALUE obj); +static void generate_json_float(FBuffer *buffer, struct generate_json_data *data, VALUE obj); +static void generate_json_fragment(FBuffer *buffer, struct generate_json_data *data, VALUE obj); static int usascii_encindex, utf8_encindex, binary_encindex; @@ -109,12 +111,40 @@ typedef struct _search_state { const char *end; const char *cursor; FBuffer *buffer; + +#ifdef HAVE_SIMD + const char *chunk_base; + const char *chunk_end; + bool has_matches; + +#if defined(HAVE_SIMD_NEON) + uint64_t matches_mask; +#elif defined(HAVE_SIMD_SSE2) + int matches_mask; +#else +#error "Unknown SIMD Implementation." 
+#endif /* HAVE_SIMD_NEON */ +#endif /* HAVE_SIMD */ } search_state; -static inline void search_flush(search_state *search) -{ - fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); - search->cursor = search->ptr; +#if (defined(__GNUC__ ) || defined(__clang__)) +#define FORCE_INLINE __attribute__((always_inline)) +#else +#define FORCE_INLINE +#endif + +static inline FORCE_INLINE void search_flush(search_state *search) +{ + // Do not remove this conditional without profiling, specifically escape-heavy text. + // escape_UTF8_char_basic will advance search->ptr and search->cursor (effectively a search_flush). + // For back-to-back characters that need to be escaped, specifically for the SIMD code paths, this method + // will be called just before calling escape_UTF8_char_basic. There will be no characters to append for the + // consecutive characters that need to be escaped. While the fbuffer_append is a no-op if + // nothing needs to be flushed, we can save a few memory references with this conditional. 
+ if (search->ptr > search->cursor) { + fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor); + search->cursor = search->ptr; + } } static const unsigned char escape_table_basic[256] = { @@ -130,6 +160,8 @@ static const unsigned char escape_table_basic[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; +static unsigned char (*search_escape_basic_impl)(search_state *); + static inline unsigned char search_escape_basic(search_state *search) { while (search->ptr < search->end) { @@ -144,7 +176,8 @@ static inline unsigned char search_escape_basic(search_state *search) return 0; } -static inline void escape_UTF8_char_basic(search_state *search) { +static inline FORCE_INLINE void escape_UTF8_char_basic(search_state *search) +{ const unsigned char ch = (unsigned char)*search->ptr; switch (ch) { case '"': fbuffer_append(search->buffer, "\\\"", 2); break; @@ -186,12 +219,13 @@ static inline void escape_UTF8_char_basic(search_state *search) { */ static inline void convert_UTF8_to_JSON(search_state *search) { - while (search_escape_basic(search)) { + while (search_escape_basic_impl(search)) { escape_UTF8_char_basic(search); } } -static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) { +static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) +{ const unsigned char ch = (unsigned char)*search->ptr; switch (ch_len) { case 1: { @@ -227,6 +261,228 @@ static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) search->cursor = (search->ptr += ch_len); } +#ifdef HAVE_SIMD + +static inline FORCE_INLINE char *copy_remaining_bytes(search_state *search, unsigned long vec_len, unsigned long len) +{ + // Flush the buffer so everything up until the last 'len' characters are unflushed. + search_flush(search); + + FBuffer *buf = search->buffer; + fbuffer_inc_capa(buf, vec_len); + + char *s = (buf->ptr + buf->len); + + // Pad the buffer with dummy characters that won't need escaping. 
+ // This seems wasteful at first sight, but memset of vector length is very fast. + + // Optimistically copy the remaining 'len' characters to the output FBuffer. If there are no characters + // to escape, then everything ends up in the correct spot. Otherwise it was convenient temporary storage. + MEMCPY(s, search->ptr, char, len); + + return s; +} + +#ifdef HAVE_SIMD_NEON + +static inline FORCE_INLINE unsigned char neon_next_match(search_state *search) +{ + uint64_t mask = search->matches_mask; + uint32_t index = trailing_zeros64(mask) >> 2; + + // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. + // If we want to use a similar approach for full escaping we'll need to ensure: + // search->chunk_base + index >= search->ptr + // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match + // is one byte after the previous match then: + // search->chunk_base + index == search->ptr + search->ptr = search->chunk_base + index; + mask &= mask - 1; + search->matches_mask = mask; + search_flush(search); + return 1; +} + +static inline unsigned char search_escape_basic_neon(search_state *search) +{ + if (RB_UNLIKELY(search->has_matches)) { + // There are more matches if search->matches_mask > 0. + if (search->matches_mask > 0) { + return neon_next_match(search); + } else { + // neon_next_match will only advance search->ptr up to the last matching character. + // Skip over any characters in the last chunk that occur after the last match. + search->has_matches = false; + search->ptr = search->chunk_end; + } + } + + /* + * The code below implements an SIMD-based algorithm to determine if N bytes at a time + * need to be escaped. + * + * Assume the ptr = "Te\sting!" (the double quotes are included in the string) + * + * The explanation will be limited to the first 8 bytes of the string for simplicity. 
However + * the vector instructions may work on larger vectors. + + * First, we load three constants 'lower_bound', 'backslash' and 'dblquote' in vector registers. + + * lower_bound: [20 20 20 20 20 20 20 20] + * backslash: [5C 5C 5C 5C 5C 5C 5C 5C] + * dblquote: [22 22 22 22 22 22 22 22] + + * Next we load the first chunk of the ptr: + * [22 54 65 5C 73 74 69 6E] (" T e \ s t i n) + + * First we check if any byte in chunk is less than 32 (0x20). This returns the following vector + * as no bytes are less than 32 (0x20): + * [0 0 0 0 0 0 0 0] + + * Next, we check if any byte in chunk is equal to a backslash: + * [0 0 0 FF 0 0 0 0] + + * Finally we check if any byte in chunk is equal to a double quote: + * [FF 0 0 0 0 0 0 0] + + * Now we have three vectors where each byte indicates if the corresponding byte in chunk + * needs to be escaped. We combine these vectors with a series of logical OR instructions. + * This is the needs_escape vector and it is equal to: + * [FF 0 0 FF 0 0 0 0] + + * Next we compute the bitwise AND between each byte and 0x1 and compute the horizontal sum of + * the values in the vector. This computes how many bytes need to be escaped within this chunk. + + * Finally we compute a mask that indicates which bytes need to be escaped. If the mask is 0 then, + * no bytes need to be escaped and we can continue to the next chunk. If the mask is not 0 then we + * have at least one byte that needs to be escaped. + */ + + if (string_scan_simd_neon(&search->ptr, search->end, &search->matches_mask)) { + search->has_matches = true; + search->chunk_base = search->ptr; + search->chunk_end = search->ptr + sizeof(uint8x16_t); + return neon_next_match(search); + } + + // There are fewer than 16 bytes left. 
+ unsigned long remaining = (search->end - search->ptr); + if (remaining >= SIMD_MINIMUM_THRESHOLD) { + char *s = copy_remaining_bytes(search, sizeof(uint8x16_t), remaining); + + uint64_t mask = compute_chunk_mask_neon(s); + + if (!mask) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. + fbuffer_consumed(search->buffer, remaining); + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + + search->matches_mask = mask; + search->has_matches = true; + search->chunk_end = search->end; + search->chunk_base = search->ptr; + return neon_next_match(search); + } + + if (search->ptr < search->end) { + return search_escape_basic(search); + } + + search_flush(search); + return 0; +} +#endif /* HAVE_SIMD_NEON */ + +#ifdef HAVE_SIMD_SSE2 + +static inline FORCE_INLINE unsigned char sse2_next_match(search_state *search) +{ + int mask = search->matches_mask; + int index = trailing_zeros(mask); + + // It is assumed escape_UTF8_char_basic will only ever increase search->ptr by at most one character. + // If we want to use a similar approach for full escaping we'll need to ensure: + // search->chunk_base + index >= search->ptr + // However, since we know escape_UTF8_char_basic only increases search->ptr by one, if the next match + // is one byte after the previous match then: + // search->chunk_base + index == search->ptr + search->ptr = search->chunk_base + index; + mask &= mask - 1; + search->matches_mask = mask; + search_flush(search); + return 1; +} + +#if defined(__clang__) || defined(__GNUC__) +#define TARGET_SSE2 __attribute__((target("sse2"))) +#else +#define TARGET_SSE2 +#endif + +static inline TARGET_SSE2 FORCE_INLINE unsigned char search_escape_basic_sse2(search_state *search) +{ + if (RB_UNLIKELY(search->has_matches)) { + // There are more matches if search->matches_mask > 0. 
+ if (search->matches_mask > 0) { + return sse2_next_match(search); + } else { + // sse2_next_match will only advance search->ptr up to the last matching character. + // Skip over any characters in the last chunk that occur after the last match. + search->has_matches = false; + if (RB_UNLIKELY(search->chunk_base + sizeof(__m128i) >= search->end)) { + search->ptr = search->end; + } else { + search->ptr = search->chunk_base + sizeof(__m128i); + } + } + } + + if (string_scan_simd_sse2(&search->ptr, search->end, &search->matches_mask)) { + search->has_matches = true; + search->chunk_base = search->ptr; + search->chunk_end = search->ptr + sizeof(__m128i); + return sse2_next_match(search); + } + + // There are fewer than 16 bytes left. + unsigned long remaining = (search->end - search->ptr); + if (remaining >= SIMD_MINIMUM_THRESHOLD) { + char *s = copy_remaining_bytes(search, sizeof(__m128i), remaining); + + int needs_escape_mask = compute_chunk_mask_sse2(s); + + if (needs_escape_mask == 0) { + // Nothing to escape, ensure search_flush doesn't do anything by setting + // search->cursor to search->ptr. 
+ fbuffer_consumed(search->buffer, remaining); + search->ptr = search->end; + search->cursor = search->end; + return 0; + } + + search->has_matches = true; + search->matches_mask = needs_escape_mask; + search->chunk_base = search->ptr; + return sse2_next_match(search); + } + + if (search->ptr < search->end) { + return search_escape_basic(search); + } + + search_flush(search); + return 0; +} + +#endif /* HAVE_SIMD_SSE2 */ + +#endif /* HAVE_SIMD */ + static const unsigned char script_safe_escape_table[256] = { // ASCII Control Characters 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, @@ -789,6 +1045,21 @@ struct hash_foreach_arg { int iter; }; +static VALUE +convert_string_subclass(VALUE key) +{ + VALUE key_to_s = rb_funcall(key, i_to_s, 0); + + if (RB_UNLIKELY(!RB_TYPE_P(key_to_s, T_STRING))) { + VALUE cname = rb_obj_class(key); + rb_raise(rb_eTypeError, + "can't convert %"PRIsVALUE" to %s (%"PRIsVALUE"#%s gives %"PRIsVALUE")", + cname, "String", cname, "to_s", rb_obj_class(key_to_s)); + } + + return key_to_s; +} + static int json_object_i(VALUE key, VALUE val, VALUE _arg) { @@ -802,12 +1073,12 @@ json_object_i(VALUE key, VALUE val, VALUE _arg) int j; if (arg->iter > 0) fbuffer_append_char(buffer, ','); - if (RB_UNLIKELY(state->object_nl)) { - fbuffer_append_str(buffer, state->object_nl); + if (RB_UNLIKELY(data->state->object_nl)) { + fbuffer_append_str(buffer, data->state->object_nl); } - if (RB_UNLIKELY(state->indent)) { + if (RB_UNLIKELY(data->state->indent)) { for (j = 0; j < depth; j++) { - fbuffer_append_str(buffer, state->indent); + fbuffer_append_str(buffer, data->state->indent); } } @@ -817,7 +1088,7 @@ json_object_i(VALUE key, VALUE val, VALUE _arg) if (RB_LIKELY(RBASIC_CLASS(key) == rb_cString)) { key_to_s = key; } else { - key_to_s = rb_funcall(key, i_to_s, 0); + key_to_s = convert_string_subclass(key); } break; case T_SYMBOL: @@ -829,21 +1100,22 @@ json_object_i(VALUE key, VALUE val, VALUE _arg) } if (RB_LIKELY(RBASIC_CLASS(key_to_s) == rb_cString)) { 
- generate_json_string(buffer, data, state, key_to_s); + generate_json_string(buffer, data, key_to_s); } else { - generate_json(buffer, data, state, key_to_s); + generate_json(buffer, data, key_to_s); } - if (RB_UNLIKELY(state->space_before)) fbuffer_append_str(buffer, state->space_before); + if (RB_UNLIKELY(state->space_before)) fbuffer_append_str(buffer, data->state->space_before); fbuffer_append_char(buffer, ':'); - if (RB_UNLIKELY(state->space)) fbuffer_append_str(buffer, state->space); - generate_json(buffer, data, state, val); + if (RB_UNLIKELY(state->space)) fbuffer_append_str(buffer, data->state->space); + generate_json(buffer, data, val); arg->iter++; return ST_CONTINUE; } -static inline long increase_depth(JSON_Generator_State *state) +static inline long increase_depth(struct generate_json_data *data) { + JSON_Generator_State *state = data->state; long depth = ++state->depth; if (RB_UNLIKELY(depth > state->max_nesting && state->max_nesting)) { rb_raise(eNestingError, "nesting of %ld is too deep", --state->depth); @@ -851,14 +1123,14 @@ static inline long increase_depth(JSON_Generator_State *state) return depth; } -static void generate_json_object(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_object(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { int j; - long depth = increase_depth(state); + long depth = increase_depth(data); if (RHASH_SIZE(obj) == 0) { fbuffer_append(buffer, "{}", 2); - --state->depth; + --data->state->depth; return; } @@ -870,49 +1142,49 @@ static void generate_json_object(FBuffer *buffer, struct generate_json_data *dat }; rb_hash_foreach(obj, json_object_i, (VALUE)&arg); - depth = --state->depth; - if (RB_UNLIKELY(state->object_nl)) { - fbuffer_append_str(buffer, state->object_nl); - if (RB_UNLIKELY(state->indent)) { + depth = --data->state->depth; + if (RB_UNLIKELY(data->state->object_nl)) { + fbuffer_append_str(buffer, data->state->object_nl); + if 
(RB_UNLIKELY(data->state->indent)) { for (j = 0; j < depth; j++) { - fbuffer_append_str(buffer, state->indent); + fbuffer_append_str(buffer, data->state->indent); } } } fbuffer_append_char(buffer, '}'); } -static void generate_json_array(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_array(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { int i, j; - long depth = increase_depth(state); + long depth = increase_depth(data); if (RARRAY_LEN(obj) == 0) { fbuffer_append(buffer, "[]", 2); - --state->depth; + --data->state->depth; return; } fbuffer_append_char(buffer, '['); - if (RB_UNLIKELY(state->array_nl)) fbuffer_append_str(buffer, state->array_nl); + if (RB_UNLIKELY(data->state->array_nl)) fbuffer_append_str(buffer, data->state->array_nl); for(i = 0; i < RARRAY_LEN(obj); i++) { if (i > 0) { fbuffer_append_char(buffer, ','); - if (RB_UNLIKELY(state->array_nl)) fbuffer_append_str(buffer, state->array_nl); + if (RB_UNLIKELY(data->state->array_nl)) fbuffer_append_str(buffer, data->state->array_nl); } - if (RB_UNLIKELY(state->indent)) { + if (RB_UNLIKELY(data->state->indent)) { for (j = 0; j < depth; j++) { - fbuffer_append_str(buffer, state->indent); + fbuffer_append_str(buffer, data->state->indent); } } - generate_json(buffer, data, state, RARRAY_AREF(obj, i)); + generate_json(buffer, data, RARRAY_AREF(obj, i)); } - state->depth = --depth; - if (RB_UNLIKELY(state->array_nl)) { - fbuffer_append_str(buffer, state->array_nl); - if (RB_UNLIKELY(state->indent)) { + data->state->depth = --depth; + if (RB_UNLIKELY(data->state->array_nl)) { + fbuffer_append_str(buffer, data->state->array_nl); + if (RB_UNLIKELY(data->state->indent)) { for (j = 0; j < depth; j++) { - fbuffer_append_str(buffer, state->indent); + fbuffer_append_str(buffer, data->state->indent); } } } @@ -961,7 +1233,7 @@ static inline VALUE ensure_valid_encoding(VALUE str) return str; } -static void generate_json_string(FBuffer 
*buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_string(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { obj = ensure_valid_encoding(obj); @@ -974,12 +1246,18 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat search.cursor = search.ptr; search.end = search.ptr + len; +#ifdef HAVE_SIMD + search.matches_mask = 0; + search.has_matches = false; + search.chunk_base = NULL; +#endif /* HAVE_SIMD */ + switch(rb_enc_str_coderange(obj)) { case ENC_CODERANGE_7BIT: case ENC_CODERANGE_VALID: - if (RB_UNLIKELY(state->ascii_only)) { - convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table); - } else if (RB_UNLIKELY(state->script_safe)) { + if (RB_UNLIKELY(data->state->ascii_only)) { + convert_UTF8_to_ASCII_only_JSON(&search, data->state->script_safe ? script_safe_escape_table : ascii_only_escape_table); + } else if (RB_UNLIKELY(data->state->script_safe)) { convert_UTF8_to_script_safe_JSON(&search); } else { convert_UTF8_to_JSON(&search); @@ -992,7 +1270,7 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat fbuffer_append_char(buffer, '"'); } -static void generate_json_fallback(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_fallback(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { VALUE tmp; if (rb_respond_to(obj, i_to_json)) { @@ -1002,68 +1280,68 @@ static void generate_json_fallback(FBuffer *buffer, struct generate_json_data *d } else { tmp = rb_funcall(obj, i_to_s, 0); Check_Type(tmp, T_STRING); - generate_json_string(buffer, data, state, tmp); + generate_json_string(buffer, data, tmp); } } -static inline void generate_json_symbol(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static inline void generate_json_symbol(FBuffer *buffer, struct 
generate_json_data *data, VALUE obj) { - if (state->strict) { - generate_json_string(buffer, data, state, rb_sym2str(obj)); + if (data->state->strict) { + generate_json_string(buffer, data, rb_sym2str(obj)); } else { - generate_json_fallback(buffer, data, state, obj); + generate_json_fallback(buffer, data, obj); } } -static void generate_json_null(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_null(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { fbuffer_append(buffer, "null", 4); } -static void generate_json_false(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_false(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { fbuffer_append(buffer, "false", 5); } -static void generate_json_true(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_true(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { fbuffer_append(buffer, "true", 4); } -static void generate_json_fixnum(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_fixnum(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { fbuffer_append_long(buffer, FIX2LONG(obj)); } -static void generate_json_bignum(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_bignum(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { VALUE tmp = rb_funcall(obj, i_to_s, 0); fbuffer_append_str(buffer, tmp); } #ifdef RUBY_INTEGER_UNIFICATION -static void generate_json_integer(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_integer(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { if (FIXNUM_P(obj)) - generate_json_fixnum(buffer, data, state, obj); + 
generate_json_fixnum(buffer, data, obj); else - generate_json_bignum(buffer, data, state, obj); + generate_json_bignum(buffer, data, obj); } #endif -static void generate_json_float(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_float(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { double value = RFLOAT_VALUE(obj); - char allow_nan = state->allow_nan; + char allow_nan = data->state->allow_nan; if (isinf(value) || isnan(value)) { /* for NaN and Infinity values we either raise an error or rely on Float#to_s. */ if (!allow_nan) { - if (state->strict && state->as_json) { - VALUE casted_obj = rb_proc_call_with_block(state->as_json, 1, &obj, Qnil); + if (data->state->strict && data->state->as_json) { + VALUE casted_obj = rb_proc_call_with_block(data->state->as_json, 1, &obj, Qnil); if (casted_obj != obj) { - increase_depth(state); - generate_json(buffer, data, state, casted_obj); - state->depth--; + increase_depth(data); + generate_json(buffer, data, casted_obj); + data->state->depth--; return; } } @@ -1076,43 +1354,42 @@ static void generate_json_float(FBuffer *buffer, struct generate_json_data *data } /* This implementation writes directly into the buffer. We reserve - * the 24 characters that fpconv_dtoa states as its maximum, plus - * 2 more characters for the potential ".0" suffix. + * the 28 characters that fpconv_dtoa states as its maximum. */ - fbuffer_inc_capa(buffer, 26); + fbuffer_inc_capa(buffer, 28); char* d = buffer->ptr + buffer->len; int len = fpconv_dtoa(value, d); /* fpconv_dtoa converts a float to its shortest string representation, * but it adds a ".0" if this is a plain integer. 
*/ - buffer->len += len; + fbuffer_consumed(buffer, len); } -static void generate_json_fragment(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json_fragment(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { VALUE fragment = RSTRUCT_GET(obj, 0); Check_Type(fragment, T_STRING); fbuffer_append_str(buffer, fragment); } -static void generate_json(FBuffer *buffer, struct generate_json_data *data, JSON_Generator_State *state, VALUE obj) +static void generate_json(FBuffer *buffer, struct generate_json_data *data, VALUE obj) { bool as_json_called = false; start: if (obj == Qnil) { - generate_json_null(buffer, data, state, obj); + generate_json_null(buffer, data, obj); } else if (obj == Qfalse) { - generate_json_false(buffer, data, state, obj); + generate_json_false(buffer, data, obj); } else if (obj == Qtrue) { - generate_json_true(buffer, data, state, obj); + generate_json_true(buffer, data, obj); } else if (RB_SPECIAL_CONST_P(obj)) { if (RB_FIXNUM_P(obj)) { - generate_json_fixnum(buffer, data, state, obj); + generate_json_fixnum(buffer, data, obj); } else if (RB_FLONUM_P(obj)) { - generate_json_float(buffer, data, state, obj); + generate_json_float(buffer, data, obj); } else if (RB_STATIC_SYM_P(obj)) { - generate_json_symbol(buffer, data, state, obj); + generate_json_symbol(buffer, data, obj); } else { goto general; } @@ -1120,43 +1397,43 @@ static void generate_json(FBuffer *buffer, struct generate_json_data *data, JSON VALUE klass = RBASIC_CLASS(obj); switch (RB_BUILTIN_TYPE(obj)) { case T_BIGNUM: - generate_json_bignum(buffer, data, state, obj); + generate_json_bignum(buffer, data, obj); break; case T_HASH: if (klass != rb_cHash) goto general; - generate_json_object(buffer, data, state, obj); + generate_json_object(buffer, data, obj); break; case T_ARRAY: if (klass != rb_cArray) goto general; - generate_json_array(buffer, data, state, obj); + generate_json_array(buffer, data, obj); break; 
case T_STRING: if (klass != rb_cString) goto general; - generate_json_string(buffer, data, state, obj); + generate_json_string(buffer, data, obj); break; case T_SYMBOL: - generate_json_symbol(buffer, data, state, obj); + generate_json_symbol(buffer, data, obj); break; case T_FLOAT: if (klass != rb_cFloat) goto general; - generate_json_float(buffer, data, state, obj); + generate_json_float(buffer, data, obj); break; case T_STRUCT: if (klass != cFragment) goto general; - generate_json_fragment(buffer, data, state, obj); + generate_json_fragment(buffer, data, obj); break; default: general: - if (state->strict) { - if (RTEST(state->as_json) && !as_json_called) { - obj = rb_proc_call_with_block(state->as_json, 1, &obj, Qnil); + if (data->state->strict) { + if (RTEST(data->state->as_json) && !as_json_called) { + obj = rb_proc_call_with_block(data->state->as_json, 1, &obj, Qnil); as_json_called = true; goto start; } else { raise_generator_error(obj, "%"PRIsVALUE" not allowed in JSON", CLASS_OF(obj)); } } else { - generate_json_fallback(buffer, data, state, obj); + generate_json_fallback(buffer, data, obj); } } } @@ -1166,7 +1443,7 @@ static VALUE generate_json_try(VALUE d) { struct generate_json_data *data = (struct generate_json_data *)d; - data->func(data->buffer, data, data->state, data->obj); + data->func(data->buffer, data, data->obj); return Qnil; } @@ -1837,4 +2114,23 @@ void Init_generator(void) binary_encindex = rb_ascii8bit_encindex(); rb_require("json/ext/generator/state"); + + + switch(find_simd_implementation()) { +#ifdef HAVE_SIMD +#ifdef HAVE_SIMD_NEON + case SIMD_NEON: + search_escape_basic_impl = search_escape_basic_neon; + break; +#endif /* HAVE_SIMD_NEON */ +#ifdef HAVE_SIMD_SSE2 + case SIMD_SSE2: + search_escape_basic_impl = search_escape_basic_sse2; + break; +#endif /* HAVE_SIMD_SSE2 */ +#endif /* HAVE_SIMD */ + default: + search_escape_basic_impl = search_escape_basic; + break; + } } diff --git a/ext/json/ext/parser/depend 
b/ext/json/ext/parser/depend index c051a244..a1926b7d 100644 --- a/ext/json/ext/parser/depend +++ b/ext/json/ext/parser/depend @@ -1 +1,2 @@ parser.o: parser.c $(srcdir)/../fbuffer/fbuffer.h +parser.o: parser.c $(srcdir)/../simd/simd.h diff --git a/ext/json/ext/parser/extconf.rb b/ext/json/ext/parser/extconf.rb index 09c96377..de5d5758 100644 --- a/ext/json/ext/parser/extconf.rb +++ b/ext/json/ext/parser/extconf.rb @@ -1,11 +1,15 @@ # frozen_string_literal: true require 'mkmf' -have_func("rb_enc_interned_str", "ruby.h") # RUBY_VERSION >= 3.0 +have_func("rb_enc_interned_str", "ruby/encoding.h") # RUBY_VERSION >= 3.0 have_func("rb_hash_new_capa", "ruby.h") # RUBY_VERSION >= 3.2 have_func("rb_hash_bulk_insert", "ruby.h") # Missing on TruffleRuby have_func("strnlen", "string.h") # Missing on Solaris 10 append_cflags("-std=c99") +if enable_config('parser-use-simd', default=!ENV["JSON_DISABLE_SIMD"]) + load __dir__ + "/../simd/conf.rb" +end + create_makefile 'json/ext/parser' diff --git a/ext/json/ext/parser/parser.c b/ext/json/ext/parser/parser.c index f20769a3..9bf24703 100644 --- a/ext/json/ext/parser/parser.c +++ b/ext/json/ext/parser/parser.c @@ -20,6 +20,8 @@ typedef unsigned char _Bool; #endif #endif +#include "../simd/simd.h" + #ifndef RB_UNLIKELY #define RB_UNLIKELY(expr) expr #endif @@ -35,7 +37,7 @@ static ID i_chr, i_aset, i_aref, i_leftshift, i_new, i_try_convert, i_uminus, i_encode; static VALUE sym_max_nesting, sym_allow_nan, sym_allow_trailing_comma, sym_symbolize_names, sym_freeze, - sym_decimal_class, sym_on_load; + sym_decimal_class, sym_on_load, sym_allow_duplicate_key; static int binary_encindex; static int utf8_encindex; @@ -337,73 +339,6 @@ static size_t strnlen(const char *s, size_t maxlen) } #endif -#define PARSE_ERROR_FRAGMENT_LEN 32 -#ifdef RBIMPL_ATTR_NORETURN -RBIMPL_ATTR_NORETURN() -#endif -static void raise_parse_error(const char *format, const char *start) -{ - unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 1]; - - size_t len = start ? 
strnlen(start, PARSE_ERROR_FRAGMENT_LEN) : 0; - const char *ptr = start; - - if (len == PARSE_ERROR_FRAGMENT_LEN) { - MEMCPY(buffer, start, char, PARSE_ERROR_FRAGMENT_LEN); - - while (buffer[len - 1] >= 0x80 && buffer[len - 1] < 0xC0) { // Is continuation byte - len--; - } - - if (buffer[len - 1] >= 0xC0) { // multibyte character start - len--; - } - - buffer[len] = '\0'; - ptr = (const char *)buffer; - } - - rb_enc_raise(enc_utf8, rb_path2class("JSON::ParserError"), format, ptr); -} - -/* unicode */ - -static const signed char digit_values[256] = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, - -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1 -}; - -static uint32_t unescape_unicode(const unsigned char *p) -{ - signed char b; - uint32_t result = 0; - b = digit_values[p[0]]; - if (b < 0) raise_parse_error("incomplete unicode character escape sequence at '%s'", (char *)p - 2); - result = (result << 4) | (unsigned char)b; - b = digit_values[p[1]]; - if (b < 0) raise_parse_error("incomplete unicode character escape sequence at '%s'", (char 
*)p - 2); - result = (result << 4) | (unsigned char)b; - b = digit_values[p[2]]; - if (b < 0) raise_parse_error("incomplete unicode character escape sequence at '%s'", (char *)p - 2); - result = (result << 4) | (unsigned char)b; - b = digit_values[p[3]]; - if (b < 0) raise_parse_error("incomplete unicode character escape sequence at '%s'", (char *)p - 2); - result = (result << 4) | (unsigned char)b; - return result; -} - static int convert_UTF32_to_UTF8(char *buf, uint32_t ch) { int len = 1; @@ -430,10 +365,17 @@ static int convert_UTF32_to_UTF8(char *buf, uint32_t ch) return len; } +enum duplicate_key_action { + JSON_DEPRECATED = 0, + JSON_IGNORE, + JSON_RAISE, +}; + typedef struct JSON_ParserStruct { VALUE on_load_proc; VALUE decimal_class; ID decimal_method_id; + enum duplicate_key_action on_duplicate_key; int max_nesting; bool allow_nan; bool allow_trailing_comma; @@ -444,6 +386,7 @@ typedef struct JSON_ParserStruct { typedef struct JSON_ParserStateStruct { VALUE stack_handle; + const char *start; const char *cursor; const char *end; rvalue_stack *stack; @@ -452,6 +395,133 @@ typedef struct JSON_ParserStateStruct { int current_nesting; } JSON_ParserState; +static void cursor_position(JSON_ParserState *state, long *line_out, long *column_out) +{ + const char *cursor = state->cursor; + long column = 0; + long line = 1; + + while (cursor >= state->start) { + if (*cursor-- == '\n') { + break; + } + column++; + } + + while (cursor >= state->start) { + if (*cursor-- == '\n') { + line++; + } + } + *line_out = line; + *column_out = column; +} + +static void emit_parse_warning(const char *message, JSON_ParserState *state) +{ + long line, column; + cursor_position(state, &line, &column); + + rb_warn("%s at line %ld column %ld", message, line, column); +} + +#define PARSE_ERROR_FRAGMENT_LEN 32 +#ifdef RBIMPL_ATTR_NORETURN +RBIMPL_ATTR_NORETURN() +#endif +static void raise_parse_error(const char *format, JSON_ParserState *state) +{ + unsigned char 
buffer[PARSE_ERROR_FRAGMENT_LEN + 3]; + long line, column; + cursor_position(state, &line, &column); + + const char *ptr = "EOF"; + if (state->cursor && state->cursor < state->end) { + ptr = state->cursor; + size_t len = 0; + while (len < PARSE_ERROR_FRAGMENT_LEN) { + char ch = ptr[len]; + if (!ch || ch == '\n' || ch == ' ' || ch == '\t' || ch == '\r') { + break; + } + len++; + } + + if (len) { + buffer[0] = '\''; + MEMCPY(buffer + 1, ptr, char, len); + + while (buffer[len] >= 0x80 && buffer[len] < 0xC0) { // Is continuation byte + len--; + } + + if (buffer[len] >= 0xC0) { // multibyte character start + len--; + } + + buffer[len + 1] = '\''; + buffer[len + 2] = '\0'; + ptr = (const char *)buffer; + } + } + + VALUE msg = rb_sprintf(format, ptr); + VALUE message = rb_enc_sprintf(enc_utf8, "%s at line %ld column %ld", RSTRING_PTR(msg), line, column); + RB_GC_GUARD(msg); + + VALUE exc = rb_exc_new_str(rb_path2class("JSON::ParserError"), message); + rb_ivar_set(exc, rb_intern("@line"), LONG2NUM(line)); + rb_ivar_set(exc, rb_intern("@column"), LONG2NUM(column)); + rb_exc_raise(exc); +} + +#ifdef RBIMPL_ATTR_NORETURN +RBIMPL_ATTR_NORETURN() +#endif +static void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at) +{ + state->cursor = at; + raise_parse_error(format, state); +} + +/* unicode */ + +static const signed char digit_values[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, + -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 
-1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1 +}; + +static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p) +{ + signed char b; + uint32_t result = 0; + b = digit_values[p[0]]; + if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2); + result = (result << 4) | (unsigned char)b; + b = digit_values[p[1]]; + if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2); + result = (result << 4) | (unsigned char)b; + b = digit_values[p[2]]; + if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2); + result = (result << 4) | (unsigned char)b; + b = digit_values[p[3]]; + if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2); + result = (result << 4) | (unsigned char)b; + return result; +} + #define GET_PARSER_CONFIG \ JSON_ParserConfig *config; \ TypedData_Get_Struct(self, JSON_ParserConfig, &JSON_ParserConfig_type, config) @@ -485,8 +555,7 @@ json_eat_comments(JSON_ParserState *state) while (true) { state->cursor = memchr(state->cursor, '*', state->end - state->cursor); if (!state->cursor) { - state->cursor = state->end; - raise_parse_error("unexpected end of input, expected closing '*/'", state->cursor); + raise_parse_error_at("unexpected end of input, expected closing '*/'", state, state->end); } else { state->cursor++; if (state->cursor < state->end && *state->cursor == '/') { @@ -498,11 +567,11 @@ json_eat_comments(JSON_ParserState *state) break; } default: - 
raise_parse_error("unexpected token at '%s'", state->cursor); + raise_parse_error("unexpected token %s", state); break; } } else { - raise_parse_error("unexpected token at '%s'", state->cursor); + raise_parse_error("unexpected token %s", state); } } @@ -621,9 +690,9 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c break; case 'u': if (pe > stringEnd - 5) { - raise_parse_error("incomplete unicode character escape sequence at '%s'", p); + raise_parse_error_at("incomplete unicode character escape sequence at %s", state, p); } else { - uint32_t ch = unescape_unicode((unsigned char *) ++pe); + uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe); pe += 3; /* To handle values above U+FFFF, we take a sequence of * \uXXXX escapes in the U+D800..U+DBFF then @@ -638,10 +707,10 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c if ((ch & 0xFC00) == 0xD800) { pe++; if (pe > stringEnd - 6) { - raise_parse_error("incomplete surrogate pair at '%s'", p); + raise_parse_error_at("incomplete surrogate pair at %s", state, p); } if (pe[0] == '\\' && pe[1] == 'u') { - uint32_t sur = unescape_unicode((unsigned char *) pe + 2); + uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2); ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) | (sur & 0x3FF)); pe += 5; @@ -761,11 +830,25 @@ static inline VALUE json_decode_array(JSON_ParserState *state, JSON_ParserConfig return array; } -static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, long count) +static inline VALUE json_decode_object(JSON_ParserState *state, JSON_ParserConfig *config, size_t count) { - VALUE object = rb_hash_new_capa(count); + size_t entries_count = count / 2; + VALUE object = rb_hash_new_capa(entries_count); rb_hash_bulk_insert(count, rvalue_stack_peek(state->stack, count), object); + if (RB_UNLIKELY(RHASH_SIZE(object) < entries_count)) { + switch (config->on_duplicate_key) { + case 
JSON_IGNORE: + break; + case JSON_DEPRECATED: + emit_parse_warning("detected duplicate keys in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`", state); + break; + case JSON_RAISE: + raise_parse_error("duplicate key", state); + break; + } + } + rvalue_stack_pop(state->stack, count); if (config->freeze) { @@ -798,7 +881,7 @@ static inline VALUE json_push_value(JSON_ParserState *state, JSON_ParserConfig * return value; } -static const bool string_scan[256] = { +static const bool string_scan_table[256] = { // ASCII Control Characters 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -811,38 +894,77 @@ static const bool string_scan[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; +#if (defined(__GNUC__ ) || defined(__clang__)) +#define FORCE_INLINE __attribute__((always_inline)) +#else +#define FORCE_INLINE +#endif + +#ifdef HAVE_SIMD +static SIMD_Implementation simd_impl = SIMD_NONE; +#endif /* HAVE_SIMD */ + +static inline bool FORCE_INLINE string_scan(JSON_ParserState *state) +{ +#ifdef HAVE_SIMD +#if defined(HAVE_SIMD_NEON) + + uint64_t mask = 0; + if (string_scan_simd_neon(&state->cursor, state->end, &mask)) { + state->cursor += trailing_zeros64(mask) >> 2; + return 1; + } + +#elif defined(HAVE_SIMD_SSE2) + if (simd_impl == SIMD_SSE2) { + int mask = 0; + if (string_scan_simd_sse2(&state->cursor, state->end, &mask)) { + state->cursor += trailing_zeros(mask); + return 1; + } + } +#endif /* HAVE_SIMD_NEON or HAVE_SIMD_SSE2 */ +#endif /* HAVE_SIMD */ + + while (state->cursor < state->end) { + if (RB_UNLIKELY(string_scan_table[(unsigned char)*state->cursor])) { + return 1; + } + *state->cursor++; + } + return 0; +} + static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig *config, bool is_name) { state->cursor++; const char *start = state->cursor; bool escaped = false; - while (state->cursor < state->end) { - if 
(RB_UNLIKELY(string_scan[(unsigned char)*state->cursor])) { - switch (*state->cursor) { - case '"': { - VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name); - state->cursor++; - return json_push_value(state, config, string); - } - case '\\': { - state->cursor++; - escaped = true; - if ((unsigned char)*state->cursor < 0x20) { - raise_parse_error("invalid ASCII control character in string: %s", state->cursor); - } - break; + while (RB_UNLIKELY(string_scan(state))) { + switch (*state->cursor) { + case '"': { + VALUE string = json_decode_string(state, config, start, state->cursor, escaped, is_name); + state->cursor++; + return json_push_value(state, config, string); + } + case '\\': { + state->cursor++; + escaped = true; + if ((unsigned char)*state->cursor < 0x20) { + raise_parse_error("invalid ASCII control character in string: %s", state); } - default: - raise_parse_error("invalid ASCII control character in string: %s", state->cursor); - break; + break; } + default: + raise_parse_error("invalid ASCII control character in string: %s", state); + break; } state->cursor++; } - raise_parse_error("unexpected end of input, expected closing \"", state->cursor); + raise_parse_error("unexpected end of input, expected closing \"", state); return Qfalse; } @@ -850,7 +972,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) { json_eat_whitespace(state); if (state->cursor >= state->end) { - raise_parse_error("unexpected end of input", state->cursor); + raise_parse_error("unexpected end of input", state); } switch (*state->cursor) { @@ -860,7 +982,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) return json_push_value(state, config, Qnil); } - raise_parse_error("unexpected token at '%s'", state->cursor); + raise_parse_error("unexpected token %s", state); break; case 't': if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) { @@ -868,7 +990,7 @@ 
static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) return json_push_value(state, config, Qtrue); } - raise_parse_error("unexpected token at '%s'", state->cursor); + raise_parse_error("unexpected token %s", state); break; case 'f': // Note: memcmp with a small power of two compile to an integer comparison @@ -877,7 +999,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) return json_push_value(state, config, Qfalse); } - raise_parse_error("unexpected token at '%s'", state->cursor); + raise_parse_error("unexpected token %s", state); break; case 'N': // Note: memcmp with a small power of two compile to an integer comparison @@ -886,7 +1008,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) return json_push_value(state, config, CNaN); } - raise_parse_error("unexpected token at '%s'", state->cursor); + raise_parse_error("unexpected token %s", state); break; case 'I': if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) { @@ -894,7 +1016,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) return json_push_value(state, config, CInfinity); } - raise_parse_error("unexpected token at '%s'", state->cursor); + raise_parse_error("unexpected token %s", state); break; case '-': // Note: memcmp with a small power of two compile to an integer comparison @@ -903,7 +1025,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->cursor += 9; return json_push_value(state, config, CMinusInfinity); } else { - raise_parse_error("unexpected token at '%s'", state->cursor); + raise_parse_error("unexpected token %s", state); } } // Fallthrough @@ -921,11 +1043,11 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) long integer_length = state->cursor - start; if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) { - raise_parse_error("invalid 
number: %s", start); + raise_parse_error_at("invalid number: %s", state, start); } else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) { - raise_parse_error("invalid number: %s", start); + raise_parse_error_at("invalid number: %s", state, start); } else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) { - raise_parse_error("invalid number: %s", start); + raise_parse_error_at("invalid number: %s", state, start); } if ((state->cursor < state->end) && (*state->cursor == '.')) { @@ -933,7 +1055,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) state->cursor++; if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') { - raise_parse_error("invalid number: %s", state->cursor); + raise_parse_error("invalid number: %s", state); } while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) { @@ -949,7 +1071,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') { - raise_parse_error("invalid number: %s", state->cursor); + raise_parse_error("invalid number: %s", state); } while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) { @@ -1009,11 +1131,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } } - raise_parse_error("expected ',' or ']' after array value", state->cursor); + raise_parse_error("expected ',' or ']' after array value", state); } break; } case '{': { + const char *object_start_cursor = state->cursor; + state->cursor++; json_eat_whitespace(state); long stack_head = state->stack->head; @@ -1028,13 +1152,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } if (*state->cursor != '"') { - raise_parse_error("expected object key, got '%s", state->cursor); + raise_parse_error("expected object key, got %s", state); } 
json_parse_string(state, config, true); json_eat_whitespace(state); if ((state->cursor >= state->end) || (*state->cursor != ':')) { - raise_parse_error("expected ':' after object key", state->cursor); + raise_parse_error("expected ':' after object key", state); } state->cursor++; @@ -1048,8 +1172,15 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) if (*state->cursor == '}') { state->cursor++; state->current_nesting--; - long count = state->stack->head - stack_head; - return json_push_value(state, config, json_decode_object(state, config, count)); + size_t count = state->stack->head - stack_head; + + // Temporary rewind cursor in case an error is raised + const char *final_cursor = state->cursor; + state->cursor = object_start_cursor; + VALUE object = json_decode_object(state, config, count); + state->cursor = final_cursor; + + return json_push_value(state, config, object); } if (*state->cursor == ',') { @@ -1063,13 +1194,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } if (*state->cursor != '"') { - raise_parse_error("expected object key, got: '%s'", state->cursor); + raise_parse_error("expected object key, got: %s", state); } json_parse_string(state, config, true); json_eat_whitespace(state); if ((state->cursor >= state->end) || (*state->cursor != ':')) { - raise_parse_error("expected ':' after object key, got: '%s", state->cursor); + raise_parse_error("expected ':' after object key, got: %s", state); } state->cursor++; @@ -1079,24 +1210,24 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config) } } - raise_parse_error("expected ',' or '}' after object value, got: '%s'", state->cursor); + raise_parse_error("expected ',' or '}' after object value, got: %s", state); } break; } default: - raise_parse_error("unexpected character: '%s'", state->cursor); + raise_parse_error("unexpected character: %s", state); break; } - raise_parse_error("unreacheable: '%s'", 
state->cursor); + raise_parse_error("unreacheable: %s", state); } static void json_ensure_eof(JSON_ParserState *state) { json_eat_whitespace(state); if (state->cursor != state->end) { - raise_parse_error("unexpected token at end of stream '%s'", state->cursor); + raise_parse_error("unexpected token at end of stream %s", state); } } @@ -1138,6 +1269,7 @@ static int parser_config_init_i(VALUE key, VALUE val, VALUE data) else if (key == sym_symbolize_names) { config->symbolize_names = RTEST(val); } else if (key == sym_freeze) { config->freeze = RTEST(val); } else if (key == sym_on_load) { config->on_load_proc = RTEST(val) ? val : Qfalse; } + else if (key == sym_allow_duplicate_key) { config->on_duplicate_key = RTEST(val) ? JSON_IGNORE : JSON_RAISE; } else if (key == sym_decimal_class) { if (RTEST(val)) { if (rb_respond_to(val, i_try_convert)) { @@ -1232,9 +1364,14 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE Vsource) .capa = RVALUE_STACK_INITIAL_CAPA, }; + long len; + const char *start; + RSTRING_GETMEM(Vsource, start, len); + JSON_ParserState _state = { - .cursor = RSTRING_PTR(Vsource), - .end = RSTRING_END(Vsource), + .start = start, + .cursor = start, + .end = start + len, .stack = &stack, }; JSON_ParserState *state = &_state; @@ -1349,6 +1486,7 @@ void Init_parser(void) sym_freeze = ID2SYM(rb_intern("freeze")); sym_on_load = ID2SYM(rb_intern("on_load")); sym_decimal_class = ID2SYM(rb_intern("decimal_class")); + sym_allow_duplicate_key = ID2SYM(rb_intern("allow_duplicate_key")); i_chr = rb_intern("chr"); i_aset = rb_intern("[]="); @@ -1362,4 +1500,8 @@ void Init_parser(void) binary_encindex = rb_ascii8bit_encindex(); utf8_encindex = rb_utf8_encindex(); enc_utf8 = rb_utf8_encoding(); + +#ifdef HAVE_SIMD + simd_impl = find_simd_implementation(); +#endif } diff --git a/ext/json/ext/simd/conf.rb b/ext/json/ext/simd/conf.rb new file mode 100644 index 00000000..8e7d8ee2 --- /dev/null +++ b/ext/json/ext/simd/conf.rb @@ -0,0 +1,20 @@ +case 
RbConfig::CONFIG['host_cpu'] +when /^(arm|aarch64)/ + # Try to compile a small program using NEON instructions + header, type, init = 'arm_neon.h', 'uint8x16_t', 'vdupq_n_u8(32)' +when /^(x86_64|x64)/ + header, type, init = 'x86intrin.h', '__m128i', '_mm_set1_epi8(32)' +end +if header + have_header(header) && try_compile(<<~SRC) + #{cpp_include(header)} + int main(int argc, char **argv) { + #{type} test = #{init}; + if (argc > 100000) printf("%p", &test); + return 0; + } + SRC + $defs.push("-DJSON_ENABLE_SIMD") +end + +have_header('cpuid.h') diff --git a/ext/json/ext/simd/simd.h b/ext/json/ext/simd/simd.h new file mode 100644 index 00000000..e0cf4754 --- /dev/null +++ b/ext/json/ext/simd/simd.h @@ -0,0 +1,182 @@ +typedef enum { + SIMD_NONE, + SIMD_NEON, + SIMD_SSE2 +} SIMD_Implementation; + +#ifdef JSON_ENABLE_SIMD + +#ifdef __clang__ + #if __has_builtin(__builtin_ctzll) + #define HAVE_BUILTIN_CTZLL 1 + #else + #define HAVE_BUILTIN_CTZLL 0 + #endif +#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + #define HAVE_BUILTIN_CTZLL 1 +#else + #define HAVE_BUILTIN_CTZLL 0 +#endif + +static inline uint32_t trailing_zeros64(uint64_t input) { +#if HAVE_BUILTIN_CTZLL + return __builtin_ctzll(input); +#else + uint32_t trailing_zeros = 0; + uint64_t temp = input; + while ((temp & 1) == 0 && temp > 0) { + trailing_zeros++; + temp >>= 1; + } + return trailing_zeros; +#endif +} + +static inline int trailing_zeros(int input) { + #if HAVE_BUILTIN_CTZLL + return __builtin_ctz(input); + #else + int trailing_zeros = 0; + int temp = input; + while ((temp & 1) == 0 && temp > 0) { + trailing_zeros++; + temp >>= 1; + } + return trailing_zeros; + #endif +} + +#if (defined(__GNUC__ ) || defined(__clang__)) +#define FORCE_INLINE __attribute__((always_inline)) +#else +#define FORCE_INLINE +#endif + + +#define SIMD_MINIMUM_THRESHOLD 6 + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64) +#include + +#define 
FIND_SIMD_IMPLEMENTATION_DEFINED 1 +static inline SIMD_Implementation find_simd_implementation(void) { + return SIMD_NEON; +} + +#define HAVE_SIMD 1 +#define HAVE_SIMD_NEON 1 + +// See: https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon +static inline FORCE_INLINE uint64_t neon_match_mask(uint8x16_t matches) +{ + const uint8x8_t res = vshrn_n_u16(vreinterpretq_u16_u8(matches), 4); + const uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(res), 0); + return mask & 0x8888888888888888ull; +} + +static inline FORCE_INLINE uint64_t compute_chunk_mask_neon(const char *ptr) +{ + uint8x16_t chunk = vld1q_u8((const unsigned char *)ptr); + + // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33 + // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/ + const uint8x16_t too_low_or_dbl_quote = vcltq_u8(veorq_u8(chunk, vdupq_n_u8(2)), vdupq_n_u8(33)); + + uint8x16_t has_backslash = vceqq_u8(chunk, vdupq_n_u8('\\')); + uint8x16_t needs_escape = vorrq_u8(too_low_or_dbl_quote, has_backslash); + return neon_match_mask(needs_escape); +} + +static inline FORCE_INLINE int string_scan_simd_neon(const char **ptr, const char *end, uint64_t *mask) +{ + while(*ptr + sizeof(uint8x16_t) <= end) { + uint64_t chunk_mask = compute_chunk_mask_neon(*ptr); + if (chunk_mask) { + *mask = chunk_mask; + return 1; + } + *ptr += sizeof(uint8x16_t); + } + return 0; +} + +uint8x16x4_t load_uint8x16_4(const unsigned char *table) { + uint8x16x4_t tab; + tab.val[0] = vld1q_u8(table); + tab.val[1] = vld1q_u8(table+16); + tab.val[2] = vld1q_u8(table+32); + tab.val[3] = vld1q_u8(table+48); + return tab; +} + +#endif /* ARM Neon Support.*/ + +#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) + +#ifdef HAVE_X86INTRIN_H +#include + +#define HAVE_SIMD 1 +#define HAVE_SIMD_SSE2 1 + +#ifdef 
HAVE_CPUID_H +#define FIND_SIMD_IMPLEMENTATION_DEFINED 1 + +#if defined(__clang__) || defined(__GNUC__) +#define TARGET_SSE2 __attribute__((target("sse2"))) +#else +#define TARGET_SSE2 +#endif + +#define _mm_cmpge_epu8(a, b) _mm_cmpeq_epi8(_mm_max_epu8(a, b), a) +#define _mm_cmple_epu8(a, b) _mm_cmpge_epu8(b, a) +#define _mm_cmpgt_epu8(a, b) _mm_xor_si128(_mm_cmple_epu8(a, b), _mm_set1_epi8(-1)) +#define _mm_cmplt_epu8(a, b) _mm_cmpgt_epu8(b, a) + +static inline TARGET_SSE2 FORCE_INLINE int compute_chunk_mask_sse2(const char *ptr) +{ + __m128i chunk = _mm_loadu_si128((__m128i const*)ptr); + // Trick: c < 32 || c == 34 can be factored as c ^ 2 < 33 + // https://lemire.me/blog/2025/04/13/detect-control-characters-quotes-and-backslashes-efficiently-using-swar/ + __m128i too_low_or_dbl_quote = _mm_cmplt_epu8(_mm_xor_si128(chunk, _mm_set1_epi8(2)), _mm_set1_epi8(33)); + __m128i has_backslash = _mm_cmpeq_epi8(chunk, _mm_set1_epi8('\\')); + __m128i needs_escape = _mm_or_si128(too_low_or_dbl_quote, has_backslash); + return _mm_movemask_epi8(needs_escape); +} + +static inline TARGET_SSE2 FORCE_INLINE int string_scan_simd_sse2(const char **ptr, const char *end, int *mask) +{ + while (*ptr + sizeof(__m128i) <= end) { + int chunk_mask = compute_chunk_mask_sse2(*ptr); + if (chunk_mask) { + *mask = chunk_mask; + return 1; + } + *ptr += sizeof(__m128i); + } + + return 0; +} + +#include +#endif /* HAVE_CPUID_H */ + +static inline SIMD_Implementation find_simd_implementation(void) { + // TODO Revisit. I think the SSE version now only uses SSE2 instructions. 
+ if (__builtin_cpu_supports("sse2")) { + return SIMD_SSE2; + } + + return SIMD_NONE; +} + +#endif /* HAVE_X86INTRIN_H */ +#endif /* X86_64 Support */ + +#endif /* JSON_ENABLE_SIMD */ + +#ifndef FIND_SIMD_IMPLEMENTATION_DEFINED +static inline SIMD_Implementation find_simd_implementation(void) { + return SIMD_NONE; +} +#endif diff --git a/ext/json/ext/vendor/fpconv.c b/ext/json/ext/vendor/fpconv.c index 2887c648..75efd46f 100644 --- a/ext/json/ext/vendor/fpconv.c +++ b/ext/json/ext/vendor/fpconv.c @@ -41,7 +41,7 @@ typedef struct Fp { int exp; } Fp; -static Fp powers_ten[] = { +static const Fp powers_ten[] = { { 18054884314459144840U, -1220 }, { 13451937075301367670U, -1193 }, { 10022474136428063862U, -1166 }, { 14934650266808366570U, -1140 }, { 11127181549972568877U, -1113 }, { 16580792590934885855U, -1087 }, @@ -92,7 +92,7 @@ static Fp find_cachedpow10(int exp, int* k) { const double one_log_ten = 0.30102999566398114; - int approx = -(exp + npowers) * one_log_ten; + int approx = (int)(-(exp + npowers) * one_log_ten); int idx = (approx - firstpower) / steppowers; while(1) { @@ -123,7 +123,7 @@ static Fp find_cachedpow10(int exp, int* k) #define absv(n) ((n) < 0 ? -(n) : (n)) #define minv(a, b) ((a) < (b) ? 
(a) : (b)) -static uint64_t tens[] = { +static const uint64_t tens[] = { 10000000000000000000U, 1000000000000000000U, 100000000000000000U, 10000000000000000U, 1000000000000000U, 100000000000000U, 10000000000000U, 1000000000000U, 100000000000U, @@ -244,7 +244,7 @@ static int generate_digits(Fp* fp, Fp* upper, Fp* lower, char* digits, int* K) uint64_t part2 = upper->frac & (one.frac - 1); int idx = 0, kappa = 10; - uint64_t* divp; + const uint64_t* divp; /* 1000000000 */ for(divp = tens + 10; kappa > 0; divp++) { @@ -268,7 +268,7 @@ static int generate_digits(Fp* fp, Fp* upper, Fp* lower, char* digits, int* K) } /* 10 */ - uint64_t* unit = tens + 18; + const uint64_t* unit = tens + 18; while(true) { part2 *= 10; @@ -340,7 +340,7 @@ static int emit_digits(char* digits, int ndigits, char* dest, int K, bool neg) } /* write decimal w/o scientific notation */ - if(K < 0 && (K > -7 || exp < 4)) { + if(K < 0 && (K > -7 || exp < 10)) { int offset = ndigits - absv(K); /* fp < 1.0 -> write leading zero */ if(offset <= 0) { @@ -432,8 +432,8 @@ static int filter_special(double fp, char* dest) * * Input: * fp -> the double to convert, dest -> destination buffer. - * The generated string will never be longer than 24 characters. - * Make sure to pass a pointer to at least 24 bytes of memory. + * The generated string will never be longer than 28 characters. + * Make sure to pass a pointer to at least 28 bytes of memory. * The emitted string will not be null terminated. 
* * Output: @@ -443,7 +443,7 @@ static int filter_special(double fp, char* dest) * * void print(double d) * { - * char buf[24 + 1] // plus null terminator + * char buf[28 + 1] // plus null terminator * int str_len = fpconv_dtoa(d, buf); * * buf[str_len] = '\0'; @@ -451,7 +451,7 @@ static int filter_special(double fp, char* dest) * } * */ -static int fpconv_dtoa(double d, char dest[24]) +static int fpconv_dtoa(double d, char dest[28]) { char digits[18]; diff --git a/java/src/json/ext/OptionsReader.java b/java/src/json/ext/OptionsReader.java index c372eb1a..829e36b3 100644 --- a/java/src/json/ext/OptionsReader.java +++ b/java/src/json/ext/OptionsReader.java @@ -53,6 +53,13 @@ IRubyObject get(String key) { return opts == null ? null : opts.fastARef(runtime.newSymbol(key)); } + boolean hasKey(String key) { + if (opts == null) { + return false; + } + return opts.hasKey(runtime.newSymbol(key)); + } + boolean getBool(String key, boolean defaultValue) { IRubyObject value = get(key); return value == null ? 
defaultValue : value.isTrue(); diff --git a/java/src/json/ext/ParserConfig.java b/java/src/json/ext/ParserConfig.java index 692b3dfa..ccfc558e 100644 --- a/java/src/json/ext/ParserConfig.java +++ b/java/src/json/ext/ParserConfig.java @@ -54,6 +54,8 @@ public class ParserConfig extends RubyObject { private int maxNesting; private boolean allowNaN; private boolean allowTrailingComma; + private boolean allowDuplicateKey; + private boolean deprecateDuplicateKey; private boolean symbolizeNames; private boolean freeze; private RubyProc onLoadProc; @@ -177,6 +179,14 @@ public IRubyObject initialize(ThreadContext context, IRubyObject options) { this.allowNaN = opts.getBool("allow_nan", false); this.allowTrailingComma = opts.getBool("allow_trailing_comma", false); this.symbolizeNames = opts.getBool("symbolize_names", false); + if (opts.hasKey("allow_duplicate_key")) { + this.allowDuplicateKey = opts.getBool("allow_duplicate_key", false); + this.deprecateDuplicateKey = false; + } else { + this.allowDuplicateKey = false; + this.deprecateDuplicateKey = true; + } + this.freeze = opts.getBool("freeze", false); this.onLoadProc = opts.getProc("on_load"); @@ -280,19 +290,23 @@ private ParserSession(ParserConfig config, RubyString source, ThreadContext cont this.decoder = new StringDecoder(); } - private RaiseException unexpectedToken(ThreadContext context, int absStart, int absEnd) { + private RaiseException parsingError(ThreadContext context, String message, int absStart, int absEnd) { RubyString msg = context.runtime.newString("unexpected token at '") .cat(data, absStart, Math.min(absEnd - absStart, 32)) .cat((byte)'\''); return newException(context, Utils.M_PARSER_ERROR, msg); } + private RaiseException unexpectedToken(ThreadContext context, int absStart, int absEnd) { + return parsingError(context, "unexpected token at '", absStart, absEnd); + } + -// line 314 "ParserConfig.rl" +// line 328 "ParserConfig.rl" -// line 296 "ParserConfig.java" +// line 310 "ParserConfig.java" 
private static byte[] init__JSON_value_actions_0() { return new byte [] { @@ -406,7 +420,7 @@ private static byte[] init__JSON_value_from_state_actions_0() static final int JSON_value_en_main = 1; -// line 420 "ParserConfig.rl" +// line 434 "ParserConfig.rl" void parseValue(ThreadContext context, ParserResult res, int p, int pe) { @@ -414,14 +428,14 @@ void parseValue(ThreadContext context, ParserResult res, int p, int pe) { IRubyObject result = null; -// line 418 "ParserConfig.java" +// line 432 "ParserConfig.java" { cs = JSON_value_start; } -// line 427 "ParserConfig.rl" +// line 441 "ParserConfig.rl" -// line 425 "ParserConfig.java" +// line 439 "ParserConfig.java" { int _klen; int _trans = 0; @@ -447,13 +461,13 @@ void parseValue(ThreadContext context, ParserResult res, int p, int pe) { while ( _nacts-- > 0 ) { switch ( _JSON_value_actions[_acts++] ) { case 9: -// line 405 "ParserConfig.rl" +// line 419 "ParserConfig.rl" { p--; { p += 1; _goto_targ = 5; if (true) continue _goto;} } break; -// line 457 "ParserConfig.java" +// line 471 "ParserConfig.java" } } @@ -516,25 +530,25 @@ else if ( data[p] > _JSON_value_trans_keys[_mid+1] ) switch ( _JSON_value_actions[_acts++] ) { case 0: -// line 322 "ParserConfig.rl" +// line 336 "ParserConfig.rl" { result = context.nil; } break; case 1: -// line 325 "ParserConfig.rl" +// line 339 "ParserConfig.rl" { result = context.fals; } break; case 2: -// line 328 "ParserConfig.rl" +// line 342 "ParserConfig.rl" { result = context.tru; } break; case 3: -// line 331 "ParserConfig.rl" +// line 345 "ParserConfig.rl" { if (config.allowNaN) { result = getConstant(CONST_NAN); @@ -544,7 +558,7 @@ else if ( data[p] > _JSON_value_trans_keys[_mid+1] ) } break; case 4: -// line 338 "ParserConfig.rl" +// line 352 "ParserConfig.rl" { if (config.allowNaN) { result = getConstant(CONST_INFINITY); @@ -554,7 +568,7 @@ else if ( data[p] > _JSON_value_trans_keys[_mid+1] ) } break; case 5: -// line 345 "ParserConfig.rl" +// line 359 "ParserConfig.rl" 
{ if (pe > p + 8 && absSubSequence(p, p + 9).equals(JSON_MINUS_INFINITY)) { @@ -583,7 +597,7 @@ else if ( data[p] > _JSON_value_trans_keys[_mid+1] ) } break; case 6: -// line 371 "ParserConfig.rl" +// line 385 "ParserConfig.rl" { parseString(context, res, p, pe); if (res.result == null) { @@ -596,7 +610,7 @@ else if ( data[p] > _JSON_value_trans_keys[_mid+1] ) } break; case 7: -// line 381 "ParserConfig.rl" +// line 395 "ParserConfig.rl" { currentNesting++; parseArray(context, res, p, pe); @@ -611,7 +625,7 @@ else if ( data[p] > _JSON_value_trans_keys[_mid+1] ) } break; case 8: -// line 393 "ParserConfig.rl" +// line 407 "ParserConfig.rl" { currentNesting++; parseObject(context, res, p, pe); @@ -625,7 +639,7 @@ else if ( data[p] > _JSON_value_trans_keys[_mid+1] ) } } break; -// line 629 "ParserConfig.java" +// line 643 "ParserConfig.java" } } } @@ -645,7 +659,7 @@ else if ( data[p] > _JSON_value_trans_keys[_mid+1] ) break; } } -// line 428 "ParserConfig.rl" +// line 442 "ParserConfig.rl" if (cs >= JSON_value_first_final && result != null) { if (config.freeze) { @@ -658,7 +672,7 @@ else if ( data[p] > _JSON_value_trans_keys[_mid+1] ) } -// line 662 "ParserConfig.java" +// line 676 "ParserConfig.java" private static byte[] init__JSON_integer_actions_0() { return new byte [] { @@ -757,7 +771,7 @@ private static byte[] init__JSON_integer_trans_actions_0() static final int JSON_integer_en_main = 1; -// line 450 "ParserConfig.rl" +// line 464 "ParserConfig.rl" void parseInteger(ThreadContext context, ParserResult res, int p, int pe) { @@ -774,15 +788,15 @@ int parseIntegerInternal(int p, int pe) { int cs; -// line 778 "ParserConfig.java" +// line 792 "ParserConfig.java" { cs = JSON_integer_start; } -// line 466 "ParserConfig.rl" +// line 480 "ParserConfig.rl" int memo = p; -// line 786 "ParserConfig.java" +// line 800 "ParserConfig.java" { int _klen; int _trans = 0; @@ -863,13 +877,13 @@ else if ( data[p] > _JSON_integer_trans_keys[_mid+1] ) switch ( 
_JSON_integer_actions[_acts++] ) { case 0: -// line 444 "ParserConfig.rl" +// line 458 "ParserConfig.rl" { p--; { p += 1; _goto_targ = 5; if (true) continue _goto;} } break; -// line 873 "ParserConfig.java" +// line 887 "ParserConfig.java" } } } @@ -889,7 +903,7 @@ else if ( data[p] > _JSON_integer_trans_keys[_mid+1] ) break; } } -// line 468 "ParserConfig.rl" +// line 482 "ParserConfig.rl" if (cs < JSON_integer_first_final) { return -1; @@ -909,7 +923,7 @@ RubyInteger bytesToInum(Ruby runtime, ByteList num) { } -// line 913 "ParserConfig.java" +// line 927 "ParserConfig.java" private static byte[] init__JSON_float_actions_0() { return new byte [] { @@ -1011,7 +1025,7 @@ private static byte[] init__JSON_float_trans_actions_0() static final int JSON_float_en_main = 1; -// line 501 "ParserConfig.rl" +// line 515 "ParserConfig.rl" void parseFloat(ThreadContext context, ParserResult res, int p, int pe) { @@ -1030,15 +1044,15 @@ int parseFloatInternal(int p, int pe) { int cs; -// line 1034 "ParserConfig.java" +// line 1048 "ParserConfig.java" { cs = JSON_float_start; } -// line 519 "ParserConfig.rl" +// line 533 "ParserConfig.rl" int memo = p; -// line 1042 "ParserConfig.java" +// line 1056 "ParserConfig.java" { int _klen; int _trans = 0; @@ -1119,13 +1133,13 @@ else if ( data[p] > _JSON_float_trans_keys[_mid+1] ) switch ( _JSON_float_actions[_acts++] ) { case 0: -// line 492 "ParserConfig.rl" +// line 506 "ParserConfig.rl" { p--; { p += 1; _goto_targ = 5; if (true) continue _goto;} } break; -// line 1129 "ParserConfig.java" +// line 1143 "ParserConfig.java" } } } @@ -1145,7 +1159,7 @@ else if ( data[p] > _JSON_float_trans_keys[_mid+1] ) break; } } -// line 521 "ParserConfig.rl" +// line 535 "ParserConfig.rl" if (cs < JSON_float_first_final) { return -1; @@ -1155,7 +1169,7 @@ else if ( data[p] > _JSON_float_trans_keys[_mid+1] ) } -// line 1159 "ParserConfig.java" +// line 1173 "ParserConfig.java" private static byte[] init__JSON_string_actions_0() { return new byte [] { 
@@ -1257,7 +1271,7 @@ private static byte[] init__JSON_string_trans_actions_0() static final int JSON_string_en_main = 1; -// line 560 "ParserConfig.rl" +// line 574 "ParserConfig.rl" void parseString(ThreadContext context, ParserResult res, int p, int pe) { @@ -1265,15 +1279,15 @@ void parseString(ThreadContext context, ParserResult res, int p, int pe) { IRubyObject result = null; -// line 1269 "ParserConfig.java" +// line 1283 "ParserConfig.java" { cs = JSON_string_start; } -// line 567 "ParserConfig.rl" +// line 581 "ParserConfig.rl" int memo = p; -// line 1277 "ParserConfig.java" +// line 1291 "ParserConfig.java" { int _klen; int _trans = 0; @@ -1354,7 +1368,7 @@ else if ( data[p] > _JSON_string_trans_keys[_mid+1] ) switch ( _JSON_string_actions[_acts++] ) { case 0: -// line 535 "ParserConfig.rl" +// line 549 "ParserConfig.rl" { int offset = byteList.begin(); ByteList decoded = decoder.decode(context, byteList, memo + 1 - offset, @@ -1369,13 +1383,13 @@ else if ( data[p] > _JSON_string_trans_keys[_mid+1] ) } break; case 1: -// line 548 "ParserConfig.rl" +// line 562 "ParserConfig.rl" { p--; { p += 1; _goto_targ = 5; if (true) continue _goto;} } break; -// line 1379 "ParserConfig.java" +// line 1393 "ParserConfig.java" } } } @@ -1395,7 +1409,7 @@ else if ( data[p] > _JSON_string_trans_keys[_mid+1] ) break; } } -// line 569 "ParserConfig.rl" +// line 583 "ParserConfig.rl" if (cs >= JSON_string_first_final && result != null) { if (result instanceof RubyString) { @@ -1416,7 +1430,7 @@ else if ( data[p] > _JSON_string_trans_keys[_mid+1] ) } -// line 1420 "ParserConfig.java" +// line 1434 "ParserConfig.java" private static byte[] init__JSON_array_actions_0() { return new byte [] { @@ -1583,7 +1597,7 @@ private static byte[] init__JSON_array_trans_actions_0() static final int JSON_array_en_main = 1; -// line 623 "ParserConfig.rl" +// line 637 "ParserConfig.rl" void parseArray(ThreadContext context, ParserResult res, int p, int pe) { @@ -1597,14 +1611,14 @@ void 
parseArray(ThreadContext context, ParserResult res, int p, int pe) { IRubyObject result = RubyArray.newArray(context.runtime); -// line 1601 "ParserConfig.java" +// line 1615 "ParserConfig.java" { cs = JSON_array_start; } -// line 636 "ParserConfig.rl" +// line 650 "ParserConfig.rl" -// line 1608 "ParserConfig.java" +// line 1622 "ParserConfig.java" { int _klen; int _trans = 0; @@ -1647,7 +1661,7 @@ else if ( _widec > _JSON_array_cond_keys[_mid+1] ) case 0: { _widec = 65536 + (data[p] - 0); if ( -// line 594 "ParserConfig.rl" +// line 608 "ParserConfig.rl" config.allowTrailingComma ) _widec += 65536; break; } @@ -1717,7 +1731,7 @@ else if ( _widec > _JSON_array_trans_keys[_mid+1] ) switch ( _JSON_array_actions[_acts++] ) { case 0: -// line 596 "ParserConfig.rl" +// line 610 "ParserConfig.rl" { parseValue(context, res, p, pe); if (res.result == null) { @@ -1730,13 +1744,13 @@ else if ( _widec > _JSON_array_trans_keys[_mid+1] ) } break; case 1: -// line 607 "ParserConfig.rl" +// line 621 "ParserConfig.rl" { p--; { p += 1; _goto_targ = 5; if (true) continue _goto;} } break; -// line 1740 "ParserConfig.java" +// line 1754 "ParserConfig.java" } } } @@ -1756,7 +1770,7 @@ else if ( _widec > _JSON_array_trans_keys[_mid+1] ) break; } } -// line 637 "ParserConfig.rl" +// line 651 "ParserConfig.rl" if (cs >= JSON_array_first_final) { res.update(config.onLoad(context, result), p + 1); @@ -1766,7 +1780,7 @@ else if ( _widec > _JSON_array_trans_keys[_mid+1] ) } -// line 1770 "ParserConfig.java" +// line 1784 "ParserConfig.java" private static byte[] init__JSON_object_actions_0() { return new byte [] { @@ -1943,7 +1957,7 @@ private static byte[] init__JSON_object_trans_actions_0() static final int JSON_object_en_main = 1; -// line 694 "ParserConfig.rl" +// line 721 "ParserConfig.rl" void parseObject(ThreadContext context, ParserResult res, int p, int pe) { @@ -1960,14 +1974,14 @@ void parseObject(ThreadContext context, ParserResult res, int p, int pe) { IRubyObject result = 
RubyHash.newHash(context.runtime); -// line 1964 "ParserConfig.java" +// line 1978 "ParserConfig.java" { cs = JSON_object_start; } -// line 710 "ParserConfig.rl" +// line 737 "ParserConfig.rl" -// line 1971 "ParserConfig.java" +// line 1985 "ParserConfig.java" { int _klen; int _trans = 0; @@ -2010,7 +2024,7 @@ else if ( _widec > _JSON_object_cond_keys[_mid+1] ) case 0: { _widec = 65536 + (data[p] - 0); if ( -// line 651 "ParserConfig.rl" +// line 665 "ParserConfig.rl" config.allowTrailingComma ) _widec += 65536; break; } @@ -2080,7 +2094,7 @@ else if ( _widec > _JSON_object_trans_keys[_mid+1] ) switch ( _JSON_object_actions[_acts++] ) { case 0: -// line 653 "ParserConfig.rl" +// line 667 "ParserConfig.rl" { parseValue(context, res, p, pe); if (res.result == null) { @@ -2093,7 +2107,7 @@ else if ( _widec > _JSON_object_trans_keys[_mid+1] ) } break; case 1: -// line 664 "ParserConfig.rl" +// line 678 "ParserConfig.rl" { parseString(context, res, p, pe); if (res.result == null) { @@ -2106,18 +2120,31 @@ else if ( _widec > _JSON_object_trans_keys[_mid+1] ) } else { lastName = name; } + + if (!config.allowDuplicateKey) { + if (((RubyHash)result).hasKey(lastName)) { + if (config.deprecateDuplicateKey) { + context.runtime.getWarnings().warning( + "detected duplicate keys in JSON object. 
This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`" + ); + } else { + throw parsingError(context, "duplicate key", p, pe); + } + } + } + {p = (( res.p))-1;} } } break; case 2: -// line 680 "ParserConfig.rl" +// line 707 "ParserConfig.rl" { p--; { p += 1; _goto_targ = 5; if (true) continue _goto;} } break; -// line 2121 "ParserConfig.java" +// line 2148 "ParserConfig.java" } } } @@ -2137,7 +2164,7 @@ else if ( _widec > _JSON_object_trans_keys[_mid+1] ) break; } } -// line 711 "ParserConfig.rl" +// line 738 "ParserConfig.rl" if (cs < JSON_object_first_final) { res.update(null, p + 1); @@ -2148,7 +2175,7 @@ else if ( _widec > _JSON_object_trans_keys[_mid+1] ) } -// line 2152 "ParserConfig.java" +// line 2179 "ParserConfig.java" private static byte[] init__JSON_actions_0() { return new byte [] { @@ -2251,7 +2278,7 @@ private static byte[] init__JSON_trans_actions_0() static final int JSON_en_main = 1; -// line 740 "ParserConfig.rl" +// line 767 "ParserConfig.rl" public IRubyObject parseImplementation(ThreadContext context) { @@ -2261,16 +2288,16 @@ public IRubyObject parseImplementation(ThreadContext context) { ParserResult res = new ParserResult(); -// line 2265 "ParserConfig.java" +// line 2292 "ParserConfig.java" { cs = JSON_start; } -// line 749 "ParserConfig.rl" +// line 776 "ParserConfig.rl" p = byteList.begin(); pe = p + byteList.length(); -// line 2274 "ParserConfig.java" +// line 2301 "ParserConfig.java" { int _klen; int _trans = 0; @@ -2351,7 +2378,7 @@ else if ( data[p] > _JSON_trans_keys[_mid+1] ) switch ( _JSON_actions[_acts++] ) { case 0: -// line 726 "ParserConfig.rl" +// line 753 "ParserConfig.rl" { parseValue(context, res, p, pe); if (res.result == null) { @@ -2363,7 +2390,7 @@ else if ( data[p] > _JSON_trans_keys[_mid+1] ) } } break; -// line 2367 "ParserConfig.java" +// line 2394 "ParserConfig.java" } } } @@ -2383,7 +2410,7 @@ else if ( data[p] > _JSON_trans_keys[_mid+1] ) break; } } -// line 752 "ParserConfig.rl" +// 
line 779 "ParserConfig.rl" if (cs >= JSON_first_final && p == pe) { return result; diff --git a/java/src/json/ext/ParserConfig.rl b/java/src/json/ext/ParserConfig.rl index ce0d9438..4bc5d93b 100644 --- a/java/src/json/ext/ParserConfig.rl +++ b/java/src/json/ext/ParserConfig.rl @@ -52,6 +52,8 @@ public class ParserConfig extends RubyObject { private int maxNesting; private boolean allowNaN; private boolean allowTrailingComma; + private boolean allowDuplicateKey; + private boolean deprecateDuplicateKey; private boolean symbolizeNames; private boolean freeze; private RubyProc onLoadProc; @@ -175,6 +177,14 @@ public class ParserConfig extends RubyObject { this.allowNaN = opts.getBool("allow_nan", false); this.allowTrailingComma = opts.getBool("allow_trailing_comma", false); this.symbolizeNames = opts.getBool("symbolize_names", false); + if (opts.hasKey("allow_duplicate_key")) { + this.allowDuplicateKey = opts.getBool("allow_duplicate_key", false); + this.deprecateDuplicateKey = false; + } else { + this.allowDuplicateKey = false; + this.deprecateDuplicateKey = true; + } + this.freeze = opts.getBool("freeze", false); this.onLoadProc = opts.getProc("on_load"); @@ -278,13 +288,17 @@ public class ParserConfig extends RubyObject { this.decoder = new StringDecoder(); } - private RaiseException unexpectedToken(ThreadContext context, int absStart, int absEnd) { + private RaiseException parsingError(ThreadContext context, String message, int absStart, int absEnd) { RubyString msg = context.runtime.newString("unexpected token at '") .cat(data, absStart, Math.min(absEnd - absStart, 32)) .cat((byte)'\''); return newException(context, Utils.M_PARSER_ERROR, msg); } + private RaiseException unexpectedToken(ThreadContext context, int absStart, int absEnd) { + return parsingError(context, "unexpected token at '", absStart, absEnd); + } + %%{ machine JSON_common; @@ -673,6 +687,19 @@ public class ParserConfig extends RubyObject { } else { lastName = name; } + + if 
(!config.allowDuplicateKey) { + if (((RubyHash)result).hasKey(lastName)) { + if (config.deprecateDuplicateKey) { + context.runtime.getWarnings().warning( + "detected duplicate keys in JSON object. This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true`" + ); + } else { + throw parsingError(context, "duplicate key", p, pe); + } + } + } + fexec res.p; } } diff --git a/json.gemspec b/json.gemspec index 943c78aa..55757310 100644 --- a/json.gemspec +++ b/json.gemspec @@ -44,15 +44,14 @@ spec = Gem::Specification.new do |s| "LEGAL", "README.md", "json.gemspec", - *Dir["lib/**/*.rb"], - ] + ] + Dir.glob("lib/**/*.rb", base: File.expand_path("..", __FILE__)) if java_ext s.platform = 'java' s.files += Dir["lib/json/ext/**/*.jar"] else s.extensions = Dir["ext/json/**/extconf.rb"] - s.files += Dir["ext/json/**/*.{c,h}"] + s.files += Dir["ext/json/**/*.{c,h,rb}"] end end diff --git a/lib/json.rb b/lib/json.rb index dfd9b7df..735f2380 100644 --- a/lib/json.rb +++ b/lib/json.rb @@ -127,6 +127,24 @@ # # --- # +# Option +allow_duplicate_key+ specifies whether duplicate keys in objects +# should be ignored or cause an error to be raised: +# +# When not specified: +# # The last value is used and a deprecation warning emitted. +# JSON.parse('{"a": 1, "a":2}') => {"a" => 2} +# # warning: detected duplicate keys in JSON object. +# # This will raise an error in json 3.0 unless enabled via `allow_duplicate_key: true` +# +# When set to +true+: +# # The last value is used. +# JSON.parse('{"a": 1, "a":2}') => {"a" => 2} +# +# When set to +false+, the future default: +# JSON.parse('{"a": 1, "a":2}') => duplicate key at line 1 column 1 (JSON::ParserError) +# +# --- +# # Option +allow_nan+ (boolean) specifies whether to allow # NaN, Infinity, and MinusInfinity in +source+; # defaults to +false+. 
@@ -143,8 +161,23 @@ # ruby = JSON.parse(source, {allow_nan: true}) # ruby # => [NaN, Infinity, -Infinity] # +# --- +# +# Option +allow_trailing_comma+ (boolean) specifies whether to allow +# trailing commas in objects and arrays; +# defaults to +false+. +# +# With the default, +false+: +# JSON.parse('[1,]') # unexpected character: ']' at line 1 column 4 (JSON::ParserError) +# +# When enabled: +# JSON.parse('[1,]', allow_trailing_comma: true) # => [1] +# # ====== Output Options # +# Option +freeze+ (boolean) specifies whether the returned objects will be frozen; +# defaults to +false+. +# # Option +symbolize_names+ (boolean) specifies whether returned \Hash keys # should be Symbols; # defaults to +false+ (use Strings). diff --git a/lib/json/common.rb b/lib/json/common.rb index 77d024de..486ec62a 100644 --- a/lib/json/common.rb +++ b/lib/json/common.rb @@ -172,7 +172,7 @@ def generator=(generator) # :nodoc: end end self.state = generator::State - const_set :State, self.state + const_set :State, state ensure $VERBOSE = old end @@ -230,7 +230,9 @@ def self.create_id class JSONError < StandardError; end # This exception is raised if a parser error occurs. - class ParserError < JSONError; end + class ParserError < JSONError + attr_reader :line, :column + end # This exception is raised if the nesting of parsed data structures is too # deep. @@ -266,7 +268,7 @@ def detailed_message(...) # to string interpolation. # # Note: no validation is performed on the provided string. It is the - # responsability of the caller to ensure the string contains valid JSON. + # responsibility of the caller to ensure the string contains valid JSON. 
Fragment = Struct.new(:json) do def initialize(json) unless string = String.try_convert(json) @@ -488,7 +490,7 @@ def fast_generate(obj, opts = nil) # } # def pretty_generate(obj, opts = nil) - return state.generate(obj) if State === opts + return opts.generate(obj) if State === opts options = PRETTY_GENERATE_OPTIONS @@ -961,6 +963,24 @@ def restore(...) load(...) end module_function :restore + + class << self + private + + def const_missing(const_name) + case const_name + when :PRETTY_STATE_PROTOTYPE + if RUBY_VERSION >= "3.0" + warn "JSON::PRETTY_STATE_PROTOTYPE is deprecated and will be removed in json 3.0.0, just use JSON.pretty_generate", uplevel: 1, category: :deprecated + else + warn "JSON::PRETTY_STATE_PROTOTYPE is deprecated and will be removed in json 3.0.0, just use JSON.pretty_generate", uplevel: 1 + end + state.new(PRETTY_GENERATE_OPTIONS) + else + super + end + end + end # :startdoc: # JSON::Coder holds a parser and generator configuration. @@ -1052,7 +1072,7 @@ def j(*objs) end objs.each do |obj| - puts JSON::generate(obj, :allow_nan => true, :max_nesting => false) + puts JSON.generate(obj, :allow_nan => true, :max_nesting => false) end nil end @@ -1067,7 +1087,7 @@ def jj(*objs) end objs.each do |obj| - puts JSON::pretty_generate(obj, :allow_nan => true, :max_nesting => false) + puts JSON.pretty_generate(obj, :allow_nan => true, :max_nesting => false) end nil end diff --git a/lib/json/ext.rb b/lib/json/ext.rb index 1db5ea12..5bacc5e3 100644 --- a/lib/json/ext.rb +++ b/lib/json/ext.rb @@ -34,12 +34,12 @@ def parse if RUBY_ENGINE == 'truffleruby' require 'json/truffle_ruby/generator' - JSON.generator = ::JSON::TruffleRuby::Generator + JSON.generator = JSON::TruffleRuby::Generator else require 'json/ext/generator' JSON.generator = Generator end end - JSON_LOADED = true unless defined?(::JSON::JSON_LOADED) + JSON_LOADED = true unless defined?(JSON::JSON_LOADED) end diff --git a/lib/json/version.rb b/lib/json/version.rb index 1fa83370..15ebd12f 100644 
--- a/lib/json/version.rb +++ b/lib/json/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module JSON - VERSION = '2.11.1' + VERSION = '2.12.2' end diff --git a/test/json/json_common_interface_test.rb b/test/json/json_common_interface_test.rb index 9c125513..745400fa 100644 --- a/test/json/json_common_interface_test.rb +++ b/test/json/json_common_interface_test.rb @@ -91,6 +91,30 @@ def test_fast_generate def test_pretty_generate assert_equal "[\n 1,\n 2,\n 3\n]", JSON.pretty_generate([ 1, 2, 3 ]) + assert_equal <<~JSON.strip, JSON.pretty_generate({ a: { b: "f"}, c: "d"}) + { + "a": { + "b": "f" + }, + "c": "d" + } + JSON + + # Cause the state to be spilled on the heap. + o = Object.new + def o.to_s + "Object" + end + actual = JSON.pretty_generate({ a: { b: o}, c: "d", e: "f"}) + assert_equal <<~JSON.strip, actual + { + "a": { + "b": "Object" + }, + "c": "d", + "e": "f" + } + JSON end def test_load diff --git a/test/json/json_encoding_test.rb b/test/json/json_encoding_test.rb index afffd897..873e96fd 100644 --- a/test/json/json_encoding_test.rb +++ b/test/json/json_encoding_test.rb @@ -1,4 +1,5 @@ # frozen_string_literal: true + require_relative 'test_helper' class JSONEncodingTest < Test::Unit::TestCase @@ -37,7 +38,7 @@ def test_unicode assert_equal '"\u001f"', 0x1f.chr.to_json assert_equal '" "', ' '.to_json assert_equal "\"#{0x7f.chr}\"", 0x7f.chr.to_json - utf8 = [ "© ≠ €! \01" ] + utf8 = ["© ≠ €! \01"] json = '["© ≠ €! 
\u0001"]' assert_equal json, utf8.to_json(ascii_only: false) assert_equal utf8, parse(json) @@ -78,10 +79,10 @@ def test_chars json = '"\u%04x"' % i i = i.chr assert_equal i, parse(json)[0] - if i == ?\b + if i == "\b" generated = generate(i) - assert '"\b"' == generated || '"\10"' == generated - elsif [?\n, ?\r, ?\t, ?\f].include?(i) + assert ['"\b"', '"\10"'].include?(generated) + elsif ["\n", "\r", "\t", "\f"].include?(i) assert_equal i.dump, generate(i) elsif i.chr < 0x20.chr assert_equal json, generate(i) @@ -92,4 +93,179 @@ def test_chars end assert_equal "\302\200", parse('"\u0080"') end + + def test_deeply_nested_structures + # Test for deeply nested arrays + nesting_level = 100 + deeply_nested = [] + current = deeply_nested + + (nesting_level - 1).times do + current << [] + current = current[0] + end + + json = generate(deeply_nested) + assert_equal deeply_nested, parse(json) + + # Test for deeply nested objects/hashes + deeply_nested_hash = {} + current_hash = deeply_nested_hash + + (nesting_level - 1).times do |i| + current_hash["key#{i}"] = {} + current_hash = current_hash["key#{i}"] + end + + json = generate(deeply_nested_hash) + assert_equal deeply_nested_hash, parse(json) + end + + def test_very_large_json_strings + # Create a large array with repeated elements + large_array = Array.new(10_000) { |i| "item#{i}" } + + json = generate(large_array) + parsed = parse(json) + + assert_equal large_array.size, parsed.size + assert_equal large_array.first, parsed.first + assert_equal large_array.last, parsed.last + + # Create a large hash + large_hash = {} + 10_000.times { |i| large_hash["key#{i}"] = "value#{i}" } + + json = generate(large_hash) + parsed = parse(json) + + assert_equal large_hash.size, parsed.size + assert_equal large_hash["key0"], parsed["key0"] + assert_equal large_hash["key9999"], parsed["key9999"] + end + + def test_invalid_utf8_sequences + # Create strings with invalid UTF-8 sequences + invalid_utf8 = "\xFF\xFF" + + # Test that generating 
JSON with invalid UTF-8 raises an error + # Different JSON implementations may handle this differently, + # so we'll check if any exception is raised + begin + generate(invalid_utf8) + raise "Expected an exception when generating JSON with invalid UTF8" + rescue StandardError => e + assert true + assert_match(%r{source sequence is illegal/malformed utf-8}, e.message) + end + end + + def test_surrogate_pair_handling + # Test valid surrogate pairs + assert_equal "\u{10000}", parse('"\ud800\udc00"') + assert_equal "\u{10FFFF}", parse('"\udbff\udfff"') + + # The existing test already checks for orphaned high surrogate + assert_raise(JSON::ParserError) { parse('"\ud800"') } + + # Test generating surrogate pairs + utf8_string = "\u{10437}" + generated = generate(utf8_string, ascii_only: true) + assert_match(/\\ud801\\udc37/, generated) + end + + def test_json_escaping_edge_cases + # Test escaping forward slashes + assert_equal "/", parse('"\/"') + + # Test escaping backslashes + assert_equal "\\", parse('"\\\\"') + + # Test escaping quotes + assert_equal '"', parse('"\\""') + + # Multiple escapes in sequence - different JSON parsers might handle escaped forward slashes differently + # Some parsers preserve the escaping, others don't + escaped_result = parse('"\\\\\\"\\/"') + assert_match(/\\"/, escaped_result) + assert_match(%r{/}, escaped_result) + + # Generate string with all special characters + special_chars = "\b\f\n\r\t\"\\" + escaped_json = generate(special_chars) + assert_equal special_chars, parse(escaped_json) + end + + def test_empty_objects_and_arrays + # Test empty objects with different encodings + assert_equal({}, parse('{}')) + assert_equal({}, parse('{}'.encode(Encoding::UTF_16BE))) + assert_equal({}, parse('{}'.encode(Encoding::UTF_16LE))) + assert_equal({}, parse('{}'.encode(Encoding::UTF_32BE))) + assert_equal({}, parse('{}'.encode(Encoding::UTF_32LE))) + + # Test empty arrays with different encodings + assert_equal([], parse('[]')) + assert_equal([], 
parse('[]'.encode(Encoding::UTF_16BE))) + assert_equal([], parse('[]'.encode(Encoding::UTF_16LE))) + assert_equal([], parse('[]'.encode(Encoding::UTF_32BE))) + assert_equal([], parse('[]'.encode(Encoding::UTF_32LE))) + + # Test generating empty objects and arrays + assert_equal '{}', generate({}) + assert_equal '[]', generate([]) + end + + def test_null_character_handling + # Test parsing null character + assert_equal "\u0000", parse('"\u0000"') + + # Test generating null character + string_with_null = "\u0000" + generated = generate(string_with_null) + assert_equal '"\u0000"', generated + + # Test null characters in middle of string + mixed_string = "before\u0000after" + generated = generate(mixed_string) + assert_equal mixed_string, parse(generated) + end + + def test_whitespace_handling + # Test parsing with various whitespace patterns + assert_equal({}, parse(' { } ')) + assert_equal({}, parse("{\r\n}")) + assert_equal([], parse(" [ \n ] ")) + assert_equal(["a", "b"], parse(" [ \n\"a\",\r\n \"b\"\n ] ")) + assert_equal({ "a" => "b" }, parse(" { \n\"a\" \r\n: \t\"b\"\n } ")) + + # Test with excessive whitespace + excessive_whitespace = " \n\r\t" * 10 + "{}" + " \n\r\t" * 10 + assert_equal({}, parse(excessive_whitespace)) + + # Mixed whitespace in keys and values + mixed_json = '{"a \n b":"c \r\n d"}' + assert_equal({ "a \n b" => "c \r\n d" }, parse(mixed_json)) + end + + def test_control_character_handling + # Test all control characters (U+0000 to U+001F) + (0..0x1F).each do |i| + # Skip already tested ones + next if [0x08, 0x0A, 0x0D, 0x0C, 0x09].include?(i) + + control_char = i.chr('UTF-8') + escaped_json = '"' + "\\u%04x" % i + '"' + assert_equal control_char, parse(escaped_json) + + # Check that the character is properly escaped when generating + assert_match(/\\u00[0-1][0-9a-f]/, generate(control_char)) + end + + # Test string with multiple control characters + control_str = "\u0001\u0002\u0003\u0004" + generated = generate(control_str) + assert_equal 
control_str, parse(generated) + assert_match(/\\u0001\\u0002\\u0003\\u0004/, generated) + end end diff --git a/test/json/json_ext_parser_test.rb b/test/json/json_ext_parser_test.rb index 8aa62625..e610f642 100644 --- a/test/json/json_ext_parser_test.rb +++ b/test/json/json_ext_parser_test.rb @@ -14,16 +14,35 @@ def test_allocate end def test_error_messages - ex = assert_raise(ParserError) { parse('Infinity') } - assert_equal "unexpected token at 'Infinity'", ex.message + ex = assert_raise(ParserError) { parse('Infinity something') } + unless RUBY_PLATFORM =~ /java/ + assert_equal "unexpected token 'Infinity' at line 1 column 1", ex.message + end + ex = assert_raise(ParserError) { parse('foo bar') } unless RUBY_PLATFORM =~ /java/ - ex = assert_raise(ParserError) { parse('-Infinity') } - assert_equal "unexpected token at '-Infinity'", ex.message + assert_equal "unexpected token 'foo' at line 1 column 1", ex.message end - ex = assert_raise(ParserError) { parse('NaN') } - assert_equal "unexpected token at 'NaN'", ex.message + ex = assert_raise(ParserError) { parse('-Infinity something') } + unless RUBY_PLATFORM =~ /java/ + assert_equal "unexpected token '-Infinity' at line 1 column 1", ex.message + end + + ex = assert_raise(ParserError) { parse('NaN something') } + unless RUBY_PLATFORM =~ /java/ + assert_equal "unexpected token 'NaN' at line 1 column 1", ex.message + end + + ex = assert_raise(ParserError) { parse(' ') } + unless RUBY_PLATFORM =~ /java/ + assert_equal "unexpected end of input at line 1 column 4", ex.message + end + + ex = assert_raise(ParserError) { parse('{ ') } + unless RUBY_PLATFORM =~ /java/ + assert_equal "expected object key, got EOF at line 1 column 5", ex.message + end end if GC.respond_to?(:stress=) diff --git a/test/json/json_generator_test.rb b/test/json/json_generator_test.rb index f87006ac..914b3f4e 100755 --- a/test/json/json_generator_test.rb +++ b/test/json/json_generator_test.rb @@ -122,6 +122,22 @@ def test_generate_pretty assert_equal 
'666', pretty_generate(666) end + def test_generate_pretty_custom + state = State.new(:space_before => "", :space => "", :indent => "", :object_nl => "\n\n", :array_nl => "") + json = pretty_generate({1=>{}, 2=>['a','b'], 3=>4}, state) + assert_equal(<<~'JSON'.chomp, json) + { + + "1":{}, + + "2":["a","b"], + + "3":4 + + } + JSON + end + def test_generate_custom state = State.new(:space_before => " ", :space => " ", :indent => "", :object_nl => "\n", :array_nl => "") json = generate({1=>{2=>3,4=>[5,6]}}, state) @@ -410,18 +426,34 @@ def test_backslash json = '["\\\\.(?i:gif|jpe?g|png)$"]' assert_equal json, generate(data) # - data = [ '\\"' ] - json = '["\\\\\""]' + data = [ '\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$\\.(?i:gif|jpe?g|png)$' ] + json = '["\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$\\\\.(?i:gif|jpe?g|png)$"]' + assert_equal json, generate(data) + # + data = [ '\\"\\"\\"\\"\\"\\"\\"\\"\\"\\"\\"' ] + json = '["\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\"\\\\\""]' assert_equal json, generate(data) # data = [ '/' ] json = '["/"]' assert_equal json, generate(data) # + data = [ '////////////////////////////////////////////////////////////////////////////////////' ] + json = '["////////////////////////////////////////////////////////////////////////////////////"]' + assert_equal json, generate(data) + # data = [ '/' ] json = '["\/"]' assert_equal json, generate(data, :script_safe => true) # + data = [ '///////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # + data = [ 
'///////////////////////////////////////////////////////' ] + json = '["\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/"]' + assert_equal json, generate(data, :script_safe => true) + # data = [ "\u2028\u2029" ] json = '["\u2028\u2029"]' assert_equal json, generate(data, :script_safe => true) @@ -438,6 +470,10 @@ def test_backslash json = '["\""]' assert_equal json, generate(data) # + data = ['"""""""""""""""""""""""""'] + json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]' + assert_equal json, generate(data) + # data = ["'"] json = '["\\\'"]' assert_equal '["\'"]', generate(data) @@ -445,6 +481,72 @@ def test_backslash data = ["倩", "瀨"] json = '["倩","瀨"]' assert_equal json, generate(data, script_safe: true) + # + data = '["This is a "test" of the emergency broadcast system."]' + json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\"" + assert_equal json, generate(data) + # + data = '\tThis is a test of the emergency broadcast system.' + json = "\"\\\\tThis is a test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This\tis a test of the emergency broadcast system.' + json = "\"This\\\\tis a test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This is\ta test of the emergency broadcast system.' + json = "\"This is\\\\ta test of the emergency broadcast system.\"" + assert_equal json, generate(data) + # + data = 'This is a test of the emergency broadcast\tsystem.' 
+ json = "\"This is a test of the emergency broadcast\\\\tsystem.\"" + assert_equal json, generate(data) + # + data = 'This is a test of the emergency broadcast\tsystem.\n' + json = "\"This is a test of the emergency broadcast\\\\tsystem.\\\\n\"" + assert_equal json, generate(data) + data = '"' * 15 + json = "\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\"" + assert_equal json, generate(data) + data = "\"\"\"\"\"\"\"\"\"\"\"\"\"\"a" + json = "\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"a\"" + assert_equal json, generate(data) + data = "\u0001\u0001\u0001\u0001" + json = "\"\\u0001\\u0001\\u0001\\u0001\"" + assert_equal json, generate(data) + data = "\u0001a\u0001a\u0001a\u0001a" + json = "\"\\u0001a\\u0001a\\u0001a\\u0001a\"" + assert_equal json, generate(data) + data = "\u0001aa\u0001aa" + json = "\"\\u0001aa\\u0001aa\"" + assert_equal json, generate(data) + data = "\u0001aa\u0001aa\u0001aa" + json = "\"\\u0001aa\\u0001aa\\u0001aa\"" + assert_equal json, generate(data) + data = "\u0001aa\u0001aa\u0001aa\u0001aa\u0001aa\u0001aa" + json = "\"\\u0001aa\\u0001aa\\u0001aa\\u0001aa\\u0001aa\\u0001aa\"" + assert_equal json, generate(data) + data = "\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002" + json = "\"\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\"" + assert_equal json, generate(data) + data = "ab\u0002c" + json = "\"ab\\u0002c\"" + assert_equal json, generate(data) + data = "ab\u0002cab\u0002cab\u0002cab\u0002c" + json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002c\"" + assert_equal json, generate(data) + data = "ab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002c" + json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002c\"" + assert_equal json, generate(data) + data = "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f" + json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\"" + assert_equal json, 
generate(data) + data = "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b" + json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\"" + assert_equal json, generate(data) + data = "a\n\t\f\b\n\t\f\b\n\t\f\b\n\t" + json = "\"a\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\"" + assert_equal json, generate(data) end def test_string_subclass @@ -605,6 +707,22 @@ def test_string_subclass_with_to_s assert_equal '{"JSONGeneratorTest::StringWithToS#to_s":1}', JSON.generate(StringWithToS.new => 1) end + def test_string_subclass_with_broken_to_s + klass = Class.new(String) do + def to_s + false + end + end + s = klass.new("test") + assert_equal '["test"]', JSON.generate([s]) + + omit("Can't figure out how to match behavior in java code") if RUBY_PLATFORM == "java" + + assert_raise TypeError do + JSON.generate(s => 1) + end + end + if defined?(JSON::Ext::Generator) and RUBY_PLATFORM != "java" def test_valid_utf8_in_different_encoding utf8_string = "€™" @@ -669,6 +787,19 @@ def test_json_generate_float values = [-1.0, 1.0, 0.0, 12.2, 7.5 / 3.2, 12.0, 100.0, 1000.0] expecteds = ["-1.0", "1.0", "0.0", "12.2", "2.34375", "12.0", "100.0", "1000.0"] + if RUBY_ENGINE == "jruby" + values << 1746861937.7842371 + expecteds << "1.7468619377842371E9" + else + values << 1746861937.7842371 + expecteds << "1746861937.7842371" + end + + if RUBY_ENGINE == "ruby" + values << -2.2471348024634545e-08 << -2.2471348024634545e-09 << -2.2471348024634545e-10 + expecteds << "-0.000000022471348024634545" << "-0.0000000022471348024634545" << "-2.2471348024634546e-10" + end + values.zip(expecteds).each do |value, expected| assert_equal expected, value.to_json end diff --git a/test/json/json_parser_test.rb b/test/json/json_parser_test.rb index 87b78fb0..106492e1 100644 --- a/test/json/json_parser_test.rb +++ b/test/json/json_parser_test.rb @@ -331,6 +331,15 @@ def test_parse_big_integers assert_equal orig, parse(json5) end + def test_parse_duplicate_key + expected = {"a" => 2} + assert_equal expected, parse('{"a": 1, 
"a": 2}', allow_duplicate_key: true) + assert_raise(ParserError) { parse('{"a": 1, "a": 2}', allow_duplicate_key: false) } + assert_deprecated_warning(/duplicate keys/) do + assert_equal expected, parse('{"a": 1, "a": 2}') + end + end + def test_some_wrong_inputs assert_raise(ParserError) { parse('[] bla') } assert_raise(ParserError) { parse('[] 1') } @@ -460,6 +469,90 @@ def test_backslash json = '["\/"]' data = [ '/' ] assert_equal data, parse(json) + + data = ['"""""""""""""""""""""""""'] + json = '["\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\"\""]' + assert_equal data, parse(json) + + data = '["This is a "test" of the emergency broadcast system."]' + json = "\"[\\\"This is a \\\"test\\\" of the emergency broadcast system.\\\"]\"" + assert_equal data, parse(json) + + data = '\tThis is a test of the emergency broadcast system.' + json = "\"\\\\tThis is a test of the emergency broadcast system.\"" + assert_equal data, parse(json) + + data = 'This\tis a test of the emergency broadcast system.' + json = "\"This\\\\tis a test of the emergency broadcast system.\"" + assert_equal data, parse(json) + + data = 'This is\ta test of the emergency broadcast system.' + json = "\"This is\\\\ta test of the emergency broadcast system.\"" + assert_equal data, parse(json) + + data = 'This is a test of the emergency broadcast\tsystem.' 
+ json = "\"This is a test of the emergency broadcast\\\\tsystem.\"" + assert_equal data, parse(json) + + data = 'This is a test of the emergency broadcast\tsystem.\n' + json = "\"This is a test of the emergency broadcast\\\\tsystem.\\\\n\"" + assert_equal data, parse(json) + + data = '"' * 15 + json = "\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\"" + assert_equal data, parse(json) + + data = "\"\"\"\"\"\"\"\"\"\"\"\"\"\"a" + json = "\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"\\\"a\"" + assert_equal data, parse(json) + + data = "\u0001\u0001\u0001\u0001" + json = "\"\\u0001\\u0001\\u0001\\u0001\"" + assert_equal data, parse(json) + + data = "\u0001a\u0001a\u0001a\u0001a" + json = "\"\\u0001a\\u0001a\\u0001a\\u0001a\"" + assert_equal data, parse(json) + + data = "\u0001aa\u0001aa" + json = "\"\\u0001aa\\u0001aa\"" + assert_equal data, parse(json) + + data = "\u0001aa\u0001aa\u0001aa" + json = "\"\\u0001aa\\u0001aa\\u0001aa\"" + assert_equal data, parse(json) + + data = "\u0001aa\u0001aa\u0001aa\u0001aa\u0001aa\u0001aa" + json = "\"\\u0001aa\\u0001aa\\u0001aa\\u0001aa\\u0001aa\\u0001aa\"" + assert_equal data, parse(json) + + data = "\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002\u0001a\u0002" + json = "\"\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\\u0001a\\u0002\"" + assert_equal data, parse(json) + + data = "ab\u0002c" + json = "\"ab\\u0002c\"" + assert_equal data, parse(json) + + data = "ab\u0002cab\u0002cab\u0002cab\u0002c" + json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002c\"" + assert_equal data, parse(json) + + data = "ab\u0002cab\u0002cab\u0002cab\u0002cab\u0002cab\u0002c" + json = "\"ab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002cab\\u0002c\"" + assert_equal data, parse(json) + + data = "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f" + json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\"" + assert_equal data, parse(json) + + data 
= "\n\t\f\b\n\t\f\b\n\t\f\b\n\t\f\b" + json = "\"\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\"" + assert_equal data, parse(json) + + data = "a\n\t\f\b\n\t\f\b\n\t\f\b\n\t" + json = "\"a\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\\f\\b\\n\\t\"" + assert_equal data, parse(json) end class SubArray < Array @@ -638,7 +731,7 @@ def test_parse_error_message_length error = assert_raise(JSON::ParserError) do JSON.parse('{"foo": ' + ('A' * 500) + '}') end - assert_operator 60, :>, error.message.bytesize + assert_operator 80, :>, error.message.bytesize end def test_parse_error_incomplete_hash @@ -646,7 +739,7 @@ def test_parse_error_incomplete_hash JSON.parse('{"input":{"firstName":"Bob","lastName":"Mob","email":"bob@example.com"}') end if RUBY_ENGINE == "ruby" - assert_equal %(expected ',' or '}' after object value, got: ''), error.message + assert_equal %(expected ',' or '}' after object value, got: EOF at line 1 column 72), error.message end end @@ -654,16 +747,16 @@ def test_parse_error_snippet omit "C ext only test" unless RUBY_ENGINE == "ruby" error = assert_raise(JSON::ParserError) { JSON.parse("あああああああああああああああああああああああ") } - assert_equal "unexpected character: 'ああああああああああ'", error.message + assert_equal "unexpected character: 'ああああああああああ' at line 1 column 1", error.message error = assert_raise(JSON::ParserError) { JSON.parse("aあああああああああああああああああああああああ") } - assert_equal "unexpected character: 'aああああああああああ'", error.message + assert_equal "unexpected character: 'aああああああああああ' at line 1 column 1", error.message error = assert_raise(JSON::ParserError) { JSON.parse("abあああああああああああああああああああああああ") } - assert_equal "unexpected character: 'abあああああああああ'", error.message + assert_equal "unexpected character: 'abあああああああああ' at line 1 column 1", error.message error = assert_raise(JSON::ParserError) { JSON.parse("abcあああああああああああああああああああああああ") } - assert_equal "unexpected character: 'abcあああああああああ'", error.message + assert_equal "unexpected character: 'abcあああああああああ' at line 1 column 1", 
error.message end def test_parse_leading_slash diff --git a/test/json/ractor_test.rb b/test/json/ractor_test.rb index f857c9a8..dda34c64 100644 --- a/test/json/ractor_test.rb +++ b/test/json/ractor_test.rb @@ -8,6 +8,16 @@ end class JSONInRactorTest < Test::Unit::TestCase + unless Ractor.method_defined?(:value) + module RactorBackport + refine Ractor do + alias_method :value, :take + end + end + + using RactorBackport + end + def test_generate pid = fork do r = Ractor.new do @@ -25,7 +35,7 @@ def test_generate end expected_json = JSON.parse('{"a":2,"b":3.141,"c":"c","d":[1,"b",3.14],"e":{"foo":"bar"},' + '"g":"\\"\\u0000\\u001f","h":1000.0,"i":0.001}') - actual_json = r.take + actual_json = r.value if expected_json == actual_json exit 0