Better tests.

lemire · lemire · commit 92334a8e288c · 2019-09-02T12:32:44.000-04:00
diff --git a/Makefile b/Makefile
@@ -59,7 +59,7 @@ endif # ifeq ($(SANITIZE),1)
 endif # ifeq ($(MEMSANITIZE),1)
 
 MAINEXECUTABLES=parse minify json2json jsonstats statisticalmodel jsonpointer
-TESTEXECUTABLES=jsoncheck numberparsingcheck stringparsingcheck pointercheck
+TESTEXECUTABLES=jsoncheck integer_tests numberparsingcheck stringparsingcheck pointercheck
 COMPARISONEXECUTABLES=minifiercompetition parsingcompetition parseandstatcompetition distinctuseridcompetition allparserscheckfile allparsingcompetition
 SUPPLEMENTARYEXECUTABLES=parse_noutf8validation parse_nonumberparsing parse_nostringparsing
 
@@ -96,19 +96,21 @@ benchmark:
 	bash ./scripts/parser.sh
 	bash ./scripts/parseandstat.sh
 
-test: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
+test: jsoncheck integer_tests numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
 	./basictests
 	./numberparsingcheck
+	./integer_tests
 	./stringparsingcheck
 	./jsoncheck
 	./pointercheck
 	./scripts/testjson2json.sh
 	./scripts/issue150.sh
 	@echo "It looks like the code is good!"
 
-quiettest: jsoncheck numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
+quiettest: jsoncheck integer_tests numberparsingcheck stringparsingcheck basictests allparserscheckfile minify json2json pointercheck
 	./basictests
 	./numberparsingcheck
+	./integer_tests
 	./stringparsingcheck
 	./jsoncheck
 	./pointercheck
@@ -158,6 +160,10 @@ basictests:tests/basictests.cpp $(HEADERS) $(LIBFILES)
 numberparsingcheck:tests/numberparsingcheck.cpp $(HEADERS) $(LIBFILES)
 	$(CXX) $(CXXFLAGS) -o numberparsingcheck tests/numberparsingcheck.cpp  src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp  src/parsedjson.cpp       -I. $(LIBFLAGS) -DJSON_TEST_NUMBERS
 
+# Build the integer_tests executable from its test source plus the library sources.
+integer_tests: tests/integer_tests.cpp $(HEADERS) $(LIBFILES)
+	$(CXX) $(CXXFLAGS) -o $@ $< src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp src/stage2_build_tape.cpp src/parsedjson.cpp -I. $(LIBFLAGS)
+
 
 stringparsingcheck:tests/stringparsingcheck.cpp $(HEADERS) $(LIBFILES)
 	$(CXX) $(CXXFLAGS) -o stringparsingcheck tests/stringparsingcheck.cpp  src/jsonioutil.cpp src/jsonparser.cpp src/simdjson.cpp src/stage1_find_marks.cpp  src/parsedjson.cpp      -I. $(LIBFLAGS) -DJSON_TEST_STRINGS
diff --git a/README.md b/README.md
@@ -346,7 +346,7 @@ _We do not aim to provide a general-purpose JSON library._ A library like RapidJ
 ## Features
 
 - The input string is unmodified. (Parsers like sajson and RapidJSON use the input string as a buffer.)
-- We parse integers and floating-point numbers as separate types which allows us to support large 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When we cannot represent exactly an integer as a signed 64-bit value, we reject the JSON document.
+- We parse integers and floating-point numbers as separate types, which allows us to support large signed 64-bit integers in [-9223372036854775808,9223372036854775808), like a Java `long` or a C/C++ `long long`, as well as large unsigned 64-bit integers up to the value 18446744073709551615. Among the parsers that differentiate between integers and floating-point numbers, not all support 64-bit integers. (For example, sajson rejects JSON files with integers larger than or equal to 2147483648. RapidJSON will parse a file containing an overly long integer like 18446744073709551616 as a floating-point number.) When an integer cannot be represented exactly as a signed or unsigned 64-bit value, we reject the JSON document.
 - We support the full range of 64-bit floating-point numbers (binary64). The values range from ` std::numeric_limits<double>::lowest()`  to `std::numeric_limits<double>::max()`, so from -1.7976e308 all the way to 1.7975e308. Extreme values (less or equal to -1e308, greater or equal to 1e308) are rejected: we refuse to parse the input document.
 - We test for accurate float parsing with a bound on the [unit of least precision (ULP)](https://en.wikipedia.org/wiki/Unit_in_the_last_place) of one. Practically speaking, this implies 15 digits of accuracy or better.
 - We do full UTF-8 validation as part of the parsing. (Parsers like fastjson, gason and dropbox json11 do not do UTF-8 validation. The sajson parser does incomplete UTF-8 validation, accepting code point
diff --git a/include/simdjson/numberparsing.h b/include/simdjson/numberparsing.h
@@ -10,6 +10,7 @@
 #ifdef JSON_TEST_NUMBERS // for unit testing
 void found_invalid_number(const uint8_t *buf);
 void found_integer(int64_t result, const uint8_t *buf);
+void found_unsigned_integer(uint64_t result, const uint8_t *buf);
 void found_float(double result, const uint8_t *buf);
 #endif
 
@@ -370,33 +371,45 @@ static never_inline bool parse_large_integer(const uint8_t *const buf,
   }
   if (negative) {
     if (i > 0x8000000000000000) {
-// overflows!
+       // overflows!
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_invalid_number(buf + offset);
 #endif
       return false; // overflow
     } else if (i == 0x8000000000000000) {
+      // In two's complement, we cannot represent 0x8000000000000000
+      // as a positive signed integer, but the negative version is 
+      // possible.
       constexpr int64_t signed_answer = INT64_MIN;
       pj.write_tape_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
       found_integer(signed_answer, buf + offset);
 #endif
       return is_structural_or_whitespace(*p);
     }
-  } else {
+    int64_t signed_answer = -static_cast<int64_t>(i);
+    pj.write_tape_s64(signed_answer);
 #ifdef JSON_TEST_NUMBERS // for unit testing
-    found_integer(i, buf + offset);
+    found_integer(signed_answer, buf + offset);
 #endif
-    pj.write_tape_u64(i);
     return is_structural_or_whitespace(*p);
-  }
-  int64_t signed_answer =
-      negative ? -static_cast<int64_t>(i) : static_cast<int64_t>(i);
-  pj.write_tape_s64(signed_answer);
+  } else {
+    // we have a positive integer, the contract is that
+    // we try to represent it as a signed integer and only 
+    // fallback on unsigned integers if absolutely necessary.
+    if(i < 0x8000000000000000) {
 #ifdef JSON_TEST_NUMBERS // for unit testing
-  found_integer(signed_answer, buf + offset);
+      found_integer(i, buf + offset);
 #endif
-  return is_structural_or_whitespace(*p);
+      pj.write_tape_s64(i);
+    } else {
+#ifdef JSON_TEST_NUMBERS // for unit testing
+      found_unsigned_integer(i, buf + offset);
+#endif
+      pj.write_tape_u64(i);
+    }
+    return is_structural_or_whitespace(*p);
+  }
 }
 
 // parse the number at buf + offset
diff --git a/jsonchecker/pass22.json b/jsonchecker/pass22.json
@@ -0,0 +1 @@
+18446744073709551615
diff --git a/tape.md b/tape.md
@@ -66,7 +66,7 @@ $ ./json2json -d jsonexamples/small/demo.json
 
 ## General formal of the tape elements
 
-Most tape elements are written as `('c' << 56) + x` where `'c'` is some ASCII character determining the type of the element (out of 't', 'f', 'n', 'l', 'd', '"', '{', '}', '[', ']' ,'r') and where `x` is a 56-bit value called the payload. The payload is normally interpreted as an unsigned 56-bit integer. Note that 56-bit integers can be quite large.
+Most tape elements are written as `('c' << 56) + x` where `'c'` is some ASCII character determining the type of the element (out of 't', 'f', 'n', 'l', 'u', 'd', '"', '{', '}', '[', ']' ,'r') and where `x` is a 56-bit value called the payload. The payload is normally interpreted as an unsigned 56-bit integer. Note that 56-bit integers can be quite large.
 
 
 Performance consideration: We believe that accessing the tape in regular units of 64 bits is more important for performance than saving memory. 
@@ -84,6 +84,8 @@ Simple JSON nodes are represented with one tape element:
 
 Integer values are represented as two 64-bit tape elements:
 - The 64-bit value `('l' << 56)` followed by the 64-bit integer value litterally. Integer values are assumed to be signed 64-bit values, using two's complement notation.
+- The 64-bit value `('u' << 56)` followed by the 64-bit integer value literally. Such integer values are interpreted as unsigned 64-bit values; this form is used only for positive integers that do not fit in a signed 64-bit value.
+
 
 Float values are represented as two 64-bit tape elements:
 - The 64-bit value `('d' << 56)` followed by the 64-bit double value litterally in standard IEEE 754 notation.
diff --git a/tests/integer_tests.cpp b/tests/integer_tests.cpp
@@ -36,9 +36,38 @@ static void parse_and_validate(const std::string src, T expected) {
     result = expected == actual;
   }
   std::cout << std::boolalpha << "test: " << result << std::endl;
-  assert(result);
+  if(!result) {
+    std::cerr << "bug detected" << std::endl;   
+    throw std::runtime_error("bug");
+  }
+}
+
+// Parses `src`, moves the iterator with down() then next(), and returns true
+// iff that element is an integer number. Checks are explicit: assert() vanishes under NDEBUG.
+static bool parse_and_check_signed(const std::string src) {
+  std::cout << "src: " << src << ", expecting signed" << std::endl;
+  const padded_string pstr{src};
+  auto json = build_parsed_json(pstr);
+  if (!json.is_valid()) return false; // parsing failed: nothing to inspect
+  ParsedJson::Iterator it{json};
+  if (!it.down() || !it.next()) return false; // iterator navigation failed
+  return it.is_integer() && it.is_number();
+}
+
+// Parses `src`, moves the iterator with down() then next(), and returns true
+// iff that element is an unsigned integer number. Checks are explicit: assert() vanishes under NDEBUG.
+static bool parse_and_check_unsigned(const std::string src) {
+  std::cout << "src: " << src << ", expecting unsigned" << std::endl;
+  const padded_string pstr{src};
+  auto json = build_parsed_json(pstr);
+  if (!json.is_valid()) return false; // parsing failed: nothing to inspect
+  ParsedJson::Iterator it{json};
+  if (!it.down() || !it.next()) return false; // iterator navigation failed
+  return it.is_unsigned_integer() && it.is_number();
 }
 
+
+
 int main() {
   using std::numeric_limits;
   constexpr auto int64_max = numeric_limits<int64_t>::max();
@@ -49,8 +78,17 @@ int main() {
   parse_and_validate(make_json(int64_min), int64_min);
   parse_and_validate(make_json(uint64_max), uint64_max);
   parse_and_validate(make_json(uint64_min), uint64_min);
-
   constexpr auto int64_max_plus1 = static_cast<uint64_t>(int64_max) + 1;
   parse_and_validate(make_json(int64_max_plus1), int64_max_plus1);
+  if(!parse_and_check_signed(make_json(int64_max))) {
+    std::cerr << "bug: large signed integers should be represented as signed integers" << std::endl;
+    return EXIT_FAILURE;
+  }
+  if(!parse_and_check_unsigned(make_json(uint64_max))) {
+    std::cerr << "bug: a large unsigned integers is not represented as an unsigned integer" << std::endl;
+    return EXIT_FAILURE;
+  }
+  std::cout << "All ok." << std::endl;
+  return EXIT_SUCCESS;
 }
 
diff --git a/tests/numberparsingcheck.cpp b/tests/numberparsingcheck.cpp
@@ -90,8 +90,7 @@ void found_integer(int64_t result, const uint8_t *buf) {
   }
 }
 
-// TODO fix duplicated overload
-void found_integer(uint64_t result, const uint8_t *buf) {
+void found_unsigned_integer(uint64_t result, const uint8_t *buf) {
   int_count++;
   char *endptr;
   unsigned long long expected = strtoull((const char *)buf, &endptr, 10);

Original file line number	Diff line number	Diff line change
`@@ -90,8 +90,7 @@ void found_integer(int64_t result, const uint8_t *buf) {`
`90`	`90`	`}`
`91`	`91`	`}`
`92`	`92`
`93`		`-// TODO fix duplicated overload`
`94`		`-void found_integer(uint64_t result, const uint8_t *buf) {`
	`93`	`+void found_unsigned_integer(uint64_t result, const uint8_t *buf) {`
`95`	`94`	`int_count++;`
`96`	`95`	`char *endptr;`
`97`	`96`	`unsigned long long expected = strtoull((const char *)buf, &endptr, 10);`