Skip to content

Commit 37e6d1e

Browse files
authored
new number parsing (simdjson#1222)
* Remove our dependency on strtod_l by bundling our own slow path. * Ok. Let us drop strtod entirely. * Trimming down the powers to -342. * Removing useless line. * Many more comments. * Adding some DLL exports. * Let the gods help those who rely on windows+gcc. * Marking the subnormals as unlikely. This is pretty much "performance neutral", but it might help just a bit with twitter.json.
1 parent 1d99266 commit 37e6d1e

11 files changed

+1736
-864
lines changed

include/simdjson/common_defs.h

Lines changed: 6 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,12 @@ namespace internal {
1313
* Defined in src/to_chars
1414
*/
1515
char *to_chars(char *first, const char *last, double value);
16+
/**
17+
* @private
18+
* A number parsing routine.
19+
* Defined in src/from_chars
20+
*/
21+
double from_chars(const char *first) noexcept;
1622
}
1723

1824
#ifndef SIMDJSON_EXCEPTIONS
@@ -209,48 +215,4 @@ namespace std {
209215
#define SIMDJSON_TRY(EXPR) { auto _err = (EXPR); if (_err) { return _err; } }
210216

211217

212-
/**
213-
* We may fall back on the system's number parsing, and we want
214-
* to be able to call a locale-insensitive number parser. It unfortunately
215-
* means that we need to load up locale headers.
216-
* The locale.h header is generally available:
217-
*/
218-
#include <locale.h>
219-
/**
220-
* Determining whether we should import xlocale.h or not is
221-
* a bit of a nightmare. Visual Studio and recent GLIBC (GCC) do not need it.
222-
* However, FreeBSD and Apple platforms will need it.
223-
* And we would want to cover as many platforms as possible.
224-
*/
225-
#ifdef __has_include
226-
// This is the easy case: we have __has_include and can check whether
227-
// xlocale is available. If so, we load it up.
228-
#if __has_include(<xlocale.h>)
229-
#include <xlocale.h>
230-
#endif // __has_include
231-
#else // We do not have __has_include
232-
// Here we do not have __has_include
233-
// We first check for __GLIBC__
234-
#ifdef __GLIBC__ // If we have __GLIBC__ then we should have features.h which should help.
235-
// Note that having __GLIBC__ does not imply that we are compiling against glibc. But
236-
// we hope that any platform that defines __GLIBC__ will mimic glibc.
237-
#include <features.h>
238-
// Check whether we have an old GLIBC.
239-
#if !((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ > 25)))
240-
#include <xlocale.h> // Old glibc needs xlocale, otherwise xlocale is unavailable.
241-
#endif // !((__GLIBC__ > 2) || ((__GLIBC__ == 2) && (__GLIBC_MINOR__ > 25)))
242-
#else // __GLIBC__
243-
// Ok. So we do not have __GLIBC__
244-
// We assume that everything that is not GLIBC and not on old freebsd or windows
245-
// needs xlocale.
246-
// It is likely that recent FreeBSD and Apple platforms load xlocale.h next:
247-
#if !(defined(_WIN32) || (__FreeBSD_version < 1000010))
248-
#include <xlocale.h> // Will always happen under apple.
249-
#endif //
250-
#endif // __GLIBC__
251-
#endif // __has_include
252-
/**
253-
* End of the crazy locale headers.
254-
*/
255-
256218
#endif // SIMDJSON_COMMON_DEFS_H

include/simdjson/generic/numberparsing.h

Lines changed: 172 additions & 140 deletions
Large diffs are not rendered by default.

include/simdjson/internal/jsoncharutils_tables.h

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,6 @@ void found_bad_string(const uint8_t *buf);
1111

1212
namespace simdjson {
1313
namespace internal {
14-
15-
constexpr int FASTFLOAT_SMALLEST_POWER = -325;
16-
constexpr int FASTFLOAT_LARGEST_POWER = 308;
17-
18-
struct value128 {
19-
uint64_t low;
20-
uint64_t high;
21-
};
22-
2314
// structural chars here are
2415
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c (and NULL)
2516
// we are also interested in the four whitespace characters

include/simdjson/internal/numberparsing_tables.h

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,54 @@
55

66
namespace simdjson {
77
namespace internal {
8+
/**
9+
* The smallest non-zero float (binary64) is 2^−1074.
10+
* We take as input numbers of the form w x 10^q where w < 2^64.
11+
* We have that w * 10^-343 < 2^(64-343) * 5^-343 < 2^-1075 (half of 2^-1074), so it rounds to zero.
12+
* However, we have that
13+
* (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^−1074.
14+
* Thus it is possible for a number of the form w * 10^-342 where
15+
* w is a 64-bit value to be a non-zero floating-point number.
16+
*********
17+
* Any number of the form w * 10^309 where w >= 1 is going to be
18+
* infinite in binary64 so we never need to worry about powers
19+
* of 5 greater than 308.
20+
*/
21+
constexpr int smallest_power = -342;
22+
constexpr int largest_power = 308;
23+
24+
/**
25+
* Represents a 128-bit value.
26+
* low: least significant 64 bits.
27+
* high: most significant 64 bits.
28+
*/
29+
struct value128 {
30+
uint64_t low;
31+
uint64_t high;
32+
};
33+
834

935
// Precomputed powers of ten from 10^0 to 10^22. These
1036
// can be represented exactly using the double type.
1137
extern SIMDJSON_DLLIMPORTEXPORT const double power_of_ten[];
12-
// The mantissas of powers of ten from -308 to 308, extended out to sixty four
13-
// bits. The array contains the powers of ten approximated
14-
// as a 64-bit mantissa. It goes from 10^FASTFLOAT_SMALLEST_POWER to
15-
// 10^FASTFLOAT_LARGEST_POWER (inclusively).
16-
// The mantissa is truncated, and
17-
// never rounded up. Uses about 5KB.
18-
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t mantissa_64[];
19-
// A complement to mantissa_64
20-
// complete to a 128-bit mantissa.
21-
// Uses about 5KB but is rarely accessed.
22-
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t mantissa_128[];
2338

39+
40+
/**
41+
* When mapping numbers from decimal to binary,
42+
* we go from w * 10^q to m * 2^p but we have
43+
* 10^q = 5^q * 2^q, so effectively
44+
* we are trying to match
45+
* w * 2^q * 5^q to m * 2^p. Thus the powers of two
46+
* are not a concern since they can be represented
47+
* exactly using the binary notation, only the powers of five
48+
* affect the binary significand.
49+
*/
50+
51+
52+
// The truncated powers of five from 5^-342 all the way to 5^308
53+
// The mantissa is truncated to 128 bits, and
54+
// never rounded up. Uses about 10KB (651 entries of 16 bytes each).
55+
extern SIMDJSON_DLLIMPORTEXPORT const uint64_t power_of_five_128[];
2456
} // namespace internal
2557
} // namespace simdjson
2658

0 commit comments

Comments
 (0)