Skip to content

Commit fc1ddcd

Browse files
authored
Faster case-insensitive comparisons. (simdjson#837)
* Faster case-insensitive comparisons.
1 parent e7f774f commit fc1ddcd

File tree

3 files changed

+10
-4
lines changed

3 files changed

+10
-4
lines changed

include/simdjson/document.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,7 @@ class object : protected internal::tape_ref {
313313

314314
/**
315315
* Get the value associated with the given key in a case-insensitive manner.
316+
* The case-insensitive matching is only guaranteed to work for ASCII inputs.
316317
*
317318
* Note: The key will be matched against **unescaped** JSON.
318319
*

include/simdjson/inline/document.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -686,10 +686,10 @@ inline simdjson_result<element> object::at_key_case_insensitive(const std::strin
686686
for (iterator field = begin(); field != end_field; ++field) {
687687
auto field_key = field.key();
688688
if (key.length() == field_key.length()) {
689-
bool equal = true;
690-
for (size_t i=0; i<field_key.length(); i++) {
691-
equal = equal && std::tolower(key[i]) == std::tolower(field_key[i]);
692-
}
689+
// See "For case-insensitive string comparisons, avoid char-by-char functions":
690+
// https://lemire.me/blog/2020/04/30/for-case-insensitive-string-comparisons-avoid-char-by-char-functions/
691+
// Note that it might be worth rolling our own strncasecmp function, with vectorization.
692+
const bool equal = (simdjson_strncasecmp(key.data(), field_key.data(), key.length()) == 0);
693693
if (equal) { return field.value(); }
694694
}
695695
}

include/simdjson/portability.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,8 +147,13 @@ compiling for a known 64-bit platform."
147147
// regular visual studio and clang under visual studio.
148148
// clang under Windows has _stricmp (like visual studio) but not strcasecmp (as clang normally has)
149149
#define simdjson_strcasecmp _stricmp
150+
#define simdjson_strncasecmp _strnicmp
150151
#else
152+
// The strcasecmp, strncasecmp, and strcasestr functions do not work with multibyte strings (e.g. UTF-8).
153+
// So they are only useful for ASCII in our context.
154+
// https://www.gnu.org/software/libunistring/manual/libunistring.html#char-_002a-strings
151155
#define simdjson_strcasecmp strcasecmp
156+
#define simdjson_strncasecmp strncasecmp
152157
#endif
153158
154159
namespace simdjson {

0 commit comments

Comments
 (0)