@@ -1,4 +1,4 @@
-/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 /* begin file src/simdjson.cpp */
 #include "simdjson.h"
 
@@ -708,7 +708,7 @@ really_inline int leading_zeroes(uint64_t input_num) {
 
 /* result might be undefined when input_num is zero */
 really_inline int count_ones(uint64_t input_num) {
-  return vaddv_u8(vcnt_u8((uint8x8_t) input_num));
+  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }
 
 really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
@@ -736,11 +736,90 @@ really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *resu
 #endif // SIMDJSON_ARM64_BITMANIPULATION_H
 /* end file src/arm64/bitmanipulation.h */
 /* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */
+#include <type_traits>
+
 
 namespace simdjson {
 namespace arm64 {
 namespace simd {
 
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+namespace {
+/**
+ * make_uint8x16_t initializes a SIMD register (uint8x16_t).
+ * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
+ * is not recognized under Visual Studio! This is a workaround.
+ * Using a std::initializer_list<uint8_t> as a parameter resulted in
+ * inefficient code. With the current approach, if the parameters are
+ * compile-time constants,
+ * GNU GCC compiles it to ldr, the same as uint8x16_t x = {1,2,3...}.
+ * You should not use this function except for compile-time constants:
+ * it is not efficient.
+ */
+really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
+                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
+                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
+                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                      x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_u8(array);
+  uint8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_u8(x1, x, 0);
+  x = vsetq_lane_u8(x2, x, 1);
+  x = vsetq_lane_u8(x3, x, 2);
+  x = vsetq_lane_u8(x4, x, 3);
+  x = vsetq_lane_u8(x5, x, 4);
+  x = vsetq_lane_u8(x6, x, 5);
+  x = vsetq_lane_u8(x7, x, 6);
+  x = vsetq_lane_u8(x8, x, 7);
+  x = vsetq_lane_u8(x9, x, 8);
+  x = vsetq_lane_u8(x10, x, 9);
+  x = vsetq_lane_u8(x11, x, 10);
+  x = vsetq_lane_u8(x12, x, 11);
+  x = vsetq_lane_u8(x13, x, 12);
+  x = vsetq_lane_u8(x14, x, 13);
+  x = vsetq_lane_u8(x15, x, 14);
+  x = vsetq_lane_u8(x16, x, 15);
+  return x;
+}
+
+
+// We have to do the same work for make_int8x16_t
+really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
+                                       int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
+                                       int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
+                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                     x9, x10,x11,x12,x13,x14,x15,x16};
+  // return vld1q_s8(array);
+  int8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_s8(x1, x, 0);
+  x = vsetq_lane_s8(x2, x, 1);
+  x = vsetq_lane_s8(x3, x, 2);
+  x = vsetq_lane_s8(x4, x, 3);
+  x = vsetq_lane_s8(x5, x, 4);
+  x = vsetq_lane_s8(x6, x, 5);
+  x = vsetq_lane_s8(x7, x, 6);
+  x = vsetq_lane_s8(x8, x, 7);
+  x = vsetq_lane_s8(x9, x, 8);
+  x = vsetq_lane_s8(x10, x, 9);
+  x = vsetq_lane_s8(x11, x, 10);
+  x = vsetq_lane_s8(x12, x, 11);
+  x = vsetq_lane_s8(x13, x, 12);
+  x = vsetq_lane_s8(x14, x, 13);
+  x = vsetq_lane_s8(x15, x, 14);
+  x = vsetq_lane_s8(x16, x, 15);
+  return x;
+}
+
+} // namespace
+#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
+
+
 template<typename T>
 struct simd8;
 
@@ -792,8 +871,13 @@ namespace simd {
   // We return uint32_t instead of uint16_t because that seems to be more efficient for most
   // purposes (cutting it down to uint16_t costs performance in some compilers).
   really_inline uint32_t to_bitmask() const {
-    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                                0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+    const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                  0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
     auto minput = *this & bit_mask;
     uint8x16_t tmp = vpaddq_u8(minput, minput);
     tmp = vpaddq_u8(tmp, tmp);
@@ -818,13 +902,24 @@ namespace simd {
   // Splat constructor
   really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
   // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+  really_inline simd8(
+    uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
+    uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+  ) : simd8(make_uint8x16_t(
+    v0, v1, v2, v3, v4, v5, v6, v7,
+    v8, v9, v10,v11,v12,v13,v14,v15
+  )) {}
+#else
   really_inline simd8(
     uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
     uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
   ) : simd8(uint8x16_t{
     v0, v1, v2, v3, v4, v5, v6, v7,
     v8, v9, v10,v11,v12,v13,v14,v15
   }) {}
+#endif
+
   // Repeat 16 values as many times as necessary (usually for lookup tables)
   really_inline static simd8<uint8_t> repeat_16(
     uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
@@ -898,7 +993,11 @@ namespace simd {
     uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
     uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
     // we increment by 0x08 the second half of the mask
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
+#else
     uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
+#endif
     shufmask = vaddq_u8(shufmask, inc);
     // this is the version "nearly pruned"
     uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
@@ -955,13 +1054,23 @@ namespace simd {
   // Array constructor
   really_inline simd8(const int8_t* values) : simd8(load(values)) {}
   // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+  really_inline simd8(
+    int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
+    int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+  ) : simd8(make_int8x16_t(
+    v0, v1, v2, v3, v4, v5, v6, v7,
+    v8, v9, v10,v11,v12,v13,v14,v15
+  )) {}
+#else
   really_inline simd8(
     int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
     int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
   ) : simd8(int8x16_t{
     v0, v1, v2, v3, v4, v5, v6, v7,
     v8, v9, v10,v11,v12,v13,v14,v15
   }) {}
+#endif
   // Repeat 16 values as many times as necessary (usually for lookup tables)
   really_inline static simd8<int8_t> repeat_16(
     int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
@@ -977,8 +1086,14 @@ namespace simd {
   really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }
 
   // Explicit conversion to/from unsigned
+  //
+  // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
+  // In theory, we could check this occurrence with std::same_as and std::enable_if but it is C++14
+  // and relatively ugly and hard to read.
+#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
   really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
-  really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
+#endif
+  really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }
 
   // Math
   really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
@@ -1092,10 +1207,17 @@ namespace simd {
   }
 
   really_inline uint64_t to_bitmask() const {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    const uint8x16_t bit_mask = make_uint8x16_t(
+      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+      0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+    );
+#else
     const uint8x16_t bit_mask = {
       0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
       0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
     };
+#endif
     // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
     uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
     uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
@@ -5852,12 +5974,12 @@ struct value128 {
 };
 
 #if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) && \
-    !defined(_M_X64) // _umul128 for x86, arm, arm64
-#if defined(_M_ARM)
+    !defined(_M_X64) && !defined(_M_ARM64) // _umul128 for x86, arm
+// this is a slow emulation routine for 32-bit Windows
+//
 static inline uint64_t __emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }
-#endif
 static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
   uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
   uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
@@ -5873,8 +5995,14 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
 really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
   value128 answer;
 #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
-  answer.low = _umul128(value1, value2, &answer.high);
+#ifdef _M_ARM64
+  // ARM64 has native support for 64-bit multiplications, no need to emulate
+  answer.high = __umulh(value1, value2);
+  answer.low = value1 * value2;
 #else
+  answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
+#endif // _M_ARM64
+#else // SIMDJSON_REGULAR_VISUAL_STUDIO
   __uint128_t r = ((__uint128_t)value1) * value2;
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);
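
For readers following the full_multiplication() hunk above: on MSVC/ARM64 the high 64 bits of the product come from the __umulh intrinsic and the low 64 bits from an ordinary 64-bit multiply, which is the same split the __uint128_t branch computes on other compilers. Below is a minimal standalone sketch (not part of the commit) of that portable branch with one worked value; it assumes a GCC/Clang-style __uint128_t and assumes value128's fields are laid out as low then high.

#include <cstdint>
#include <cstdio>

struct value128 { uint64_t low; uint64_t high; };  // field order assumed to mirror simdjson's struct

// Portable branch: split a 64x64 -> 128-bit product into high and low words.
static value128 full_multiplication_portable(uint64_t value1, uint64_t value2) {
  __uint128_t r = ((__uint128_t)value1) * value2;
  return { uint64_t(r), uint64_t(r >> 64) };
}

int main() {
  // 0xFFFFFFFFFFFFFFFF * 2 = 0x1FFFFFFFFFFFFFFFE -> high = 1, low = 0xFFFFFFFFFFFFFFFE
  value128 p = full_multiplication_portable(~UINT64_C(0), 2);
  printf("high=%llu low=%llx\n", (unsigned long long)p.high, (unsigned long long)p.low);
  return 0;
}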