
Commit 3c3a4db

Compile under Visual Studio for ARM64 (simdjson#861)
* Modifications so that we can compile under Visual Studio for ARM64
* Let us throw appveyor at this beast.
1 parent 0e6bd22 commit 3c3a4db

File tree

7 files changed: +294 -27 lines changed

.appveyor.yml

Lines changed: 16 additions & 9 deletions

@@ -7,30 +7,37 @@ platform: x64
 environment:
   matrix:
     - job_name: VS2019
-      CMAKE_ARGS:
+      CMAKE_ARGS: -A %Platform%
     - job_name: VS2019CLANG
-      CMAKE_ARGS: -T ClangCL
+      CMAKE_ARGS: -A %Platform% -T ClangCL
+    - job_name: VS2019ARM
+      CMAKE_ARGS: -A ARM64 -DCMAKE_CROSSCOMPILING=1 -D SIMDJSON_GOOGLE_BENCHMARKS=OFF # Does Google Benchmark build under VS ARM?
     - job_name: VS2017 (Static, No Threads)
       image: Visual Studio 2017
-      CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
+      CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
       CTEST_ARGS: -E checkperf
     - job_name: VS2019 (Win32)
       platform: Win32
-      CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
+      CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
       CTEST_ARGS: -E checkperf

 build_script:
-  - set
   - mkdir build
   - cd build
   - cmake --version
-  - cmake -A %Platform% %CMAKE_ARGS% --parallel ..
+  - cmake %CMAKE_ARGS% --parallel ..
   - cmake -LH ..
   - cmake --build . --config %Configuration% --verbose --parallel

-test_script:
-  - ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
-
+for:
+  -
+    matrix:
+      except:
+        - job_name: VS2019ARM
+
+    test_script:
+      - ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
+
 clone_folder: c:\projects\simdjson

 matrix:
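Two things happen in this file. First, the `-A %Platform%` architecture flag moves out of build_script and into each job's CMAKE_ARGS, which lets the new VS2019ARM job substitute `-A ARM64` while the other jobs keep their platform. With the job's variables expanded, the ARM64 configure step amounts to (illustrative):

    cmake -A ARM64 -DCMAKE_CROSSCOMPILING=1 -D SIMDJSON_GOOGLE_BENCHMARKS=OFF --parallel ..

Second, test_script moves under a `for:` override that excludes VS2019ARM: the ARM64 binaries are cross-compiled on an x64 image, so they cannot run on the build host, while every other job still runs ctest as before.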

singleheader/amalgamate_demo.cpp

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-/* auto-generated on Mon 27 Apr 2020 21:20:37 EDT. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */

 #include <iostream>
 #include "simdjson.h"

singleheader/simdjson.cpp

Lines changed: 136 additions & 8 deletions

@@ -1,4 +1,4 @@
-/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 /* begin file src/simdjson.cpp */
 #include "simdjson.h"

@@ -708,7 +708,7 @@ really_inline int leading_zeroes(uint64_t input_num) {

 /* result might be undefined when input_num is zero */
 really_inline int count_ones(uint64_t input_num) {
-  return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
+  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }

 really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
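The change above (mirrored in src/arm64/bitmanipulation.h below) replaces a C-style cast from uint64_t to uint8x8_t with the vcreate_u8 intrinsic. GCC and Clang tolerate the cast as a vector-type conversion; Visual Studio does not, and vcreate_u8 is the portable way to reinterpret 64 bits as a vector of 8 bytes. A standalone sketch of the resulting popcount idiom (my illustration; the function name is hypothetical):

    #include <arm_neon.h>
    #include <cstdint>

    // Count set bits in a 64-bit word using NEON.
    int popcount64(uint64_t x) {
      uint8x8_t bytes  = vcreate_u8(x);   // reinterpret the 64 bits as 8 bytes
      uint8x8_t counts = vcnt_u8(bytes);  // per-byte population count
      return vaddv_u8(counts);            // horizontal sum across the 8 bytes
    }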
@@ -736,11 +736,90 @@ really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *resu
 #endif // SIMDJSON_ARM64_BITMANIPULATION_H
 /* end file src/arm64/bitmanipulation.h */
 /* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */
+#include <type_traits>

 namespace simdjson {
 namespace arm64 {
 namespace simd {

+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+namespace {
+/**
+ * make_uint8x16_t initializes a SIMD register (uint8x16_t).
+ * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
+ * is not recognized under Visual Studio! This is a workaround.
+ * Using a std::initializer_list<uint8_t> as a parameter resulted in
+ * inefficient code. With the current approach, if the parameters are
+ * compile-time constants, GNU GCC compiles it to ldr, the same as
+ * uint8x16_t x = {1,2,3...}.
+ * You should not use this function except for compile-time constants:
+ * it is not efficient.
+ */
+really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
+                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
+                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
+                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                      x9, x10, x11, x12, x13, x14, x15, x16};
+  // return vld1q_u8(array);
+  uint8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_u8(x1, x, 0);
+  x = vsetq_lane_u8(x2, x, 1);
+  x = vsetq_lane_u8(x3, x, 2);
+  x = vsetq_lane_u8(x4, x, 3);
+  x = vsetq_lane_u8(x5, x, 4);
+  x = vsetq_lane_u8(x6, x, 5);
+  x = vsetq_lane_u8(x7, x, 6);
+  x = vsetq_lane_u8(x8, x, 7);
+  x = vsetq_lane_u8(x9, x, 8);
+  x = vsetq_lane_u8(x10, x, 9);
+  x = vsetq_lane_u8(x11, x, 10);
+  x = vsetq_lane_u8(x12, x, 11);
+  x = vsetq_lane_u8(x13, x, 12);
+  x = vsetq_lane_u8(x14, x, 13);
+  x = vsetq_lane_u8(x15, x, 14);
+  x = vsetq_lane_u8(x16, x, 15);
+  return x;
+}
+
+// We have to do the same work for make_int8x16_t
+really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
+                                       int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
+                                       int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
+                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                     x9, x10, x11, x12, x13, x14, x15, x16};
+  // return vld1q_s8(array);
+  int8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_s8(x1, x, 0);
+  x = vsetq_lane_s8(x2, x, 1);
+  x = vsetq_lane_s8(x3, x, 2);
+  x = vsetq_lane_s8(x4, x, 3);
+  x = vsetq_lane_s8(x5, x, 4);
+  x = vsetq_lane_s8(x6, x, 5);
+  x = vsetq_lane_s8(x7, x, 6);
+  x = vsetq_lane_s8(x8, x, 7);
+  x = vsetq_lane_s8(x9, x, 8);
+  x = vsetq_lane_s8(x10, x, 9);
+  x = vsetq_lane_s8(x11, x, 10);
+  x = vsetq_lane_s8(x12, x, 11);
+  x = vsetq_lane_s8(x13, x, 12);
+  x = vsetq_lane_s8(x14, x, 13);
+  x = vsetq_lane_s8(x15, x, 14);
+  x = vsetq_lane_s8(x16, x, 15);
+  return x;
+}
+
+} // namespace
+#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
+
 template<typename T>
 struct simd8;

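A quick usage sketch (hypothetical call site, not part of the commit): with the helper in scope, a constant vector is spelled the same way under every compiler, and when all sixteen arguments are compile-time constants the chain of vsetq_lane_u8 calls folds into a single constant load, as the comment above notes for GCC.

    // Hypothetical example: an identity shuffle table built through the helper.
    uint8x16_t identity_table() {
      return make_uint8x16_t(0, 1, 2,  3,  4,  5,  6,  7,
                             8, 9, 10, 11, 12, 13, 14, 15);
    }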
@@ -792,8 +871,13 @@ namespace simd {
     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
     // purposes (cutting it down to uint16_t costs performance in some compilers).
     really_inline uint32_t to_bitmask() const {
-      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                                  0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
       auto minput = *this & bit_mask;
       uint8x16_t tmp = vpaddq_u8(minput, minput);
       tmp = vpaddq_u8(tmp, tmp);
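This bit_mask-and-pairwise-add pattern is NEON's substitute for SSE's _mm_movemask_epi8. A self-contained reconstruction of the idea (a sketch, assuming each input byte is 0x00 or 0xFF as NEON comparisons produce; simdjson's member function is the same computation on *this, and under regular Visual Studio the braced initializer would go through make_uint8x16_t as above):

    #include <arm_neon.h>
    #include <cstdint>

    uint32_t movemask_16bytes(uint8x16_t mask) {
      // Keep one distinct bit per lane: lane i contributes 1 << (i % 8).
      const uint8x16_t bit_mask = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                   0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
      uint8x16_t minput = vandq_u8(mask, bit_mask);
      // Three rounds of pairwise adds: bytes 0..7 sum into one byte (mask bits 0..7),
      // bytes 8..15 into the next (mask bits 8..15).
      uint8x16_t tmp = vpaddq_u8(minput, minput);
      tmp = vpaddq_u8(tmp, tmp);
      tmp = vpaddq_u8(tmp, tmp);
      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
    }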
@@ -818,13 +902,24 @@ namespace simd {
     // Splat constructor
     really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(make_uint8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15
+    )) {}
+#else
     really_inline simd8(
       uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
       uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
     ) : simd8(uint8x16_t{
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10, v11, v12, v13, v14, v15
     }) {}
+#endif
+
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<uint8_t> repeat_16(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
@@ -898,7 +993,11 @@ namespace simd {
       uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
       uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
       // we increment by 0x08 the second half of the mask
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
+#else
       uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
+#endif
       shufmask = vaddq_u8(shufmask, inc);
       // this is the version "nearly pruned"
       uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
@@ -955,13 +1054,23 @@ namespace simd {
     // Array constructor
     really_inline simd8(const int8_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(make_int8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15
+    )) {}
+#else
     really_inline simd8(
       int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
       int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
     ) : simd8(int8x16_t{
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10, v11, v12, v13, v14, v15
     }) {}
+#endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<int8_t> repeat_16(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
@@ -977,8 +1086,14 @@ namespace simd {
     really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }

     // Explicit conversion to/from unsigned
+    //
+    // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
+    // In theory, we could check this occurrence with std::same_as and std::enable_if but it is C++14
+    // and relatively ugly and hard to read.
+#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
     really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
-    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
+#endif
+    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }

     // Math
     really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
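The guard exists because of the situation the new comment describes: if uint8x16_t and int8x16_t name the same underlying type, as they apparently do under Visual Studio for ARM64, the uint8x16_t constructor would collide with the int8x16_t one that simd8 already has. A compiler-agnostic sketch of that collision, using plain typedefs as stand-ins (not the real MSVC headers):

    // Two typedef names, one underlying type -- as with MSVC's ARM64 vector types.
    typedef int vec_signed;    // stand-in for int8x16_t
    typedef int vec_unsigned;  // stand-in for uint8x16_t

    struct example {
      example(vec_signed) {}
      // example(vec_unsigned) {}  // error: redeclares example(int) with the same signature
    };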
@@ -1092,10 +1207,17 @@ namespace simd {
     }

     really_inline uint64_t to_bitmask() const {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+      );
+#else
       const uint8x16_t bit_mask = {
         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
       };
+#endif
       // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
       uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
       uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
@@ -5852,12 +5974,12 @@ struct value128 {
 };

 #if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) && \
-    !defined(_M_X64) // _umul128 for x86, arm, arm64
-#if defined(_M_ARM)
+    !defined(_M_X64) && !defined(_M_ARM64) // _umul128 for x86, arm
+// this is a slow emulation routine for 32-bit Windows
+//
 static inline uint64_t __emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }
-#endif
 static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
   uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
   uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
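The hunk shows only the first two partial products of the emulated _umul128; for context, a complete schoolbook 64x64-to-128 routine in this style looks as follows (a sketch with renamed helpers, umul128/emulu, so as not to collide with the real MSVC intrinsics; the committed routine follows this same algorithm):

    #include <cstdint>

    static inline uint64_t emulu(uint32_t x, uint32_t y) { return x * (uint64_t)y; }

    static inline uint64_t umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
      // Split ab = a:b and cd = c:d into 32-bit halves; product = ac<<64 + (ad+bc)<<32 + bd.
      uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd);
      uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd);
      uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32));
      uint64_t adbc_carry = (adbc < ad);        // the middle sum can wrap past 64 bits
      uint64_t lo = bd + (adbc << 32);
      *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) +
            (adbc >> 32) + (adbc_carry << 32) + (lo < bd);
      return lo;
    }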
@@ -5873,8 +5995,14 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
 really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
   value128 answer;
 #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
-  answer.low = _umul128(value1, value2, &answer.high);
+#ifdef _M_ARM64
+  // ARM64 has native support for 64-bit multiplications, no need to emulate
+  answer.high = __umulh(value1, value2);
+  answer.low = value1 * value2;
 #else
+  answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
+#endif // _M_ARM64
+#else // SIMDJSON_REGULAR_VISUAL_STUDIO
   __uint128_t r = ((__uint128_t)value1) * value2;
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);
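Summarizing the branches, the semantics of full_multiplication can be sketched portably like this (mul_full and u128 are hypothetical names; __umulh and _umul128 are genuine MSVC intrinsics from <intrin.h>; the 32-bit Windows emulation above would slot in as a fourth branch):

    #include <cstdint>
    #ifdef _MSC_VER
    #include <intrin.h>
    #endif

    struct u128 { uint64_t low, high; };

    inline u128 mul_full(uint64_t x, uint64_t y) {
    #if defined(_MSC_VER) && defined(_M_ARM64)
      return { x * y, __umulh(x, y) };      // low bits by plain multiply, high bits natively
    #elif defined(_MSC_VER) && defined(_M_X64)
      u128 r;
      r.low = _umul128(x, y, &r.high);      // x64 provides the full product directly
      return r;
    #else
      __uint128_t r = (__uint128_t)x * y;   // GCC/Clang 128-bit arithmetic
      return { (uint64_t)r, (uint64_t)(r >> 64) };
    #endif
    }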

singleheader/simdjson.h

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 /* begin file include/simdjson.h */
 #ifndef SIMDJSON_H
 #define SIMDJSON_H

src/arm64/bitmanipulation.h

Lines changed: 1 addition & 1 deletion

@@ -45,7 +45,7 @@ really_inline int leading_zeroes(uint64_t input_num) {

 /* result might be undefined when input_num is zero */
 really_inline int count_ones(uint64_t input_num) {
-  return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
+  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }

 really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
