
Commit 3c3a4db

Compile under Visual Studio for ARM64 (simdjson#861)
* Modifications so that we can compile under Visual Studio for ARM64
* Let us throw appveyor at this beast.
1 parent 0e6bd22 commit 3c3a4db

File tree

7 files changed: +294 -27 lines changed

.appveyor.yml

Lines changed: 16 additions & 9 deletions

@@ -7,30 +7,37 @@ platform: x64
 environment:
   matrix:
     - job_name: VS2019
-      CMAKE_ARGS:
+      CMAKE_ARGS: -A %Platform%
     - job_name: VS2019CLANG
-      CMAKE_ARGS: -T ClangCL
+      CMAKE_ARGS: -A %Platform% -T ClangCL
+    - job_name: VS2019ARM
+      CMAKE_ARGS: -A ARM64 -DCMAKE_CROSSCOMPILING=1 -D SIMDJSON_GOOGLE_BENCHMARKS=OFF # Does Google Benchmark build under VS ARM?
     - job_name: VS2017 (Static, No Threads)
       image: Visual Studio 2017
-      CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
+      CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=ON -DSIMDJSON_ENABLE_THREADS=OFF
       CTEST_ARGS: -E checkperf
     - job_name: VS2019 (Win32)
       platform: Win32
-      CMAKE_ARGS: -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
+      CMAKE_ARGS: -A %Platform% -DSIMDJSON_BUILD_STATIC=OFF -DSIMDJSON_ENABLE_THREADS=ON # This should be the default. Testing anyway.
       CTEST_ARGS: -E checkperf

 build_script:
-  - set
   - mkdir build
   - cd build
   - cmake --version
-  - cmake -A %Platform% %CMAKE_ARGS% --parallel ..
+  - cmake %CMAKE_ARGS% --parallel ..
   - cmake -LH ..
   - cmake --build . --config %Configuration% --verbose --parallel

-test_script:
-  - ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
-
+for:
+  -
+    matrix:
+      except:
+        - job_name: VS2019ARM
+
+    test_script:
+      - ctest --output-on-failure -C %Configuration% --verbose %CTEST_ARGS% --parallel
+
 clone_folder: c:\projects\simdjson

 matrix:
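Two things happen in this file. First, the `-A %Platform%` architecture flag moves out of build_script and into each job's CMAKE_ARGS, which lets the new VS2019ARM job substitute `-A ARM64` while the other jobs keep their platform. With the job's variables expanded, the ARM64 configure step amounts to (illustrative):

    cmake -A ARM64 -DCMAKE_CROSSCOMPILING=1 -D SIMDJSON_GOOGLE_BENCHMARKS=OFF --parallel ..

Second, test_script moves under a `for:` override that excludes VS2019ARM: the ARM64 binaries are cross-compiled on an x64 image, so they cannot run on the build host, while every other job still runs ctest as before.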

singleheader/amalgamate_demo.cpp

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-/* auto-generated on Mon 27 Apr 2020 21:20:37 EDT. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */

 #include <iostream>
 #include "simdjson.h"

singleheader/simdjson.cpp

Lines changed: 136 additions & 8 deletions

@@ -1,4 +1,4 @@
-/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 /* begin file src/simdjson.cpp */
 #include "simdjson.h"

@@ -708,7 +708,7 @@ really_inline int leading_zeroes(uint64_t input_num) {

 /* result might be undefined when input_num is zero */
 really_inline int count_ones(uint64_t input_num) {
-  return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
+  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }

 really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
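The change above (mirrored in src/arm64/bitmanipulation.h below) replaces a C-style cast from uint64_t to uint8x8_t with the vcreate_u8 intrinsic. GCC and Clang tolerate the cast as a vector-type conversion; Visual Studio does not, and vcreate_u8 is the portable way to reinterpret 64 bits as a vector of 8 bytes. A standalone sketch of the resulting popcount idiom (my illustration; the function name is hypothetical):

    #include <arm_neon.h>
    #include <cstdint>

    // Count set bits in a 64-bit word using NEON.
    int popcount64(uint64_t x) {
      uint8x8_t bytes  = vcreate_u8(x);   // reinterpret the 64 bits as 8 bytes
      uint8x8_t counts = vcnt_u8(bytes);  // per-byte population count
      return vaddv_u8(counts);            // horizontal sum across the 8 bytes
    }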
@@ -736,11 +736,90 @@ really_inline bool mul_overflow(uint64_t value1, uint64_t value2, uint64_t *resu
 #endif // SIMDJSON_ARM64_BITMANIPULATION_H
 /* end file src/arm64/bitmanipulation.h */
 /* arm64/intrinsics.h already included: #include "arm64/intrinsics.h" */
+#include <type_traits>

 namespace simdjson {
 namespace arm64 {
 namespace simd {

+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+namespace {
+/**
+ * make_uint8x16_t initializes a SIMD register (uint8x16_t).
+ * This is needed because, incredibly, the syntax uint8x16_t x = {1,2,3...}
+ * is not recognized under Visual Studio! This is a workaround.
+ * Using a std::initializer_list<uint8_t> as a parameter resulted in
+ * inefficient code. With the current approach, if the parameters are
+ * compile-time constants, GNU GCC compiles it to ldr, the same as
+ * uint8x16_t x = {1,2,3...}.
+ * You should not use this function except for compile-time constants:
+ * it is not efficient.
+ */
+really_inline uint8x16_t make_uint8x16_t(uint8_t x1,  uint8_t x2,  uint8_t x3,  uint8_t x4,
+                                         uint8_t x5,  uint8_t x6,  uint8_t x7,  uint8_t x8,
+                                         uint8_t x9,  uint8_t x10, uint8_t x11, uint8_t x12,
+                                         uint8_t x13, uint8_t x14, uint8_t x15, uint8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // uint8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                      x9, x10, x11, x12, x13, x14, x15, x16};
+  // return vld1q_u8(array);
+  uint8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_u8(x1, x, 0);
+  x = vsetq_lane_u8(x2, x, 1);
+  x = vsetq_lane_u8(x3, x, 2);
+  x = vsetq_lane_u8(x4, x, 3);
+  x = vsetq_lane_u8(x5, x, 4);
+  x = vsetq_lane_u8(x6, x, 5);
+  x = vsetq_lane_u8(x7, x, 6);
+  x = vsetq_lane_u8(x8, x, 7);
+  x = vsetq_lane_u8(x9, x, 8);
+  x = vsetq_lane_u8(x10, x, 9);
+  x = vsetq_lane_u8(x11, x, 10);
+  x = vsetq_lane_u8(x12, x, 11);
+  x = vsetq_lane_u8(x13, x, 12);
+  x = vsetq_lane_u8(x14, x, 13);
+  x = vsetq_lane_u8(x15, x, 14);
+  x = vsetq_lane_u8(x16, x, 15);
+  return x;
+}
+
+// We have to do the same work for make_int8x16_t
+really_inline int8x16_t make_int8x16_t(int8_t x1,  int8_t x2,  int8_t x3,  int8_t x4,
+                                       int8_t x5,  int8_t x6,  int8_t x7,  int8_t x8,
+                                       int8_t x9,  int8_t x10, int8_t x11, int8_t x12,
+                                       int8_t x13, int8_t x14, int8_t x15, int8_t x16) {
+  // Doing a load like so ends up generating worse code.
+  // int8_t array[16] = {x1, x2, x3, x4, x5, x6, x7, x8,
+  //                     x9, x10, x11, x12, x13, x14, x15, x16};
+  // return vld1q_s8(array);
+  int8x16_t x{};
+  // incredibly, Visual Studio does not allow x[0] = x1
+  x = vsetq_lane_s8(x1, x, 0);
+  x = vsetq_lane_s8(x2, x, 1);
+  x = vsetq_lane_s8(x3, x, 2);
+  x = vsetq_lane_s8(x4, x, 3);
+  x = vsetq_lane_s8(x5, x, 4);
+  x = vsetq_lane_s8(x6, x, 5);
+  x = vsetq_lane_s8(x7, x, 6);
+  x = vsetq_lane_s8(x8, x, 7);
+  x = vsetq_lane_s8(x9, x, 8);
+  x = vsetq_lane_s8(x10, x, 9);
+  x = vsetq_lane_s8(x11, x, 10);
+  x = vsetq_lane_s8(x12, x, 11);
+  x = vsetq_lane_s8(x13, x, 12);
+  x = vsetq_lane_s8(x14, x, 13);
+  x = vsetq_lane_s8(x15, x, 14);
+  x = vsetq_lane_s8(x16, x, 15);
+  return x;
+}
+
+} // namespace
+#endif // SIMDJSON_REGULAR_VISUAL_STUDIO
+
 template<typename T>
 struct simd8;

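A quick usage sketch (hypothetical call site, not part of the commit): with the helper in scope, a constant vector is spelled the same way under every compiler, and when all sixteen arguments are compile-time constants the chain of vsetq_lane_u8 calls folds into a single constant load, as the comment above notes for GCC.

    // Hypothetical example: an identity shuffle table built through the helper.
    uint8x16_t identity_table() {
      return make_uint8x16_t(0, 1, 2,  3,  4,  5,  6,  7,
                             8, 9, 10, 11, 12, 13, 14, 15);
    }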
@@ -792,8 +871,13 @@ namespace simd {
     // We return uint32_t instead of uint16_t because that seems to be more efficient for most
     // purposes (cutting it down to uint16_t costs performance in some compilers).
     really_inline uint32_t to_bitmask() const {
-      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+                                                  0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80);
+#else
+      const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
                                    0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80};
+#endif
       auto minput = *this & bit_mask;
       uint8x16_t tmp = vpaddq_u8(minput, minput);
       tmp = vpaddq_u8(tmp, tmp);
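This bit_mask-and-pairwise-add pattern is NEON's substitute for SSE's _mm_movemask_epi8. A self-contained reconstruction of the idea (a sketch, assuming each input byte is 0x00 or 0xFF as NEON comparisons produce; simdjson's member function is the same computation on *this, and under regular Visual Studio the braced initializer would go through make_uint8x16_t as above):

    #include <arm_neon.h>
    #include <cstdint>

    uint32_t movemask_16bytes(uint8x16_t mask) {
      // Keep one distinct bit per lane: lane i contributes 1 << (i % 8).
      const uint8x16_t bit_mask = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                                   0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
      uint8x16_t minput = vandq_u8(mask, bit_mask);
      // Three rounds of pairwise adds: bytes 0..7 sum into one byte (mask bits 0..7),
      // bytes 8..15 into the next (mask bits 8..15).
      uint8x16_t tmp = vpaddq_u8(minput, minput);
      tmp = vpaddq_u8(tmp, tmp);
      tmp = vpaddq_u8(tmp, tmp);
      return vgetq_lane_u16(vreinterpretq_u16_u8(tmp), 0);
    }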
@@ -818,13 +902,24 @@ namespace simd {
     // Splat constructor
     really_inline simd8(uint8_t _value) : simd8(splat(_value)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
+      uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
+    ) : simd8(make_uint8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15
+    )) {}
+#else
     really_inline simd8(
       uint8_t v0, uint8_t v1, uint8_t v2,  uint8_t v3,  uint8_t v4,  uint8_t v5,  uint8_t v6,  uint8_t v7,
       uint8_t v8, uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15
     ) : simd8(uint8x16_t{
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10, v11, v12, v13, v14, v15
     }) {}
+#endif
+
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<uint8_t> repeat_16(
       uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, uint8_t v6, uint8_t v7,
@@ -898,7 +993,11 @@ namespace simd {
       uint64x2_t shufmask64 = {thintable_epi8[mask1], thintable_epi8[mask2]};
       uint8x16_t shufmask = vreinterpretq_u8_u64(shufmask64);
       // we increment by 0x08 the second half of the mask
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      uint8x16_t inc = make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08);
+#else
       uint8x16_t inc = {0, 0, 0, 0, 0, 0, 0, 0, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08};
+#endif
       shufmask = vaddq_u8(shufmask, inc);
       // this is the version "nearly pruned"
       uint8x16_t pruned = vqtbl1q_u8(*this, shufmask);
@@ -955,13 +1054,23 @@ namespace simd {
     // Array constructor
     really_inline simd8(const int8_t* values) : simd8(load(values)) {}
     // Member-by-member initialization
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+    really_inline simd8(
+      int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
+      int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
+    ) : simd8(make_int8x16_t(
+      v0, v1, v2, v3, v4, v5, v6, v7,
+      v8, v9, v10, v11, v12, v13, v14, v15
+    )) {}
+#else
     really_inline simd8(
       int8_t v0, int8_t v1, int8_t v2,  int8_t v3,  int8_t v4,  int8_t v5,  int8_t v6,  int8_t v7,
       int8_t v8, int8_t v9, int8_t v10, int8_t v11, int8_t v12, int8_t v13, int8_t v14, int8_t v15
     ) : simd8(int8x16_t{
       v0, v1, v2, v3, v4, v5, v6, v7,
       v8, v9, v10, v11, v12, v13, v14, v15
     }) {}
+#endif
     // Repeat 16 values as many times as necessary (usually for lookup tables)
     really_inline static simd8<int8_t> repeat_16(
       int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, int8_t v6, int8_t v7,
@@ -977,8 +1086,14 @@ namespace simd {
     really_inline void store(int8_t dst[16]) const { return vst1q_s8(dst, *this); }

     // Explicit conversion to/from unsigned
+    //
+    // Under Visual Studio/ARM64 uint8x16_t and int8x16_t are apparently the same type.
+    // In theory, we could check this occurrence with std::same_as and std::enable_if but it is C++14
+    // and relatively ugly and hard to read.
+#ifndef SIMDJSON_REGULAR_VISUAL_STUDIO
     really_inline explicit simd8(const uint8x16_t other): simd8(vreinterpretq_s8_u8(other)) {}
-    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(*this); }
+#endif
+    really_inline explicit operator simd8<uint8_t>() const { return vreinterpretq_u8_s8(this->value); }

     // Math
     really_inline simd8<int8_t> operator+(const simd8<int8_t> other) const { return vaddq_s8(*this, other); }
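The guard exists because of the situation the new comment describes: if uint8x16_t and int8x16_t name the same underlying type, as they apparently do under Visual Studio for ARM64, the uint8x16_t constructor would collide with the int8x16_t one that simd8 already has. A compiler-agnostic sketch of that collision, using plain typedefs as stand-ins (not the real MSVC headers):

    // Two typedef names, one underlying type -- as with MSVC's ARM64 vector types.
    typedef int vec_signed;    // stand-in for int8x16_t
    typedef int vec_unsigned;  // stand-in for uint8x16_t

    struct example {
      example(vec_signed) {}
      // example(vec_unsigned) {}  // error: redeclares example(int) with the same signature
    };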
@@ -1092,10 +1207,17 @@ namespace simd {
     }

     really_inline uint64_t to_bitmask() const {
+#ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
+      const uint8x16_t bit_mask = make_uint8x16_t(
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
+        0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
+      );
+#else
       const uint8x16_t bit_mask = {
         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80,
         0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
       };
+#endif
       // Add each of the elements next to each other, successively, to stuff each 8 byte mask into one.
       uint8x16_t sum0 = vpaddq_u8(this->chunks[0] & bit_mask, this->chunks[1] & bit_mask);
       uint8x16_t sum1 = vpaddq_u8(this->chunks[2] & bit_mask, this->chunks[3] & bit_mask);
@@ -5852,12 +5974,12 @@ struct value128 {
 };

 #if defined(SIMDJSON_REGULAR_VISUAL_STUDIO) && \
-    !defined(_M_X64) // _umul128 for x86, arm, arm64
-#if defined(_M_ARM)
+    !defined(_M_X64) && !defined(_M_ARM64) // _umul128 for x86, arm
+// this is a slow emulation routine for 32-bit Windows
+//
 static inline uint64_t __emulu(uint32_t x, uint32_t y) {
   return x * (uint64_t)y;
 }
-#endif
 static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
   uint64_t ad = __emulu((uint32_t)(ab >> 32), (uint32_t)cd);
   uint64_t bd = __emulu((uint32_t)ab, (uint32_t)cd);
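The hunk shows only the first two partial products of the emulated _umul128; for context, a complete schoolbook 64x64-to-128 routine in this style looks as follows (a sketch with renamed helpers, umul128/emulu, so as not to collide with the real MSVC intrinsics; the committed routine follows this same algorithm):

    #include <cstdint>

    static inline uint64_t emulu(uint32_t x, uint32_t y) { return x * (uint64_t)y; }

    static inline uint64_t umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
      // Split ab = a:b and cd = c:d into 32-bit halves; product = ac<<64 + (ad+bc)<<32 + bd.
      uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd);
      uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd);
      uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32));
      uint64_t adbc_carry = (adbc < ad);        // the middle sum can wrap past 64 bits
      uint64_t lo = bd + (adbc << 32);
      *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) +
            (adbc >> 32) + (adbc_carry << 32) + (lo < bd);
      return lo;
    }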
@@ -5873,8 +5995,14 @@ static inline uint64_t _umul128(uint64_t ab, uint64_t cd, uint64_t *hi) {
 really_inline value128 full_multiplication(uint64_t value1, uint64_t value2) {
   value128 answer;
 #ifdef SIMDJSON_REGULAR_VISUAL_STUDIO
-  answer.low = _umul128(value1, value2, &answer.high);
+#ifdef _M_ARM64
+  // ARM64 has native support for 64-bit multiplications, no need to emulate
+  answer.high = __umulh(value1, value2);
+  answer.low = value1 * value2;
 #else
+  answer.low = _umul128(value1, value2, &answer.high); // _umul128 not available on ARM64
+#endif // _M_ARM64
+#else // SIMDJSON_REGULAR_VISUAL_STUDIO
   __uint128_t r = ((__uint128_t)value1) * value2;
   answer.low = uint64_t(r);
   answer.high = uint64_t(r >> 64);
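Summarizing the branches, the semantics of full_multiplication can be sketched portably like this (mul_full and u128 are hypothetical names; __umulh and _umul128 are genuine MSVC intrinsics from <intrin.h>; the 32-bit Windows emulation above would slot in as a fourth branch):

    #include <cstdint>
    #ifdef _MSC_VER
    #include <intrin.h>
    #endif

    struct u128 { uint64_t low, high; };

    inline u128 mul_full(uint64_t x, uint64_t y) {
    #if defined(_MSC_VER) && defined(_M_ARM64)
      return { x * y, __umulh(x, y) };      // low bits by plain multiply, high bits natively
    #elif defined(_MSC_VER) && defined(_M_X64)
      u128 r;
      r.low = _umul128(x, y, &r.high);      // x64 provides the full product directly
      return r;
    #else
      __uint128_t r = (__uint128_t)x * y;   // GCC/Clang 128-bit arithmetic
      return { (uint64_t)r, (uint64_t)(r >> 64) };
    #endif
    }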

singleheader/simdjson.h

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-/* auto-generated on Mon May 4 11:46:14 PDT 2020. Do not edit! */
+/* auto-generated on Tue May 5 20:03:59 EDT 2020. Do not edit! */
 /* begin file include/simdjson.h */
 #ifndef SIMDJSON_H
 #define SIMDJSON_H

src/arm64/bitmanipulation.h

Lines changed: 1 addition & 1 deletion

@@ -45,7 +45,7 @@ really_inline int leading_zeroes(uint64_t input_num) {

 /* result might be undefined when input_num is zero */
 really_inline int count_ones(uint64_t input_num) {
-  return vaddv_u8(vcnt_u8((uint8x8_t)input_num));
+  return vaddv_u8(vcnt_u8(vcreate_u8(input_num)));
 }

 really_inline bool add_overflow(uint64_t value1, uint64_t value2, uint64_t *result) {
