-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[libc] Implement generic SIMD helper 'simd.h' and implement strlen #152605
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-libc Author: Joseph Huber (jhuber6) ChangesSummary: It's based off of #152389 and Full diff: https://github.com/llvm/llvm-project/pull/152605.diff 7 Files Affected:
diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index 2196d9e23bba7..e2722ed352f71 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -313,6 +313,15 @@ add_header_library(
libc.src.string.memory_utils.inline_memset
)
+add_header_library(
+ vector
+ HDRS
+ vector.h
+ DEPENDS
+ libc.hdr.stdint_proxy
+ libc.src.__support.macros.attributes
+)
+
add_header_library(
char_vector
HDRS
diff --git a/libc/src/__support/macros/attributes.h b/libc/src/__support/macros/attributes.h
index c6474673de85a..53912dce427d2 100644
--- a/libc/src/__support/macros/attributes.h
+++ b/libc/src/__support/macros/attributes.h
@@ -48,4 +48,12 @@
#define LIBC_PREFERED_TYPE(TYPE)
#endif
+#if __has_attribute(ext_vector_type) && defined(__clang__) && \
+ __clang_major__ >= 15
+#define LIBC_HAS_VECTOR_TYPE 1
+#define LIBC_VECTOR_TYPE(N) __attribute__((ext_vector_type(N)))
+#else
+#define LIBC_HAS_VECTOR_TYPE 0
+#endif
+
#endif // LLVM_LIBC_SRC___SUPPORT_MACROS_ATTRIBUTES_H
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index fde30eadfd83b..fc6099ca6ccc5 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -59,6 +59,10 @@
#endif // LIBC_TARGET_CPU_HAS_ARM_FPU_DOUBLE
#endif // __ARM_FP
+#if defined(__ARM_NEON)
+#define LIBC_TARGET_CPU_HAS_ARM_NEON
+#endif
+
#if defined(__riscv_flen)
// https://github.com/riscv-non-isa/riscv-c-api-doc/blob/main/src/c-api.adoc
#if defined(__riscv_zfhmin)
diff --git a/libc/src/__support/vector.h b/libc/src/__support/vector.h
new file mode 100644
index 0000000000000..581e727349761
--- /dev/null
+++ b/libc/src/__support/vector.h
@@ -0,0 +1,93 @@
+//===-- Helper functions for SIMD extensions --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_VECTOR_H
+#define LLVM_LIBC_SRC___SUPPORT_VECTOR_H
+
+#include "hdr/stdint_proxy.h"
+#include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/properties/cpu_features.h"
+
+#include <stddef.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+static_assert(LIBC_HAS_VECTOR_TYPE, "Compiler does not support vector types.");
+
+namespace vector {
+
+template <size_t N> struct BitmaskTy;
+template <> struct BitmaskTy<1> {
+ using type = uint8_t;
+};
+template <> struct BitmaskTy<8> {
+ using type = uint8_t;
+};
+template <> struct BitmaskTy<16> {
+ using type = uint16_t;
+};
+template <> struct BitmaskTy<32> {
+ using type = uint32_t;
+};
+template <> struct BitmaskTy<64> {
+ using type = uint64_t;
+};
+
+template <typename T> struct Scalar {
+ static constexpr size_t WIDTH = sizeof(T);
+ static constexpr size_t NUM_ELEMENTS = WIDTH / sizeof(T);
+};
+template <typename T> struct SSE2 {
+ static constexpr size_t WIDTH = 16;
+ static constexpr size_t NUM_ELEMENTS = WIDTH / sizeof(T);
+};
+template <typename T> struct AVX2 {
+ static constexpr size_t WIDTH = 32;
+ static constexpr size_t NUM_ELEMENTS = WIDTH / sizeof(T);
+};
+template <typename T> struct AVX512 {
+ static constexpr size_t WIDTH = 64;
+ static constexpr size_t NUM_ELEMENTS = WIDTH / sizeof(T);
+};
+template <typename T> struct Neon {
+ static constexpr size_t WIDTH = 16;
+ static constexpr size_t NUM_ELEMENTS = WIDTH / sizeof(T);
+};
+
+#if defined(LIBC_TARGET_CPU_HAS_AVX512F)
+template <typename T> using Platform = AVX512<T>;
+#elif defined(LIBC_TARGET_CPU_HAS_AVX2)
+template <typename T> using Platform = AVX2<T>;
+#elif defined(LIBC_TARGET_CPU_HAS_SSE2)
+template <typename T> using Platform = SSE2<T>;
+#elif defined(LIBC_TARGET_CPU_HAS_ARM_NEON)
+template <typename T> using Platform = Neon<T>;
+#else
+template <typename T> using Platform = Scalar<T>;
+#endif
+
+template <typename T, size_t N = Platform<T>::NUM_ELEMENTS>
+using Vector = T LIBC_VECTOR_TYPE(N);
+
+template <typename To, typename From, size_t N>
+LIBC_INLINE Vector<To, N> convert(const Vector<From, N> &v) {
+ return __builtin_convertvector(v, Vector<To, N>);
+}
+
+template <typename T, size_t N>
+LIBC_INLINE typename BitmaskTy<N>::type to_bitmask(const Vector<T, N> &v) {
+ return __builtin_bit_cast(typename BitmaskTy<N>::type,
+ convert<bool, T, N>(v));
+}
+} // namespace vector
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_VECTOR_H
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 809decfbe5f08..cee1a80d4566a 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -20,6 +20,7 @@ add_header_library(
libc.hdr.stdint_proxy
libc.src.__support.CPP.bitset
libc.src.__support.CPP.type_traits
+ libc.src.__support.CPP.vector
libc.src.__support.common
${string_config_options}
)
diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
index 80e5783c7890b..e2549f55972b5 100644
--- a/libc/src/string/string_utils.h
+++ b/libc/src/string/string_utils.h
@@ -22,6 +22,10 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#if LIBC_HAS_VECTOR_TYPE
+#include "src/__support/vector.h"
+#endif
+
namespace LIBC_NAMESPACE_DECL {
namespace internal {
@@ -61,7 +65,7 @@ template <typename Word> LIBC_INLINE constexpr bool has_zeroes(Word block) {
}
template <typename Word>
-LIBC_INLINE size_t string_length_wide_read(const char *src) {
+LIBC_INLINE size_t string_length_wide_read_chars(const char *src) {
const char *char_ptr = src;
// Step 1: read 1 byte at a time to align to block size
for (; reinterpret_cast<uintptr_t>(char_ptr) % sizeof(Word) != 0;
@@ -81,6 +85,40 @@ LIBC_INLINE size_t string_length_wide_read(const char *src) {
return static_cast<size_t>(char_ptr - src);
}
+#if LIBC_HAS_VECTOR_TYPE
+LIBC_INLINE size_t string_length_wide_read_vector(const char *src) {
+ using namespace vector;
+
+ const Vector<char> null('\0');
+
+ // Align the pointer to the native vector width and shift out unused byted.
+ const char *aligned = reinterpret_cast<const char *>(
+ reinterpret_cast<uintptr_t>(src) &
+ ~static_cast<uintptr_t>(sizeof(Vector<char>) - 1));
+ const Vector<char> *char_ptr =
+ reinterpret_cast<const Vector<char> *>(aligned);
+ auto bitmask = to_bitmask(*char_ptr == null);
+ if (decltype(bitmask) shifted = bitmask >> (src - aligned))
+ return cpp::countr_zero(shifted);
+
+ // Continue until we find the null byte.
+ for (;;) {
+ ++char_ptr;
+ if (auto bitmask = to_bitmask(*char_ptr == null))
+ return (reinterpret_cast<const char *>(char_ptr) - src) +
+ cpp::countr_zero(bitmask);
+ }
+}
+#endif
+
+LIBC_INLINE size_t string_length_wide_read(const char *src) {
+#if LIBC_HAS_VECTOR_TYPE
+ return string_length_wide_read_vector(src);
+#else
+ return string_length_wide_read_chars(src);
+#endif
+}
+
// Returns the length of a string, denoted by the first occurrence
// of a null terminator.
template <typename T> LIBC_INLINE size_t string_length(const T *src) {
diff --git a/libc/src/string/strlen.cpp b/libc/src/string/strlen.cpp
index 234edb81d4c8c..3cb58df080db5 100644
--- a/libc/src/string/strlen.cpp
+++ b/libc/src/string/strlen.cpp
@@ -11,6 +11,8 @@
#include "src/__support/macros/null_check.h"
#include "src/string/string_utils.h"
+#include "src/__support/vector.h"
+
#include "src/__support/common.h"
namespace LIBC_NAMESPACE_DECL {
@@ -19,6 +21,7 @@ namespace LIBC_NAMESPACE_DECL {
// There might be potential for compiler optimization.
LLVM_LIBC_FUNCTION(size_t, strlen, (const char *src)) {
LIBC_CRASH_ON_NULLPTR(src);
+
return internal::string_length(src);
}
|
04640be
to
1513156
Compare
What will be the design ideas on RVV/SVE2 support? |
I don't think there's a good generic way to access variable length vectors in clang yet, it's all through really Arm specific types. But, I'm not an expert on using those. |
Also, assembly comparison between the hand-written versions for x64 in the other patch, seems roughly the same https://godbolt.org/z/nbPxoMrcM. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Before this lands, I'd like to get a bazel patch ready. Hopefully done by EoD. Consider this non-blocking if I don't get it done.
I'm probably going to rework this, I think the API should take some inspiration from https://en.cppreference.com/w/cpp/experimental/simd.html without copying it exactly. Likely we'd want this to go in (Missed the comment button and accidentally closed it) |
I updated this to provide a more comprehensive header and interface using some of the fixed I've landed in upstream clang. The generated ASM for AVX512 looks like this, so I think it's roughly equivalent to the 'hand-rolled' versions. This one uses the generic version if they are available. strlen: # @srlen
.cfi_startproc
# %bb.0: # %entry
movq %rdi, %rax
andq $-64, %rax
vmovdqa64 (%rax), %zmm0
subl %eax, %edi
vptestnmb %zmm0, %zmm0, %k0
kmovq %k0, %rcx
shrxq %rdi, %rcx, %rcx
testq %rcx, %rcx
jne .LBB0_4
# %bb.1: # %for.cond.preheader
addq $64, %rax
.p2align 4
.LBB0_2: # %for.cond
# =>This Inner Loop Header: Depth=1
vmovdqa64 (%rax), %zmm0
addq $64, %rax
vptestnmb %zmm0, %zmm0, %k0
kortestq %k0, %k0
je .LBB0_2
# %bb.3: # %cleanup.thread
kmovq %k0, %rcx
.LBB0_4: # %cleanup35
tzcntq %rcx, %rax
vzeroupper
retq There's some stuff I could add, mostly helpers around shuffling and whatnot, but I'll save that for later. |
bc721c1
to
1d7f1ff
Compare
libc/src/__support/CPP/simd.h
Outdated
} | ||
template <typename T> | ||
LIBC_INLINE enable_if_simd_t<T> load_aligned(const void *ptr) { | ||
return *reinterpret_cast<T *>(__builtin_assume_aligned(ptr, alignof(T))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can't you do return load_unaligned(__builtin_assume_aligned(ptr, alignof(T)))
? That would avoid some proably-UB. Also, is there an actual code gen difference?
libc/src/__support/CPP/simd.h
Outdated
template <typename T> | ||
LIBC_INLINE enable_if_simd_t<T> masked_load(simd<bool, simd_size_v<T>> m, | ||
void *ptr) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Personally I'd take a T*
instead. Then the user has to decide whether it's actually safe to reinterpret_cast
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it's less intuitive for the interface to be T = load<T *>(ptr);
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The interface can simply be load(ptr)
assuming ptr
points to the correct type.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, even explicitly specifying the template argument would still just be load<T>(ptr)
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The interface has some SFINAE that makes it an error to pass anything but a vector type, so I think it's safe to do the cast for the user.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can still be wrong about what type you're passing. It's up to you though, I don't have any stake in this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Overall LGTM
#if LIBC_HAS_VECTOR_TYPE | ||
#include "src/string/memory_utils/generic/inline_strlen.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we want the generic vector strlen to be the default or the target specific one? Either way we should probably consider a followup PR which allows configuring this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That'd be my expectation, unless we expect the results to be significantly different.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd expect the target specific versions to be faster, unless there's a reason that they shouldn't be.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://godbolt.org/z/cceGfoYdh has a rough comparison, It's out of date but the core should be the same. the LLVM vectors tend to output quite optimal code. Did you ever get that bazel build updated? I'd like to land this after the CI is done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That link only has the x86 specific versions, I don't see the generic one. Either way, I have realized I should probably just do proper testing on both of them so I'm not going to block this PR on performance.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's in a separate window, check source 1 and 2. @Sterling-Augustine already did a performance comparison before, this version doesn't change the core implementation he used #152389 (comment). TL;DR, it's pretty much the same, very slightly slower for AVX512. I think the only difference between mine and his is that I do the unaligned load unconditionally. I could probably change that in this PR to match his if we think it's faster.
Summary: This PR introduces a new 'simd.h' header that implements an interface similar to the proposed `stdx::simd` in C++. However, we instead wrap around the LLVM internal type. This makes heavy use of the clang vector extensions and boolean vectors, instead using primitive vector types instead of a class (many benefits to this). I use this interface to implement a generic strlen implementation, but propse we use this for math. Right now this requires a feature only introduced in clang-22.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I checked the bazel build, it's currently set to do byte-by-byte so not blocking in theory: https://github.com/llvm/llvm-project/blob/main/utils/bazel/llvm-project-overlay/libc/libc_configure_options.bzl#L42
I did turn that flag on and get the bazel set up to work with your simd header (diff is at the bottom comment) but I ran into a more structural issue. Bazel expects all targets to be able to build on their own. Currently if the simd
header is built without the appropriate clang intrinsics it errors (even without the static assert). My opinion is that bazel's "every header must build on its own" policy is pretty reasonable, so I'd suggest wrapping the contents of simd.h
in #if LIBC_HAS_VECTOR_TYPE
and possibly putting a warning in an #else
block.
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index bfda5385f012..23ef774c1894 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -677,6 +677,18 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_cpp_simd",
+ hdrs = ["src/__support/CPP/simd.h"],
+ deps = [
+ ":__support_cpp_algorithm",
+ ":__support_cpp_bit",
+ ":__support_cpp_type_traits",
+ ":__support_macros_attributes",
+ ":hdr_stdint_proxy",
+ ],
+)
+
libc_support_library(
name = "__support_cpp_span",
hdrs = ["src/__support/CPP/span.h"],
@@ -4938,6 +4950,7 @@ libc_support_library(
"src/string/memory_utils/arm/inline_memset.h",
"src/string/memory_utils/generic/aligned_access.h",
"src/string/memory_utils/generic/byte_per_byte.h",
+ "src/string/memory_utils/generic/inline_strlen.h",
"src/string/memory_utils/inline_bcmp.h",
"src/string/memory_utils/inline_bzero.h",
"src/string/memory_utils/inline_memcmp.h",
@@ -4964,6 +4977,7 @@ libc_support_library(
":__support_cpp_array",
":__support_cpp_bit",
":__support_cpp_cstddef",
+ ":__support_cpp_simd",
":__support_cpp_type_traits",
":__support_macros_attributes",
":__support_macros_optimization",
diff --git a/utils/bazel/llvm-project-overlay/libc/libc_configure_options.bzl b/utils/bazel/llvm-project-overlay/libc/libc_configure_options.bzl
index 179fc83e6729..259d4d292fcf 100644
--- a/utils/bazel/llvm-project-overlay/libc/libc_configure_options.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/libc_configure_options.bzl
@@ -39,7 +39,7 @@ LIBC_CONFIGURE_OPTIONS = [
# "LIBC_COPT_SCANF_DISABLE_FLOAT",
# "LIBC_COPT_SCANF_DISABLE_INDEX_MODE",
"LIBC_COPT_STDIO_USE_SYSTEM_FILE",
- # "LIBC_COPT_STRING_UNSAFE_WIDE_READ",
+ "LIBC_COPT_STRING_UNSAFE_WIDE_READ",
# "LIBC_COPT_STRTOFLOAT_DISABLE_CLINGER_FAST_PATH",
# "LIBC_COPT_STRTOFLOAT_DISABLE_EISEL_LEMIRE",
# "LIBC_COPT_STRTOFLOAT_DISABLE_SIMPLE_DECIMAL_CONVERSION",
#if LIBC_HAS_VECTOR_TYPE | ||
#include "src/string/memory_utils/generic/inline_strlen.h" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That link only has the x86 specific versions, I don't see the generic one. Either way, I have realized I should probably just do proper testing on both of them so I'm not going to block this PR on performance.
Summary:
This PR introduces a new 'simd.h' header that implements an interface
similar to the proposed
stdx::simd
in C++. However, we instead wraparound the LLVM internal type. This makes heavy use of the clang vector
extensions and boolean vectors, instead using primitive vector types
instead of a class (many benefits to this).
I use this interface to implement a generic strlen implementation, but
propse we use this for math. Right now this requires a feature only
introduced in clang-22.