Skip to content

Conversation

cjappl
Copy link
Contributor

@cjappl cjappl commented Aug 11, 2025

fixes #146120

Follows a pattern put forward in tsan:

INTERCEPTOR(int, pthread_cond_signal, void *c) {
void *cond = init_cond(c);
SCOPED_TSAN_INTERCEPTOR(pthread_cond_signal, cond);
MemoryAccessRange(thr, pc, (uptr)c, sizeof(uptr), false);
return REAL(pthread_cond_signal)(cond);
}

INTERCEPTOR(int, pthread_cond_signal, pthread_cond_t *c) {
InitThread();
pthread_cond_t *cond = init_cond(c);
return REAL(pthread_cond_signal)(cond);
}

To properly deal with memory corruption on older versions of pthread_cond variables.

I was never able to repro this problem, but I'm hopeful it's in the right direction considering the crash report. Seeing as it's basically behind a feature flag, I think introducing it to see if it fixes the problem is low risk.

@llvmbot
Copy link
Member

llvmbot commented Aug 11, 2025

@llvm/pr-subscribers-compiler-rt-sanitizer

Author: Chris Apple (cjappl)

Changes

(hopefully) fixes #146120

Follows a pattern put forward in tsan:

INTERCEPTOR(int, pthread_cond_signal, void *c) {
void *cond = init_cond(c);
SCOPED_TSAN_INTERCEPTOR(pthread_cond_signal, cond);
MemoryAccessRange(thr, pc, (uptr)c, sizeof(uptr), false);
return REAL(pthread_cond_signal)(cond);
}

INTERCEPTOR(int, pthread_cond_signal, pthread_cond_t *c) {
InitThread();
pthread_cond_t *cond = init_cond(c);
return REAL(pthread_cond_signal)(cond);
}

To properly deal with memory corruption on older versions of pthread_cond variables.

I was never able to repro this problem, but I'm hopeful it's in the right direction considering the crash report. Seeing as it's basically behind a feature flag, I think introducing it to see if it fixes the problem is low risk.


Full diff: https://github.com/llvm/llvm-project/pull/152947.diff

1 Files Affected:

  • (modified) compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp (+68-8)
diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
index a9d864e9fe926..2f9a80f7ebeda 100644
--- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp
@@ -46,12 +46,45 @@
 
 using namespace __sanitizer;
 
+
+#if defined(__x86_64__) || defined(__mips__) || SANITIZER_PPC64V1 || \
+    defined(__s390x__)
+#define PTHREAD_ABI_BASE  "GLIBC_2.3.2"
+#elif defined(__aarch64__) || SANITIZER_PPC64V2
+#define PTHREAD_ABI_BASE  "GLIBC_2.17"
+#elif SANITIZER_LOONGARCH64
+#define PTHREAD_ABI_BASE  "GLIBC_2.36"
+#elif SANITIZER_RISCV64
+#  define PTHREAD_ABI_BASE "GLIBC_2.27"
+#endif
+
+DECLARE_REAL_AND_INTERCEPTOR(void *, malloc, usize size)
+DECLARE_REAL_AND_INTERCEPTOR(void, free, void *ptr)
+
 namespace {
 struct DlsymAlloc : public DlSymAllocator<DlsymAlloc> {
   static bool UseImpl() { return !__rtsan_is_initialized(); }
 };
 } // namespace
 
+// See note in tsan or ddsan as to why this is necessary
+static pthread_cond_t *init_cond(pthread_cond_t *c, bool force = false) {
+  if (!common_flags()->legacy_pthread_cond)
+    return c;
+
+  atomic_uintptr_t *p = (atomic_uintptr_t *)c;
+  uptr cond = atomic_load(p, memory_order_acquire);
+  if (!force && cond != 0)
+    return (pthread_cond_t *)cond;
+  void *newcond = WRAP(malloc)(sizeof(pthread_cond_t));
+  internal_memset(newcond, 0, sizeof(pthread_cond_t));
+  if (atomic_compare_exchange_strong(p, &cond, (uptr)newcond,
+                                     memory_order_acq_rel))
+    return (pthread_cond_t *)newcond;
+  WRAP(free)(newcond);
+  return (pthread_cond_t *)cond;
+}
+
 // Filesystem
 
 INTERCEPTOR(int, open, const char *path, int oflag, ...) {
@@ -766,26 +799,49 @@ INTERCEPTOR(int, pthread_join, pthread_t thread, void **value_ptr) {
   return REAL(pthread_join)(thread, value_ptr);
 }
 
+INTERCEPTOR(int, pthread_cond_init, pthread_cond_t *cond,
+            const pthread_condattr_t *a) {
+  __rtsan_notify_intercepted_call("pthread_cond_init");
+  pthread_cond_t *c = init_cond(cond, true);
+  return REAL(pthread_cond_init)(c, a);
+}
+
 INTERCEPTOR(int, pthread_cond_signal, pthread_cond_t *cond) {
   __rtsan_notify_intercepted_call("pthread_cond_signal");
-  return REAL(pthread_cond_signal)(cond);
+  pthread_cond_t *c = init_cond(cond);
+  return REAL(pthread_cond_signal)(c);
 }
 
 INTERCEPTOR(int, pthread_cond_broadcast, pthread_cond_t *cond) {
   __rtsan_notify_intercepted_call("pthread_cond_broadcast");
-  return REAL(pthread_cond_broadcast)(cond);
+  pthread_cond_t *c = init_cond(cond);
+  return REAL(pthread_cond_broadcast)(c);
 }
 
 INTERCEPTOR(int, pthread_cond_wait, pthread_cond_t *cond,
             pthread_mutex_t *mutex) {
   __rtsan_notify_intercepted_call("pthread_cond_wait");
-  return REAL(pthread_cond_wait)(cond, mutex);
+  pthread_cond_t *c = init_cond(cond);
+  return REAL(pthread_cond_wait)(c, mutex);
 }
 
 INTERCEPTOR(int, pthread_cond_timedwait, pthread_cond_t *cond,
             pthread_mutex_t *mutex, const timespec *ts) {
   __rtsan_notify_intercepted_call("pthread_cond_timedwait");
-  return REAL(pthread_cond_timedwait)(cond, mutex, ts);
+  pthread_cond_t *c = init_cond(cond);
+  return REAL(pthread_cond_timedwait)(c, mutex, ts);
+}
+
+INTERCEPTOR(int, pthread_cond_destroy, pthread_cond_t *cond) {
+  __rtsan_notify_intercepted_call("pthread_cond_destroy");
+  pthread_cond_t *c = init_cond(cond);
+  int res = REAL(pthread_cond_destroy)(c);
+  if (common_flags()->legacy_pthread_cond) {
+    // Free our aux cond and zero the pointer to not leave dangling pointers.
+    WRAP(free)(c);
+    atomic_store((atomic_uintptr_t *)c, 0, memory_order_relaxed);
+  }
+  return res;
 }
 
 INTERCEPTOR(int, pthread_rwlock_rdlock, pthread_rwlock_t *lock) {
@@ -1641,10 +1697,14 @@ void __rtsan::InitializeInterceptors() {
   INTERCEPT_FUNCTION(pthread_mutex_lock);
   INTERCEPT_FUNCTION(pthread_mutex_unlock);
   INTERCEPT_FUNCTION(pthread_join);
-  INTERCEPT_FUNCTION(pthread_cond_signal);
-  INTERCEPT_FUNCTION(pthread_cond_broadcast);
-  INTERCEPT_FUNCTION(pthread_cond_wait);
-  INTERCEPT_FUNCTION(pthread_cond_timedwait);
+
+  INTERCEPT_FUNCTION_VER(pthread_cond_init, PTHREAD_ABI_BASE);
+  INTERCEPT_FUNCTION_VER(pthread_cond_signal, PTHREAD_ABI_BASE);
+  INTERCEPT_FUNCTION_VER(pthread_cond_broadcast, PTHREAD_ABI_BASE);
+  INTERCEPT_FUNCTION_VER(pthread_cond_wait, PTHREAD_ABI_BASE);
+  INTERCEPT_FUNCTION_VER(pthread_cond_timedwait, PTHREAD_ABI_BASE);
+  INTERCEPT_FUNCTION_VER(pthread_cond_destroy, PTHREAD_ABI_BASE);
+
   INTERCEPT_FUNCTION(pthread_rwlock_rdlock);
   INTERCEPT_FUNCTION(pthread_rwlock_unlock);
   INTERCEPT_FUNCTION(pthread_rwlock_wrlock);

Copy link

github-actions bot commented Aug 11, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@cjappl cjappl force-pushed the cjappl/new_pthread_cond_init branch from 5f9843f to c9708f8 Compare August 11, 2025 14:46
Copy link
Member

@MaskRay MaskRay left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is similar to compiler-rt/lib/tsan/rtl/tsan_interceptors_posix.cpp. Looks good if @adrew0809 can confirm that this fixes the issue.

@adrew0809
Copy link

Thanks all. I will try to confirm fix soon.

@cjappl
Copy link
Contributor Author

cjappl commented Aug 15, 2025

I just hopped off a call with @adrew0809 and we confirmed this fixes the issue!

The surprising part is his code no longer crashes when the flag is omitted, or when it is it's default value (false)

./a.out # no crash
RTSAN_OPTIONS='legacy_pthread_cond=false' ./a.out # no crash
RTSAN_OPTIONS='legacy_pthread_cond=true' ./a.out # CRASH

I'm trying to reason about why this might be. Maybe it has something to do with the PTHREAD_ABI_BASE? I'm not entirely sure.

Considering this fixes the issue, I'm inclined to merge. Any objections @fmayer or @davidtrevelyan ?

Copy link
Contributor

@davidtrevelyan davidtrevelyan left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No strong objections to merging this from me. If it fixes the issue on @adrew0809's machine and the CI is happy then I think it's low-enough risk to go for it. I do think we should consider whether it would be feasible to get this issue and its solution under meaningful test - if it's not too hard I think it would be wise to do it soon.

INTERCEPT_FUNCTION(pthread_cond_wait);
INTERCEPT_FUNCTION(pthread_cond_timedwait);

INTERCEPT_FUNCTION_VER(pthread_cond_init, PTHREAD_ABI_BASE);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because pthread_cond_init and pthread_cond_destroy are new, I think we would benefit from unit tests for these interceptors. Sorry I just noticed this now. What do you think? Add them now or post-merge?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good idea, added

@cjappl
Copy link
Contributor Author

cjappl commented Aug 18, 2025

I do think we should consider whether it would be feasible to get this issue and its solution under meaningful test - if it's not too hard I think it would be wise to do it soon.

Copying from our private chain, for future posterity:

The problem with a bigger functional test is that some systems will need this flag, and others will not. Systems that need the new flag will crash when it isn't applied. Systems that don't need the new flag will crash when it is applied

This means in an ideal world, we would figure out the exact specs of a system that needs the flag, determine it in cmake land, and apply or not apply the test

All that's to say...complicated

Tsan does not have a test for this flag, I suspect because of this complexity

namespace {
struct DlsymAlloc : public DlSymAllocator<DlsymAlloc> {
static bool UseImpl() { return !__rtsan_is_initialized(); }
};
} // namespace

// See note in tsan as to why this is necessary
static pthread_cond_t *init_cond(pthread_cond_t *c, bool force = false) {
if (!common_flags()->legacy_pthread_cond)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The option is for DSO built against glibc 2.2.5 (released in 2002). For rtsan we can reasonably assume newer shared objects and don't allow this at all.

So, just delete this code path?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I delete this codepath, I get a crash on almost all tests on my mac, with this stack trace:

* thread #1, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x3cb0b1bb)
  * frame #0: 0x000000018d619e4c libsystem_pthread.dylib`pthread_cond_broadcast + 48
    frame #1: 0x000000018d54ae70 libc++.1.dylib`std::__1::__call_once(unsigned long volatile&, void*, void (*)(void*)) + 224
    frame #2: 0x000000018d582914 libc++.1.dylib`std::__1::locale::id::__get() + 104
    frame #3: 0x000000018d57fb94 libc++.1.dylib`std::__1::locale::__imp::__imp(unsigned long) + 148
    frame #4: 0x000000018d580c10 libc++.1.dylib`std::__1::locale::classic() + 84
    frame #5: 0x000000018d582140 libc++.1.dylib`std::__1::locale::__global() + 72
    frame #6: 0x000000018d5821a0 libc++.1.dylib`std::__1::locale::locale() + 24
    frame #7: 0x000000018d55875c libc++.1.dylib`std::__1::basic_streambuf<char, std::__1::char_traits<char>>::basic_streambuf() + 52
    frame #8: 0x000000018d56aac8 libc++.1.dylib`std::__1::__stdinbuf<char>::__stdinbuf(__sFILE*, __mbstate_t*) + 40
    frame #9: 0x000000018d56a45c libc++.1.dylib`std::__1::DoIOSInit::DoIOSInit() + 68
    frame #10: 0x000000018d56aa50 libc++.1.dylib`std::__1::ios_base::Init::Init() + 80
    frame #11: 0x000000018d56be70 libc++.1.dylib`_GLOBAL__I_000100 + 32
    frame #12: 0x000000018d2b105c dyld`invocation function for block in dyld4::Loader::findAndRunAllInitializers(dyld4::RuntimeState&) const::$_0::operator()() const + 168
    frame #13: 0x000000018d2ef308 dyld`invocation function for block in dyld3::MachOAnalyzer::forEachInitializer(Diagnostics&, dyld3::MachOAnalyzer::VMAddrConverter const&, void (unsigned int) block_pointer, void const*) const + 340
    frame #14: 0x000000018d2e299c dyld`invocation function for block in dyld3::MachOFile::forEachSection(void (dyld3::MachOFile::SectionInfo const&, bool, bool&) block_pointer) const + 496
    frame #15: 0x000000018d2922fc dyld`dyld3::MachOFile::forEachLoadCommand(Diagnostics&, void (load_command const*, bool&) block_pointer) const + 300
    frame #16: 0x000000018d2e1930 dyld`dyld3::MachOFile::forEachSection(void (dyld3::MachOFile::SectionInfo const&, bool, bool&) block_pointer) const + 192
    frame #17: 0x000000018d2eee1c dyld`dyld3::MachOAnalyzer::forEachInitializer(Diagnostics&, dyld3::MachOAnalyzer::VMAddrConverter const&, void (unsigned int) block_pointer, void const*) const + 516
    frame #18: 0x000000018d2ad070 dyld`dyld4::Loader::findAndRunAllInitializers(dyld4::RuntimeState&) const + 524
    frame #19: 0x000000018d2b6d28 dyld`dyld4::PrebuiltLoader::runInitializers(dyld4::RuntimeState&) const + 44
    frame #20: 0x000000018d2ad45c dyld`dyld4::Loader::runInitializersBottomUp(dyld4::RuntimeState&, dyld3::Array<dyld4::Loader const*>&) const + 220
    frame #21: 0x000000018d2ad400 dyld`dyld4::Loader::runInitializersBottomUp(dyld4::RuntimeState&, dyld3::Array<dyld4::Loader const*>&) const + 128
    frame #22: 0x000000018d2b10ec dyld`dyld4::Loader::runInitializersBottomUpPlusUpwardLinks(dyld4::RuntimeState&) const::$_1::operator()() const + 116
    frame #23: 0x000000018d2ad628 dyld`dyld4::Loader::runInitializersBottomUpPlusUpwardLinks(dyld4::RuntimeState&) const + 380
    frame #24: 0x000000018d2d04d8 dyld`dyld4::APIs::runAllInitializersForMain() + 464
    frame #25: 0x000000018d296f7c dyld`dyld4::prepare(dyld4::APIs&, dyld3::MachOAnalyzer const*) + 3156
    frame #26: 0x000000018d295edc dyld`start + 1844

Specifically, the pthread_cond_broadcast interceptor gets called, but seemingly our pthread_cond_init function is never called. This means:

INTERCEPTOR(int, pthread_cond_broadcast, pthread_cond_t *cond) {
  __rtsan_notify_intercepted_call("pthread_cond_broadcast");


  // This function returns a cond not initialized by us, 
  pthread_cond_t *c = init_cond(cond);

  // this call crashes, as it operates on a pthread_cond that wasn't initialized properly
  return REAL(pthread_cond_broadcast)(c);
}

Any thoughts on getting around this? I've tried a few things, but nothing seems to work.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@MaskRay - any ideas here? I'm inclined to just leave it, as it seems like MacOS may use some kind of shared condition variable to access locale. (but I admit I don't fully understand it).

I've updated the interception pattern to match your work in #154268

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should assume common_flags()->legacy_pthread_cond is always false for rtsan and delete the unreachable code path. Can you double check macOS really crashes?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I misunderstood your original comment. This does not crash mac.

See: #155181

@MaskRay
Copy link
Member

MaskRay commented Aug 19, 2025

I have created #154268 to simplify the tsan pthread_cond_t interceptors.

https://maskray.me/blog/2023-01-08-all-about-sanitizer-interceptors

Before glibc 2.36, dlsym(RTLD_NEXT, name) returned the address of the oldest version definition of name in libc.so.6. And we got to use the old semantics. This is usually benign but not ideal (see google/sanitizers#1371 for a regexec issue). I fixed glibc 2.36 (BZ #14932) so that dlsym(RTLD_NEXT, name) returns the default version definition now.

https://maskray.me/blog/2022-05-29-glibc

Take x86_64 as an example. sysdeps/unix/sysv/linux/x86_64/64/shlib-versions says that x86_64's earliest symbol is at GLIBC_2.2.5. $build/abi-versions.h contains:

@cjappl
Copy link
Contributor Author

cjappl commented Aug 19, 2025

I have created #154268 to simplify the tsan pthread_cond_t interceptors.

https://maskray.me/blog/2023-01-08-all-about-sanitizer-interceptors

Before glibc 2.36, dlsym(RTLD_NEXT, name) returned the address of the oldest version definition of name in libc.so.6. And we got to use the old semantics. This is usually benign but not ideal (see google/sanitizers#1371 for a regexec issue). I fixed glibc 2.36 (BZ #14932) so that dlsym(RTLD_NEXT, name) returns the default version definition now.

https://maskray.me/blog/2022-05-29-glibc

Take x86_64 as an example. sysdeps/unix/sysv/linux/x86_64/64/shlib-versions says that x86_64's earliest symbol is at GLIBC_2.2.5. $build/abi-versions.h contains:

Thanks @MaskRay ! After that's merged, I'll update to follow that pattern.

@cjappl cjappl force-pushed the cjappl/new_pthread_cond_init branch from 01fee83 to 73d3968 Compare August 21, 2025 15:03
@cjappl cjappl force-pushed the cjappl/new_pthread_cond_init branch from 73d3968 to 6d33293 Compare August 21, 2025 15:05
@cjappl cjappl merged commit aa4bc2e into llvm:main Aug 23, 2025
9 checks passed
@cjappl
Copy link
Contributor Author

cjappl commented Aug 23, 2025

After I get confirmation that the tests are stable on main, I will cherry pick this to the LLVM 21 release branch

CC @davidtrevelyan

@MaskRay
Copy link
Member

MaskRay commented Aug 24, 2025

legacy_pthread_cond is a legacy - we should remove it.

cjappl added a commit to cjappl/llvm-project that referenced this pull request Aug 29, 2025
cjappl added a commit that referenced this pull request Aug 29, 2025
…5963)

This reverts commit aa4bc2e.

As discussed on #155181 , this introduced some unneeded code. Reverting
and applying the smaller version of the fix
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

Real-time Sanitizer causes pthread_cond_signal to segfault when a thread is doing a timed wait on a condition variable
6 participants