strager.net

C++ TLS on macOS

Environment: clang version 7.0.0; -target x86_64-apple-macosx10.11.0

Function-scope thread_local mechanics

Trivial types

trivial.cpp
// Empty type with no user-declared constructor or destructor; it is the
// subject of the "Trivial types" thread_local example.
struct trivial {};

// Returns the address of the function-scope thread_local `some_trivial`.
// The -O2 listing for this function contains no guard-variable test, in
// contrast to the listings for get_ctor() and get_dtor().
auto get_trivial() noexcept -> trivial * {
  thread_local trivial some_trivial;
  return &some_trivial;
}
clang++ -O2 -S trivial.cpp
## Compiler output for get_trivial(): one TLVP load and one indirect call,
## with no guard-variable check.
get_trivial():
        pushq   %rbp
        movq    %rsp, %rbp
        ## Load the TLVP entry for some_trivial into %rdi, then call through the
        ## pointer stored at (%rdi).
        movq    get_trivial()::some_trivial@TLVP(%rip), %rdi
        callq   *(%rdi)
        ## %rax is left exactly as that call set it and becomes the return value.
        ## NOTE(review): presumably the callee yields this thread's address of
        ## some_trivial in %rax -- confirm against the macOS TLV ABI.
        popq    %rbp
        retq

Types with constructors

ctor.cpp
// Declaration only: no definition of keep() appears in this snippet. It is
// called from ctor's constructor with the object's address.
// NOTE(review): presumably it exists to give the constructor a side effect
// the optimizer cannot discard -- confirm.
auto keep(void *) noexcept -> void;

// Type with a user-provided default constructor and no user-declared
// destructor; it is the subject of the "Types with constructors" example.
struct ctor {
  // Passes the address of the object under construction to keep().
  explicit ctor() noexcept {
    keep(this);
  }
};

// Returns the address of the function-scope thread_local `some_ctor`.
// In the -O2 listing for this function, the constructor's keep() call sits
// behind a test of a byte labelled "guard variable for get_ctor()::some_ctor";
// that byte is set to 1 once the call has been made.
auto get_ctor() noexcept -> ctor * {
  thread_local ctor some_ctor;
  return &some_ctor;
}
clang++ -O2 -S ctor.cpp
## Compiler output for get_ctor(): construction of some_ctor is gated on a
## guard byte that is itself reached through a TLVP lookup.
get_ctor():
        pushq   %rbp
        movq    %rsp, %rbp
        ## Look up the guard variable and branch to .initialize if its byte is 0.
        movq    guard variable for get_ctor()::some_ctor@TLVP(%rip), %rdi
        callq   *(%rdi)
        cmpb    $0, (%rax)
        je      .initialize
.done_initializing:
        ## Look up some_ctor; %rax from this call is what the function returns.
        movq    get_ctor()::some_ctor@TLVP(%rip), %rdi
        callq   *(%rdi)
        popq    %rbp
        retq
.initialize:
        ## Look up some_ctor and pass the result as the argument to keep(void*),
        ## which is all the constructor body in ctor.cpp does.
        movq    get_ctor()::some_ctor@TLVP(%rip), %rdi
        callq   *(%rdi)
        movq    %rax, %rdi
        callq   keep(void*)
        ## Look up the guard variable a second time and store 1 into it.
        movq    guard variable for get_ctor()::some_ctor@TLVP(%rip), %rdi
        callq   *(%rdi)
        movb    $1, (%rax)
        ## Rejoin the common path, which looks up some_ctor once more.
        jmp     .done_initializing
ctor_manual.cpp
#include <new>
#include <type_traits>

// Hand-written counterpart of get_ctor(): a thread_local flag plus
// thread_local raw storage stand in for the compiler's guard variable and the
// thread_local object itself. Returns the address of the storage viewed as a
// `ctor`, constructing the object there the first time the flag is found
// unset.
auto get_ctor_manual() noexcept -> ctor * {
  using ctor_slot = std::aligned_storage_t<sizeof(ctor), alignof(ctor)>;

  // Raw bytes sized and aligned for one `ctor`, and the flag recording
  // whether an object has been constructed in them yet.
  thread_local ctor_slot raw_storage;
  thread_local bool is_constructed{false};

  auto *object = reinterpret_cast<ctor *>(&raw_storage);

  // Branch hint: the flag is expected to be set already.
  if (__builtin_expect(is_constructed, true)) {
    return object;
  }

  // The flag is unset: build the object in place, then record that fact.
  new (object) ctor{};
  is_constructed = true;
  return object;
}

Types with destructors

dtor.cpp
// Declaration only: no definition of keep() appears in this snippet. It is
// called from dtor's destructor with the object's address.
// NOTE(review): presumably it exists to give the destructor a side effect
// the optimizer cannot discard -- confirm.
auto keep(void *) noexcept -> void;

// Type with a user-provided destructor and no user-declared constructor; it
// is the subject of the "Types with destructors" example.
struct dtor {
  // Passes the address of the object being destroyed to keep().
  ~dtor() {
    keep(this);
  }
};

// Returns the address of the function-scope thread_local `some_dtor`.
// In the -O2 listing for this function, a guard byte gates a one-time call to
// __tlv_atexit that is given dtor::~dtor(), the object's address and
// ___dso_handle; the guard byte is then set to 1.
auto get_dtor() noexcept -> dtor * {
  thread_local dtor some_dtor;
  return &some_dtor;
}
clang++ -O2 -S dtor.cpp
## Compiler output for get_dtor(): the guard byte gates a one-time call to
## __tlv_atexit rather than any constructor code.
get_dtor():
        pushq   %rbp
        movq    %rsp, %rbp
        ## Look up the guard variable and branch to .initialize if its byte is 0.
        movq    guard variable for get_dtor()::some_dtor@TLVP(%rip), %rdi
        callq   *(%rdi)
        cmpb    $0, (%rax)
        je      .initialize
.done_initializing:
        ## Look up some_dtor; %rax from this call is what the function returns.
        movq    get_dtor()::some_dtor@TLVP(%rip), %rdi
        callq   *(%rdi)
        popq    %rbp
        retq
.initialize:
        ## Look up some_dtor, then call __tlv_atexit with
        ##   %rdi = dtor::~dtor() (loaded via GOTPCREL),
        ##   %rsi = the result of the some_dtor lookup,
        ##   %rdx = the address of ___dso_handle.
        ## NOTE(review): presumably this registers the destructor to run at
        ## thread exit -- confirm against the dyld documentation/source.
        movq    get_dtor()::some_dtor@TLVP(%rip), %rdi
        callq   *(%rdi)
        movq    dtor::~dtor()@GOTPCREL(%rip), %rdi
        leaq    ___dso_handle(%rip), %rdx
        movq    %rax, %rsi
        callq   __tlv_atexit
        ## Look up the guard variable a second time and store 1 into it.
        movq    guard variable for get_dtor()::some_dtor@TLVP(%rip), %rdi
        callq   *(%rdi)
        movb    $1, (%rax)
        ## Rejoin the common path, which looks up some_dtor once more.
        jmp     .done_initializing
dtor_manual.cpp
#include <cstdint>
#include <new>
#include <type_traits>

// Hand-written counterpart of get_dtor(): a thread_local flag plus
// thread_local raw storage stand in for the compiler's guard variable and the
// thread_local object itself. Returns the address of the storage viewed as a
// `dtor`. The first time the flag is found unset, the object is constructed
// in place and a callback that runs its destructor is handed to _tlv_atexit.
//
// _tlv_atexit and __dso_handle are not declared in this snippet. The argument
// order (callback, object address, &__dso_handle) matches the registers the
// compiler-generated listing for get_dtor() sets up before its __tlv_atexit
// call, where both symbols appear with one extra leading underscore.
auto get_dtor_manual() noexcept -> dtor * {
  using dtor_slot = std::aligned_storage_t<sizeof(dtor), alignof(dtor)>;

  // Raw bytes sized and aligned for one `dtor`, and the flag recording
  // whether an object has been constructed in them yet.
  thread_local dtor_slot raw_storage;
  thread_local bool is_constructed{false};

  auto *object = reinterpret_cast<dtor *>(&raw_storage);

  // Branch hint: the flag is expected to be set already.
  if (__builtin_expect(is_constructed, true)) {
    return object;
  }

  // The flag is unset: build the object, hand over its destructor callback,
  // and only then record that the object exists.
  new (object) dtor{};
  _tlv_atexit([](void *object_address) noexcept {
    static_cast<dtor *>(object_address)->~dtor();
  }, object, &__dso_handle);
  is_constructed = true;
  return object;
}

Bugs

Clang generates poor code for C++ thread_local variables on macOS, as the following example shows.

tls_example.cpp
#include <string>

namespace {
// Returns a reference to the function-scope thread_local std::string `s`.
// Marked noinline; the -O2 listing shows last_data_char() making an
// out-of-line call to this function.
__attribute__((noinline))
std::string &get_data() noexcept {
  thread_local std::string s;
  return s;
}
}

// Appends `c` to the string returned by get_data().
// Unlike the other functions in this snippet, it is not declared noexcept.
auto append_data(char c) -> void {
  get_data().push_back(c);
}

// Returns the last character of the string returned by get_data(), or '\0'
// when that string is empty.
auto last_data_char() noexcept -> char {
  std::string &s{get_data()};
  if (s.empty()) {
    // Nothing stored yet: report a NUL character instead of indexing.
    return '\0';
  }
  return s[s.size() - 1];
}
clang++ -O2 -S tls_example.cpp
## Compiler output for get_data().
## NOTE(review): the `## [n]:` markers look like reference labels for
## commentary located elsewhere in the article -- confirm.
(anonymous namespace)::get_data():
        pushq   %rbp
        movq    %rsp, %rbp
## [4]:
        ## Look up the guard variable for s; if its byte is 0, go and initialize.
        movq    guard variable for (anonymous namespace)::get_data()::s@TLVP(%rip), %rdi
        callq   *(%rdi)
        cmpb    $0, (%rax)
        je      .initialize_s
        ## Already-initialized path: returns with %rax still holding the result
        ## of the guard lookup; no lookup of s itself happens on this path.
        popq    %rbp
        retq
.initialize_s:
## [5]:
        ## Look up s and store zero into the 24 bytes at offsets 16, 8 and 0.
        ## NOTE(review): presumably the inlined std::string default constructor.
        movq    (anonymous namespace)::get_data()::s@TLVP(%rip), %rdi
        callq   *(%rdi)
        movq    $0, 16(%rax)
        movq    $0, 8(%rax)
        movq    $0, (%rax)
        ## Call __tlv_atexit with %rdi = ~basic_string() (via GOTPCREL),
        ## %rsi = the result of the s lookup, %rdx = the address of ___dso_handle.
        movq    std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >::~basic_string()@GOTPCREL(%rip), %rdi
        leaq    ___dso_handle(%rip), %rdx
        movq    %rax, %rsi
        callq   __tlv_atexit
## [6]:
        ## Look up the guard variable a second time and store 1 into it; as on
        ## the other path, %rax at return is the result of a guard lookup.
        movq    guard variable for (anonymous namespace)::get_data()::s@TLVP(%rip), %rdi
        callq   *(%rdi)
        movb    $1, (%rax)
        popq    %rbp
        retq

## Compiler output for last_data_char().
## NOTE(review): the `## [n]:` markers look like reference labels for
## commentary located elsewhere in the article -- confirm.
## NOTE(review): the byte/offset tests below presumably follow libc++'s
## short/long std::string representation -- confirm against libc++.
last_data_char():
        pushq   %rbp
        movq    %rsp, %rbp
## [3]:
        ## Call get_data(); the %rax it leaves is overwritten by the next lookup
        ## without being read.
        callq   (anonymous namespace)::get_data()
## [1]:
        ## Look up s directly, load its first byte zero-extended into %edx and
        ## test bit 0 of that byte.
        movq    (anonymous namespace)::get_data()::s@TLVP(%rip), %rdi
        callq   *(%rdi)
        movzbl  (%rax), %edx
        testb   $1, %dl
        je      LBB2_1
        ## Bit 0 set: take the length from offset 8; zero length returns 0.
        movq    8(%rax), %rcx
        testq   %rcx, %rcx
        je      LBB2_4
LBB2_5:
## [2]:
        ## Non-empty: look up s a second time, then re-test bit 0 of %dl, which
        ## was loaded before this indirect call.
        movq    (anonymous namespace)::get_data()::s@TLVP(%rip), %rdi
        callq   *(%rdi)
        testb   $1, %dl
        je      LBB2_6
        ## Bit 0 set: the character data pointer is read from offset 16.
        movq    16(%rax), %rax
        jmp     LBB2_8
LBB2_1:
        ## Bit 0 clear: the length is the first byte shifted right by one.
        movq    %rdx, %rcx
        shrq    %rcx
        testq   %rcx, %rcx
        jne     LBB2_5
LBB2_4:
        ## Empty string: the result is 0, i.e. '\0'.
        xorl    %eax, %eax
        jmp     LBB2_9
LBB2_6:
        ## Bit 0 clear: the character data starts one byte past the lookup result.
        incq    %rax
LBB2_8:
        ## Load the byte at data + length - 1, i.e. the last character.
        movb    -1(%rax,%rcx), %al
LBB2_9:
        ## Sign-extend the char result into %eax and return.
        movsbl  %al, %eax
        popq    %rbp
        retq