C++ TLS on macOS
Environment: clang version 7.0.0; -target
x86_64-apple-macosx10.11.0
Function-scope thread_local
mechanics
Trivial types
trivial.cpp
// Empty type with no user-provided constructor or destructor.
struct trivial {};
// Returns the address of a function-scope thread_local `trivial` object.
auto get_trivial () noexcept -> trivial * {
// thread_local at function scope: each thread gets its own some_trivial.
thread_local trivial some_trivial ;
return & some_trivial ;
}
clang++ -O2 -S trivial.cpp
## Compiler output for get_trivial(). No guard variable is referenced here.
get_trivial ():
pushq %rbp
movq %rsp , %rbp
## Load the TLVP entry for some_trivial into %rdi, then call through the
## pointer stored at (%rdi).
movq get_trivial ():: some_trivial@TLVP ( %rip ), %rdi
callq *( %rdi )
## %rax is not modified between the call and the return (presumably the call
## leaves the address of some_trivial there -- confirm against the TLV ABI).
popq %rbp
retq
Types with constructors
ctor.cpp
auto keep ( void * ) noexcept -> void ;
// Type with a user-provided default constructor whose only work is to pass
// `this` to keep(), which is declared but not defined in this snippet.
struct ctor {
// Marked both explicit and noexcept.
explicit ctor () noexcept {
keep ( this );
}
};
// Returns the address of a function-scope thread_local `ctor` object.
auto get_ctor () noexcept -> ctor * {
// ctor has a user-provided constructor, so this variable needs dynamic
// initialization the first time control passes through the declaration.
thread_local ctor some_ctor ;
return & some_ctor ;
}
clang++ -O2 -S ctor.cpp
## Compiler output for get_ctor(). A guard byte is tested on every call.
get_ctor ():
pushq %rbp
movq %rsp , %rbp
## Look up the guard variable through its own TLVP entry.
movq guard variable for get_ctor ():: some_ctor@TLVP ( %rip ), %rdi
callq *( %rdi )
## A zero guard byte sends control to the initialization path.
cmpb $0 , ( %rax )
je .initialize
.done_initializing:
## Look up some_ctor itself; %rax is untouched between this call and retq.
movq get_ctor ():: some_ctor@TLVP ( %rip ), %rdi
callq *( %rdi )
popq %rbp
retq
.initialize:
## Look up some_ctor and pass the result as the argument of keep(void*),
## which is the body of ctor's constructor.
movq get_ctor ():: some_ctor@TLVP ( %rip ), %rdi
callq *( %rdi )
movq %rax , %rdi
callq keep ( void *)
## Look up the guard variable a second time and store 1 into it.
movq guard variable for get_ctor ():: some_ctor@TLVP ( %rip ), %rdi
callq *( %rdi )
movb $1 , ( %rax )
jmp .done_initializing
ctor_manual.cpp
#include <new>
#include <type_traits>
// Hand-written lazy initialization of a per-thread `ctor`: a thread_local
// flag plus a thread_local raw buffer, with the object built by placement new
// on the first call made by each thread. Returns the address of the object.
auto get_ctor_manual() noexcept -> ctor * {
    // Per-thread raw bytes, sized and aligned for one ctor.
    thread_local std::aligned_storage_t<sizeof(ctor), alignof(ctor)> some_ctor_storage;
    // Per-thread flag recording whether the buffer already holds a ctor.
    thread_local bool some_ctor_guard{false};

    auto *const object = reinterpret_cast<ctor *>(&some_ctor_storage);

    // Hint to the compiler that the already-constructed case is the common one.
    if (__builtin_expect(some_ctor_guard, true)) {
        return object;
    }

    // First call on this thread: construct in place, then mark as done.
    new (object) ctor{};
    some_ctor_guard = true;
    return object;
}
Types with destructors
dtor.cpp
auto keep ( void * ) noexcept -> void ;
// Type with a user-provided destructor whose only work is to pass `this` to
// keep(), which is declared but not defined in this snippet.
struct dtor {
~ dtor () {
keep ( this );
}
};
// Returns the address of a function-scope thread_local `dtor` object.
auto get_dtor () noexcept -> dtor * {
// dtor has a user-provided destructor, so the object must be destroyed when
// its thread exits.
thread_local dtor some_dtor ;
return & some_dtor ;
}
clang++ -O2 -S dtor.cpp
## Compiler output for get_dtor(). A guard byte is tested on every call.
get_dtor ():
pushq %rbp
movq %rsp , %rbp
## Look up the guard variable through its own TLVP entry.
movq guard variable for get_dtor ():: some_dtor@TLVP ( %rip ), %rdi
callq *( %rdi )
## A zero guard byte sends control to the initialization path.
cmpb $0 , ( %rax )
je .initialize
.done_initializing:
## Look up some_dtor itself; %rax is untouched between this call and retq.
movq get_dtor ():: some_dtor@TLVP ( %rip ), %rdi
callq *( %rdi )
popq %rbp
retq
.initialize:
## Look up some_dtor, then call __tlv_atexit with
##   %rdi = address of dtor::~dtor(),
##   %rsi = the address just returned for some_dtor,
##   %rdx = address of ___dso_handle.
movq get_dtor ():: some_dtor@TLVP ( %rip ), %rdi
callq *( %rdi )
movq dtor :: ~ dtor () @ GOTPCREL ( %rip ), %rdi
leaq ___dso_handle ( %rip ), %rdx
movq %rax , %rsi
callq __tlv_atexit
## Look up the guard variable a second time and store 1 into it.
movq guard variable for get_dtor ():: some_dtor@TLVP ( %rip ), %rdi
callq *( %rdi )
movb $1 , ( %rax )
jmp .done_initializing
dtor_manual.cpp
#include <cstdint>
#include <new>
#include <type_traits>
// Hand-written lazy initialization of a per-thread `dtor`: a thread_local
// flag plus a thread_local raw buffer. On the first call made by each thread
// the object is built by placement new and a callback that runs its
// destructor is handed to _tlv_atexit. Returns the address of the object.
auto get_dtor_manual() noexcept -> dtor * {
    // Per-thread raw bytes, sized and aligned for one dtor.
    thread_local std::aligned_storage_t<sizeof(dtor), alignof(dtor)> some_dtor_storage;
    // Per-thread flag recording whether the buffer already holds a dtor.
    thread_local bool some_dtor_guard{false};

    auto *const object = reinterpret_cast<dtor *>(&some_dtor_storage);

    // Hint to the compiler that the already-constructed case is the common one.
    if (__builtin_expect(some_dtor_guard, true)) {
        return object;
    }

    // First call on this thread: construct in place.
    new (object) dtor{};

    // Captureless callback that destroys the object it is given.
    const auto destroy = [](void *opaque) noexcept {
        static_cast<dtor *>(opaque)->~dtor();
    };
    _tlv_atexit(destroy, object, &__dso_handle);

    // The flag is set only after construction and registration.
    some_dtor_guard = true;
    return object;
}
Bugs
Clang's code generation for C++ thread_local
variables is
poor on macOS.
Even with -O2 (optimizations enabled), the value of a
thread-local variable is not cached in a register. It's not cached even
if we disable inlining of the function which declares the variable and
returns its value. In the following example, the code generated for
last_data_char
reads the thread-local variable twice itself
[1] [2] (in addition to calling get_data
once [3]).
A thread-local variable with a non-trivial constructor allocates two
entries in thread-local storage: one for the guard variable and one for
the variable itself. Because these two entries are independent, they must
be loaded separately. In the following example, the code generated for
get_data
calls TLVP trampolines three times in order to
initialize s
[4] [5] [6].
tls_example.cpp
#include <string>
namespace {
// Returns a reference to a function-scope thread_local std::string.
// noinline keeps the function from being inlined into its callers.
__attribute__ (( noinline ))
std :: string & get_data () noexcept {
// std::string has both a constructor and a destructor to run.
thread_local std :: string s ;
return s ;
}
}
// Appends `c` to the string returned by get_data().
// Not declared noexcept, unlike the other functions in this snippet.
auto append_data ( char c ) -> void {
get_data (). push_back ( c );
}
// Returns the last character of the string returned by get_data(), or '\0'
// when that string is empty.
auto last_data_char () noexcept -> char {
// get_data() is called exactly once; every later access goes through `s`.
std :: string & s { get_data ()};
if ( s . empty ()) {
return '\0' ;
}
return s [ s . size () - 1 ];
}
clang++ -O2 -S tls_example.cpp
## Compiler output for get_data(). The markers [4], [5] and [6] are the three
## TLVP lookups performed when s has to be initialized.
( anonymous namespace ):: get_data ():
pushq %rbp
movq %rsp , %rbp
## [4]:
## Look up the guard variable; a non-zero guard byte returns immediately.
movq guard variable for ( anonymous namespace ):: get_data ():: s@TLVP ( %rip ), %rdi
callq *( %rdi )
cmpb $0 , ( %rax )
je .initialize_s
popq %rbp
retq
.initialize_s:
## [5]:
## Look up s, zero the quadwords at offsets 16, 8 and 0 (presumably the inlined
## std::string default constructor), then call __tlv_atexit with
##   %rdi = address of ~basic_string(),
##   %rsi = the address just returned for s,
##   %rdx = address of ___dso_handle.
movq ( anonymous namespace ):: get_data ():: s@TLVP ( %rip ), %rdi
callq *( %rdi )
movq $0 , 16 ( %rax )
movq $0 , 8 ( %rax )
movq $0 , ( %rax )
movq std :: __1 :: basic_string < char , std :: __1 :: char_traits < char > , std :: __1 :: allocator < char > > :: ~ basic_string () @ GOTPCREL ( %rip ), %rdi
leaq ___dso_handle ( %rip ), %rdx
movq %rax , %rsi
callq __tlv_atexit
## [6]:
## Look up the guard variable a second time and store 1 into it.
movq guard variable for ( anonymous namespace ):: get_data ():: s@TLVP ( %rip ), %rdi
callq *( %rdi )
movb $1 , ( %rax )
popq %rbp
retq
## Compiler output for last_data_char(). [3] is the single call to get_data();
## [1] and [2] are two further lookups of s through its TLVP entry, made by
## last_data_char() itself.
last_data_char ():
pushq %rbp
movq %rsp , %rbp
## [3]:
callq ( anonymous namespace ):: get_data ()
## [1]:
movq ( anonymous namespace ):: get_data ():: s@TLVP ( %rip ), %rdi
callq *( %rdi )
## The low bit of the first byte of s picks one of two paths, here and again
## after [2] (presumably libc++'s short/long string layout -- confirm).
movzbl ( %rax ), %edx
testb $1 , %dl
je LBB2_1
movq 8 ( %rax ), %rcx
testq %rcx , %rcx
je LBB2_4
LBB2_5:
## [2]:
movq ( anonymous namespace ):: get_data ():: s@TLVP ( %rip ), %rdi
callq *( %rdi )
testb $1 , %dl
je LBB2_6
movq 16 ( %rax ), %rax
jmp LBB2_8
LBB2_1:
movq %rdx , %rcx
shrq %rcx
testq %rcx , %rcx
jne LBB2_5
LBB2_4:
## Zero result: corresponds to the `return '\0'` branch of the source.
xorl %eax , %eax
jmp LBB2_9
LBB2_6:
incq %rax
LBB2_8:
## Load the byte at index (%rcx - 1) from the base address in %rax.
movb - 1 ( %rax , %rcx ), %al
LBB2_9:
movsbl %al , %eax
popq %rbp
retq