How fast is thread-local variable access on Linux? (tags: multithreading)

How fast is thread local variable access on Linux


However, I keep on reading horror stories about the slowness of thread local variable access. How come?

Let me demonstrate the slowness of thread-local variables on Linux x86_64 with an example taken from http://software.intel.com/en-us/blogs/2011/05/02/the-hidden-performance-cost-of-accessing-thread-local-variables.

  1. No __thread variable, no slowness.

    I will use the performance of this test as a baseline.

        /*
         * Baseline benchmark: get_value() reads an ordinary (non-thread-local)
         * global one billion times and main() accumulates sqrt() of each read.
         */
        #include <stdio.h>
        #include <math.h>

        /* Plain global; despite the name it is NOT thread-local in this variant. */
        double tlvar;

        /* noinline is needed so that get_value() is not inlined by the compiler. */
        double get_value(void) __attribute__ ((noinline));

        /* Returns the current value of the global tlvar. */
        double get_value(void)
        {
          return tlvar;
        }

        /* Runs the timing loop, prints the accumulated sum, returns 0 on success. */
        int main(void)
        {
          int i;
          double f = 0.0;

          tlvar = 1.0;
          for (i = 0; i < 1000000000; i++)
          {
            f += sqrt(get_value());
          }
          printf("f = %f\n", f);
          return 0;  /* was 1, which reports failure to the shell */
        }

    This is the assembler code of get_value():

    Dump of assembler code for function get_value:
    => 0x0000000000400560 <+0>:     movsd  0x200478(%rip),%xmm0        # 0x6009e0 <tlvar>
       0x0000000000400568 <+8>:     retq
    End of assembler dump.

    This is how fast it runs:

    $ time ./inet_test_no_thread
    f = 1000000000.000000
    real    0m5.169s
    user    0m5.137s
    sys     0m0.002s
  2. There is a __thread variable in an executable (not in a shared library); still no slowness.

    /*
     * Same benchmark with a __thread (thread-local) variable defined in the
     * executable itself: get_value() reads it one billion times and main()
     * accumulates sqrt() of each read.
     */
    #include <stdio.h>
    #include <math.h>

    /* Thread-local variable under test. */
    __thread double tlvar;

    /* noinline is needed so that get_value() is not inlined by the compiler. */
    double get_value(void) __attribute__ ((noinline));

    /* Returns the calling thread's value of tlvar. */
    double get_value(void)
    {
      return tlvar;
    }

    /* Runs the timing loop, prints the accumulated sum, returns 0 on success. */
    int main(void)
    {
      int i;
      double f = 0.0;

      tlvar = 1.0;
      for (i = 0; i < 1000000000; i++)
      {
        f += sqrt(get_value());
      }
      printf("f = %f\n", f);
      return 0;  /* was 1, which reports failure to the shell */
    }

    This is the assembler code of get_value():

    (gdb) disassemble get_value
    Dump of assembler code for function get_value:
    => 0x0000000000400590 <+0>:     movsd  %fs:0xfffffffffffffff8,%xmm0
       0x000000000040059a <+10>:    retq
    End of assembler dump.

    This is how fast it runs:

    $ time ./inet_test
    f = 1000000000.000000
    real    0m5.232s
    user    0m5.158s
    sys     0m0.007s

    So, it is quite obvious that when the __thread variable is in the executable, it is as fast as an ordinary global variable.

  3. There is a __thread variable and it is in a shared library, there is slowness.

    Executable:

    $ cat inet_test_main.c
    #include <stdio.h>
    #include <math.h>

    /* Benchmark entry point; not defined in this file. */
    int test(void);

    /* Calls test() once, ignores its result, and returns 0 on success. */
    int main(void)
    {
       test();
       return 0;  /* was 1, which reports failure to the shell */
    }

    Shared library:

    $ cat inet_test_lib.c#include "stdio.h"#include "math.h"static __thread double tlvar;//following line is needed so get_value() is not inlined by compilerdouble get_value() __attribute__ ((noinline));double get_value(){  return tlvar;}int test(){  int i;  double f=0.0;  tlvar = 1.0;  for(i=0; i<1000000000; i++)  {    f += sqrt(get_value());  }  printf("f = %f\n", f);  return 1;}

    This is assembler code of get_value(), see how different it is - it calls __tls_get_addr():

    Dump of assembler code for function get_value:
    => 0x00007ffff7dfc6d0 <+0>:     lea    0x200329(%rip),%rdi        # 0x7ffff7ffca00
       0x00007ffff7dfc6d7 <+7>:     callq  0x7ffff7dfc5c8 <__tls_get_addr@plt>
       0x00007ffff7dfc6dc <+12>:    movsd  0x0(%rax),%xmm0
       0x00007ffff7dfc6e4 <+20>:    retq
    End of assembler dump.
    (gdb) disas __tls_get_addr
    Dump of assembler code for function __tls_get_addr:
       0x0000003c40a114d0 <+0>:     push   %rbx
       0x0000003c40a114d1 <+1>:     mov    %rdi,%rbx
    => 0x0000003c40a114d4 <+4>:     mov    %fs:0x8,%rdi
       0x0000003c40a114dd <+13>:    mov    0x20fa74(%rip),%rax        # 0x3c40c20f58 <_rtld_local+3928>
       0x0000003c40a114e4 <+20>:    cmp    %rax,(%rdi)
       0x0000003c40a114e7 <+23>:    jne    0x3c40a11505 <__tls_get_addr+53>
       0x0000003c40a114e9 <+25>:    xor    %esi,%esi
       0x0000003c40a114eb <+27>:    mov    (%rbx),%rdx
       0x0000003c40a114ee <+30>:    mov    %rdx,%rax
       0x0000003c40a114f1 <+33>:    shl    $0x4,%rax
       0x0000003c40a114f5 <+37>:    mov    (%rax,%rdi,1),%rax
       0x0000003c40a114f9 <+41>:    cmp    $0xffffffffffffffff,%rax
       0x0000003c40a114fd <+45>:    je     0x3c40a1151b <__tls_get_addr+75>
       0x0000003c40a114ff <+47>:    add    0x8(%rbx),%rax
       0x0000003c40a11503 <+51>:    pop    %rbx
       0x0000003c40a11504 <+52>:    retq
       0x0000003c40a11505 <+53>:    mov    (%rbx),%rdi
       0x0000003c40a11508 <+56>:    callq  0x3c40a11200 <_dl_update_slotinfo>
       0x0000003c40a1150d <+61>:    mov    %rax,%rsi
       0x0000003c40a11510 <+64>:    mov    %fs:0x8,%rdi
       0x0000003c40a11519 <+73>:    jmp    0x3c40a114eb <__tls_get_addr+27>
       0x0000003c40a1151b <+75>:    callq  0x3c40a11000 <tls_get_addr_tail>
       0x0000003c40a11520 <+80>:    jmp    0x3c40a114ff <__tls_get_addr+47>
    End of assembler dump.

    It runs almost twice as slowly:

    $ time ./inet_test_main
    f = 1000000000.000000
    real    0m9.978s
    user    0m9.906s
    sys     0m0.004s

    And finally - this is what perf reports - __tls_get_addr - 21% of CPU utilization:

    $ perf report --stdio
    #
    # Events: 10K cpu-clock
    #
    # Overhead         Command        Shared Object              Symbol
    # ........  ..............  ...................  ..................
    #
        58.05%  inet_test_main  libinet_test_lib.so  [.] test
        21.15%  inet_test_main  ld-2.12.so           [.] __tls_get_addr
        10.69%  inet_test_main  libinet_test_lib.so  [.] get_value
         5.07%  inet_test_main  libinet_test_lib.so  [.] get_value@plt
         4.82%  inet_test_main  libinet_test_lib.so  [.] __tls_get_addr@plt
         0.23%  inet_test_main  [kernel.kallsyms]    [k] 0xffffffffa0165b75

So, as you can see, when a thread-local variable is in a shared library (declared static and used only in that shared library), access to it is rather slow. If a thread-local variable in a shared library is accessed rarely, this is not a problem for performance. If it is used quite often, as in this test, the overhead will be significant.

The document http://www.akkadia.org/drepper/tls.pdf, which is mentioned in the comments, describes four possible TLS access models. Frankly, I don't understand when the "initial exec" TLS model is used, but as for the other three models, it is possible to avoid calling __tls_get_addr() only when the __thread variable is in an executable and is accessed from that executable.


How fast is accessing thread-local variables in Linux

It depends on a lot of things.

Some processors (i*86) have a special segment register for this (gs in 32-bit mode, fs in x86_64 mode on Linux). Other processors do not (but usually they will have a register reserved for accessing the current thread, and TLS is easy to find using that dedicated register).

On i*86, using the fs/gs segment register, the access is almost as fast as direct memory access.

I keep on reading horror stories about the slowness of thread local variable access

It would have helped if you provided links to some such horror stories. Without the links, it's impossible to tell whether their authors know what they are talking about.