Why is writing to memory much slower than reading it?

With your programs, I get

(write) Bandwidth =  6.076 GB/s
(read)  Bandwidth = 10.916 GB/s

on a desktop (Core i7, x86-64, GCC 4.9, GNU libc 2.19) machine with six 2GB DIMMs. (I don’t have any more detail than that to hand, sorry.)

However, this program reports write bandwidth of 12.209 GB/s:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <emmintrin.h>

static void
nt_memset(char *buf, unsigned char val, size_t n)
{
    /* this will only work with aligned address and size */
    assert((uintptr_t)buf % sizeof(__m128i) == 0);
    assert(n % sizeof(__m128i) == 0);

    __m128i xval = _mm_set_epi8(val, val, val, val,
                                val, val, val, val,
                                val, val, val, val,
                                val, val, val, val);

    for (__m128i *p = (__m128i*)buf; p < (__m128i*)(buf + n); p++)
        _mm_stream_si128(p, xval);
    _mm_sfence();
}

/* same main() as your write test, except calling nt_memset instead of memset */

The magic is all in _mm_stream_si128, aka the machine instruction movntdq, which writes a 16-byte quantity to system RAM, bypassing the cache (the official jargon for this is “non-temporal store”). I think this pretty conclusively demonstrates that the performance difference is all about the cache behavior.

N.B. glibc 2.19 does have an elaborately hand-optimized memset that makes use of vector instructions. However, it does not use non-temporal stores. That’s probably the Right Thing for memset; in general, you clear memory shortly before using it, so you want it to be hot in the cache. (I suppose an even cleverer memset might switch to non-temporal stores for really huge block clear, on the theory that you could not possibly want all of that in the cache, because the cache simply isn’t that big.)

Dump of assembler code for function memset:
=> 0x00007ffff7ab9420 <+0>:     movd   %esi,%xmm8
   0x00007ffff7ab9425 <+5>:     mov    %rdi,%rax
   0x00007ffff7ab9428 <+8>:     punpcklbw %xmm8,%xmm8
   0x00007ffff7ab942d <+13>:    punpcklwd %xmm8,%xmm8
   0x00007ffff7ab9432 <+18>:    pshufd $0x0,%xmm8,%xmm8
   0x00007ffff7ab9438 <+24>:    cmp    $0x40,%rdx
   0x00007ffff7ab943c <+28>:    ja     0x7ffff7ab9470 <memset+80>
   0x00007ffff7ab943e <+30>:    cmp    $0x10,%rdx
   0x00007ffff7ab9442 <+34>:    jbe    0x7ffff7ab94e2 <memset+194>
   0x00007ffff7ab9448 <+40>:    cmp    $0x20,%rdx
   0x00007ffff7ab944c <+44>:    movdqu %xmm8,(%rdi)
   0x00007ffff7ab9451 <+49>:    movdqu %xmm8,-0x10(%rdi,%rdx,1)
   0x00007ffff7ab9458 <+56>:    ja     0x7ffff7ab9460 <memset+64>
   0x00007ffff7ab945a <+58>:    repz retq 
   0x00007ffff7ab945c <+60>:    nopl   0x0(%rax)
   0x00007ffff7ab9460 <+64>:    movdqu %xmm8,0x10(%rdi)
   0x00007ffff7ab9466 <+70>:    movdqu %xmm8,-0x20(%rdi,%rdx,1)
   0x00007ffff7ab946d <+77>:    retq   
   0x00007ffff7ab946e <+78>:    xchg   %ax,%ax
   0x00007ffff7ab9470 <+80>:    lea    0x40(%rdi),%rcx
   0x00007ffff7ab9474 <+84>:    movdqu %xmm8,(%rdi)
   0x00007ffff7ab9479 <+89>:    and    $0xffffffffffffffc0,%rcx
   0x00007ffff7ab947d <+93>:    movdqu %xmm8,-0x10(%rdi,%rdx,1)
   0x00007ffff7ab9484 <+100>:   movdqu %xmm8,0x10(%rdi)
   0x00007ffff7ab948a <+106>:   movdqu %xmm8,-0x20(%rdi,%rdx,1)
   0x00007ffff7ab9491 <+113>:   movdqu %xmm8,0x20(%rdi)
   0x00007ffff7ab9497 <+119>:   movdqu %xmm8,-0x30(%rdi,%rdx,1)
   0x00007ffff7ab949e <+126>:   movdqu %xmm8,0x30(%rdi)
   0x00007ffff7ab94a4 <+132>:   movdqu %xmm8,-0x40(%rdi,%rdx,1)
   0x00007ffff7ab94ab <+139>:   add    %rdi,%rdx
   0x00007ffff7ab94ae <+142>:   and    $0xffffffffffffffc0,%rdx
   0x00007ffff7ab94b2 <+146>:   cmp    %rdx,%rcx
   0x00007ffff7ab94b5 <+149>:   je     0x7ffff7ab945a <memset+58>
   0x00007ffff7ab94b7 <+151>:   nopw   0x0(%rax,%rax,1)
   0x00007ffff7ab94c0 <+160>:   movdqa %xmm8,(%rcx)
   0x00007ffff7ab94c5 <+165>:   movdqa %xmm8,0x10(%rcx)
   0x00007ffff7ab94cb <+171>:   movdqa %xmm8,0x20(%rcx)
   0x00007ffff7ab94d1 <+177>:   movdqa %xmm8,0x30(%rcx)
   0x00007ffff7ab94d7 <+183>:   add    $0x40,%rcx
   0x00007ffff7ab94db <+187>:   cmp    %rcx,%rdx
   0x00007ffff7ab94de <+190>:   jne    0x7ffff7ab94c0 <memset+160>
   0x00007ffff7ab94e0 <+192>:   repz retq 
   0x00007ffff7ab94e2 <+194>:   movq   %xmm8,%rcx
   0x00007ffff7ab94e7 <+199>:   test   $0x18,%dl
   0x00007ffff7ab94ea <+202>:   jne    0x7ffff7ab950e <memset+238>
   0x00007ffff7ab94ec <+204>:   test   $0x4,%dl
   0x00007ffff7ab94ef <+207>:   jne    0x7ffff7ab9507 <memset+231>
   0x00007ffff7ab94f1 <+209>:   test   $0x1,%dl
   0x00007ffff7ab94f4 <+212>:   je     0x7ffff7ab94f8 <memset+216>
   0x00007ffff7ab94f6 <+214>:   mov    %cl,(%rdi)
   0x00007ffff7ab94f8 <+216>:   test   $0x2,%dl
   0x00007ffff7ab94fb <+219>:   je     0x7ffff7ab945a <memset+58>
   0x00007ffff7ab9501 <+225>:   mov    %cx,-0x2(%rax,%rdx,1)
   0x00007ffff7ab9506 <+230>:   retq   
   0x00007ffff7ab9507 <+231>:   mov    %ecx,(%rdi)
   0x00007ffff7ab9509 <+233>:   mov    %ecx,-0x4(%rdi,%rdx,1)
   0x00007ffff7ab950d <+237>:   retq   
   0x00007ffff7ab950e <+238>:   mov    %rcx,(%rdi)
   0x00007ffff7ab9511 <+241>:   mov    %rcx,-0x8(%rdi,%rdx,1)
   0x00007ffff7ab9516 <+246>:   retq   

(This is in libc.so.6, not the program itself — the other person who tried to dump the assembly for memset seems only to have found its PLT entry. The easiest way to get the assembly dump for the real memset on a Unixy system is

$ gdb ./a.out
(gdb) set env LD_BIND_NOW t
(gdb) b main
Breakpoint 1 at [address]
(gdb) r
Breakpoint 1, [address] in main ()
(gdb) disas memset
...

.)

Leave a Comment

Hata!: SQLSTATE[HY000] [1045] Access denied for user 'divattrend_liink'@'localhost' (using password: YES)