I'm pretty new to performance measurement. I came across this question and decided to check it myself. Here is what my benchmarks look like:
For stack:
SYS_exit equ 60                         ; exit(2) syscall number

section .text
    global _start

;-----------------------------------------------------------------------
; Stack-load benchmark: repeatedly load a qword that lives on the stack.
; r12 = remaining iterations; the loaded value itself is discarded.
;-----------------------------------------------------------------------
_start:
    mov     r12, 0xFFFFFFFF             ; iteration count (2^32 - 1)
    push    0xFFFFFF                    ; 8-byte slot on the stack (push sign-extends imm32 to qword)
mov_loop:
    mov     rax, [rsp]                  ; the measured load: qword from stack memory
    dec     r12
    jnz     mov_loop

    mov     rax, SYS_exit
    xor     edi, edi                    ; exit status 0 (rdi was previously uninitialized)
    syscall
For heap:
SYS_brk  equ 0x0C                       ; brk(2) syscall number
SYS_exit equ 60                         ; exit(2) syscall number

section .text
    global _start

;-----------------------------------------------------------------------
; Heap-load benchmark: grow the program break by 8 bytes, then
; repeatedly load a qword from that heap memory.
; r10 = current break / address of the 8-byte heap slot
; r12 = remaining iterations (same counter register as the stack
;       benchmark, for a like-for-like comparison)
;-----------------------------------------------------------------------
_start:
    mov     rax, SYS_brk
    mov     rdi, 0                      ; brk(0) -> query current break
    syscall

    ; allocate 8 bytes by moving the break up
    mov     r10, rax                    ; r10 = old break = start of our slot
    mov     rax, SYS_brk
    mov     rdi, r10
    add     rdi, 0x08                   ; new break = old break + 8
    syscall

    ; store a full qword so the qword load below reads no
    ; uninitialized bytes (the stack version also stores 8 bytes)
    mov     qword [r10], 0xFFFFFF
    mov     r12, 0xFFFFFFFF             ; iteration count (2^32 - 1)
heap_loop:
    mov     rax, [r10]                  ; the measured load: qword from heap memory
    dec     r12
    jnz     heap_loop

    ; release memory: restore the original break
    mov     rax, SYS_brk
    mov     rdi, r10
    syscall

    mov     rax, SYS_exit
    xor     edi, edi                    ; exit status 0 (rdi previously held the break address)
    syscall
Running the benchmarks with perf stat -d -r 10 showed that I actually measured L1-cache loads in both cases.
 4,295,747,868      L1-dcache-loads           # 2996.483 M/sec                    ( +-  0.00% )
        48,316      L1-dcache-load-misses     #    0.00% of all L1-dcache hits    ( +- 18.42% )
Is there a way to invalidate the cache lines before each iteration starts?
