I'm trying to manually benchmark the time taken to read the elements from the array, So I created an assembly code that reads from the array passed.
Sample assembly code:
; void array_read(long n, double *A);
; n must be divisible by 64
section .text
%ifidn __OUTPUT_FORMAT__, macho64
%define array_read _array_read
%endif
global array_read
array_read:
        ;   rdi = n
        ;   rsi = A
        ;   rax = i
        push rbx
        mov rax, 0
        align 16
.main_loop:
        vmovaps ymm0,  [rsi+8*(rax+ 0)]
        vmovaps ymm1,  [rsi+8*(rax+ 4)]
        vmovaps ymm2,  [rsi+8*(rax+ 8)]
        vmovaps ymm3,  [rsi+8*(rax+12)]
        add rax, 64
        cmp rax, rdi
        jl .main_loop
.epilog:
        ; Restore caller registers
        pop rbx
        ret
I'm lucky in MacOS(Intel) it aligns correctly and provides the required output but in Linux it gives
Program received signal SIGSEGV: Segmentation fault - invalid memory reference.
Backtrace for this error:
#0  0x7fe37cdee20f in ???
#1  0x557940d91600 in ???
Segmentation fault (core dumped)
Why Segmentation fault?
 
     
    