I am working with the matrix-matrix multiplication and I noticed that if I use 2D arrays, A[M][N], insted of 1D arrays to store a matrix, the access to that takes less time against the locality principle. Here there is the code where the time using linearised matrix is greater than the second one. Why ?
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
void matrixMul(double* m1, double* m2, double* m3, int N) {
    int i, j, k;
    
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) { 
                m3[i * N + j] += m1[i*N+k] * m2[k*N+j];
            }
        }
    }
}
void matrixlin(double** m1, double** m2, double** m3, int N) {
    int i, j, k;
    
    for (i = 0; i < N; i++) {
        for (j = 0; j < N; j++) {
            for (k = 0; k < N; k++) { 
                m3[i][j] += m1[i][k] * m2[k][j];
            }
        }
    }
}
int main(int argc, char* argv[]) {
    int N = 1024;
    int i, j;
    
    double *m1 = (double *)malloc(N*N*sizeof(double));
    double *m2 = (double *)malloc(N*N*sizeof(double));
    double *m3 = (double *)malloc(N*N*sizeof(double));
    memset(m3, 0, N * N * sizeof(double)); 
    
    double **mm1 = (double **)malloc(N*sizeof(double*));
    double **mm2 = (double **)malloc(N*sizeof(double*));
    double **mm3 = (double **)malloc(N*sizeof(double*));
    for(i=0; i<N; i++){
        mm1[i]=(double *)malloc(N*sizeof(double));
        mm2[i]=(double *)malloc(N*sizeof(double));
        mm3[i]=(double *)malloc(N*sizeof(double));
        memset(mm3[i], 0,  N * sizeof(double)); 
    }
    
    for(i=0; i<N; i++){
        for(j=0; j<N; j++){
            m1[i * N + j] = 1.1; m2[i * N + j]=2.1;
            mm1[i][j] = 1.1; mm2[i][j] = 2.1;
        }
    }
    
    clock_t t1 = clock();
    matrixMul(m1, m2, m3, N);
    t1 = clock() - t1;
    printf("Elapsed time linearized: %.5f seconds\n", ((double)t1)/CLOCKS_PER_SEC);
    
    clock_t t2 = clock();
    matrixlin(mm1, mm2, mm3, N);
    t2 = clock() - t2;
    printf("Elapsed time 2D array: %.5f seconds\n", ((double)t2)/CLOCKS_PER_SEC);
    
    
    free(m1);
    free(m2);
    free(m3);
    for(i=0; i<N; i++){
        free(mm1[i]);
        free(mm2[i]);
        free(mm3[i]);
    }
    free(mm1);
    free(mm2);
    free(mm3);
    return 0;
}
Output:
$ gcc test.c
$ ./a.out
Elapsed time linearized: 22.40697 seconds
Elapsed time 2D array: 7.61103 seconds
Assembly code (gcc -S):
    .file   "test.c"
    .text
    .globl  matrixMul
    .type   matrixMul, @function
matrixMul:
.LFB6:
    .cfi_startproc
    endbr64
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    movq    %rdi, -24(%rbp)
    movq    %rsi, -32(%rbp)
    movq    %rdx, -40(%rbp)
    movl    %ecx, -44(%rbp)
    movl    $0, -12(%rbp)
    jmp .L2
.L7:
    movl    $0, -8(%rbp)
    jmp .L3
.L6:
    movl    $0, -4(%rbp)
    jmp .L4
.L5:
    movl    -12(%rbp), %eax
    imull   -44(%rbp), %eax
    movl    %eax, %edx
    movl    -8(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm1
    movl    -12(%rbp), %eax
    imull   -44(%rbp), %eax
    movl    %eax, %edx
    movl    -4(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -24(%rbp), %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm2
    movl    -4(%rbp), %eax
    imull   -44(%rbp), %eax
    movl    %eax, %edx
    movl    -8(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -32(%rbp), %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm0
    mulsd   %xmm2, %xmm0
    movl    -12(%rbp), %eax
    imull   -44(%rbp), %eax
    movl    %eax, %edx
    movl    -8(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    addsd   %xmm1, %xmm0
    movsd   %xmm0, (%rax)
    addl    $1, -4(%rbp)
.L4:
    movl    -4(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L5
    addl    $1, -8(%rbp)
.L3:
    movl    -8(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L6
    addl    $1, -12(%rbp)
.L2:
    movl    -12(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L7
    nop
    nop
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE6:
    .size   matrixMul, .-matrixMul
    .globl  matrixlin
    .type   matrixlin, @function
matrixlin:
.LFB7:
    .cfi_startproc
    endbr64
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    movq    %rdi, -24(%rbp)
    movq    %rsi, -32(%rbp)
    movq    %rdx, -40(%rbp)
    movl    %ecx, -44(%rbp)
    movl    $0, -12(%rbp)
    jmp .L9
.L14:
    movl    $0, -8(%rbp)
    jmp .L10
.L13:
    movl    $0, -4(%rbp)
    jmp .L11
.L12:
    movl    -12(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -8(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm1
    movl    -12(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -24(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -4(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm2
    movl    -4(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -32(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -8(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   (%rax), %xmm0
    mulsd   %xmm2, %xmm0
    movl    -12(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -8(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    addsd   %xmm1, %xmm0
    movsd   %xmm0, (%rax)
    addl    $1, -4(%rbp)
.L11:
    movl    -4(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L12
    addl    $1, -8(%rbp)
.L10:
    movl    -8(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L13
    addl    $1, -12(%rbp)
.L9:
    movl    -12(%rbp), %eax
    cmpl    -44(%rbp), %eax
    jl  .L14
    nop
    nop
    popq    %rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE7:
    .size   matrixlin, .-matrixlin
    .section    .rodata
    .align 8
.LC3:
    .string "Elapsed time linearized: %.5f seconds\n"
    .align 8
.LC4:
    .string "Elapsed time 2D array: %.5f seconds\n"
    .text
    .globl  main
    .type   main, @function
main:
.LFB8:
    .cfi_startproc
    endbr64
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    pushq   %rbx
    subq    $104, %rsp
    .cfi_offset 3, -24
    movl    %edi, -100(%rbp)
    movq    %rsi, -112(%rbp)
    movl    $1024, -84(%rbp)
    movl    -84(%rbp), %eax
    imull   %eax, %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -80(%rbp)
    movl    -84(%rbp), %eax
    imull   %eax, %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -72(%rbp)
    movl    -84(%rbp), %eax
    imull   %eax, %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -64(%rbp)
    movl    -84(%rbp), %eax
    imull   %eax, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -64(%rbp), %rax
    movl    $0, %esi
    movq    %rax, %rdi
    call    memset@PLT
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -56(%rbp)
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -48(%rbp)
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, -40(%rbp)
    movl    $0, -92(%rbp)
    jmp .L16
.L17:
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movl    -92(%rbp), %edx
    movslq  %edx, %rdx
    leaq    0(,%rdx,8), %rcx
    movq    -56(%rbp), %rdx
    leaq    (%rcx,%rdx), %rbx
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, (%rbx)
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movl    -92(%rbp), %edx
    movslq  %edx, %rdx
    leaq    0(,%rdx,8), %rcx
    movq    -48(%rbp), %rdx
    leaq    (%rcx,%rdx), %rbx
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, (%rbx)
    movl    -84(%rbp), %eax
    cltq
    salq    $3, %rax
    movl    -92(%rbp), %edx
    movslq  %edx, %rdx
    leaq    0(,%rdx,8), %rcx
    movq    -40(%rbp), %rdx
    leaq    (%rcx,%rdx), %rbx
    movq    %rax, %rdi
    call    malloc@PLT
    movq    %rax, (%rbx)
    movl    -84(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rcx
    movq    -40(%rbp), %rax
    addq    %rcx, %rax
    movq    (%rax), %rax
    movl    $0, %esi
    movq    %rax, %rdi
    call    memset@PLT
    addl    $1, -92(%rbp)
.L16:
    movl    -92(%rbp), %eax
    cmpl    -84(%rbp), %eax
    jl  .L17
    movl    $0, -92(%rbp)
    jmp .L18
.L21:
    movl    $0, -88(%rbp)
    jmp .L19
.L20:
    movl    -92(%rbp), %eax
    imull   -84(%rbp), %eax
    movl    %eax, %edx
    movl    -88(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -80(%rbp), %rax
    addq    %rdx, %rax
    movsd   .LC0(%rip), %xmm0
    movsd   %xmm0, (%rax)
    movl    -92(%rbp), %eax
    imull   -84(%rbp), %eax
    movl    %eax, %edx
    movl    -88(%rbp), %eax
    addl    %edx, %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -72(%rbp), %rax
    addq    %rdx, %rax
    movsd   .LC1(%rip), %xmm0
    movsd   %xmm0, (%rax)
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -56(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -88(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   .LC0(%rip), %xmm0
    movsd   %xmm0, (%rax)
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -48(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rdx
    movl    -88(%rbp), %eax
    cltq
    salq    $3, %rax
    addq    %rdx, %rax
    movsd   .LC1(%rip), %xmm0
    movsd   %xmm0, (%rax)
    addl    $1, -88(%rbp)
.L19:
    movl    -88(%rbp), %eax
    cmpl    -84(%rbp), %eax
    jl  .L20
    addl    $1, -92(%rbp)
.L18:
    movl    -92(%rbp), %eax
    cmpl    -84(%rbp), %eax
    jl  .L21
    call    clock@PLT
    movq    %rax, -32(%rbp)
    movl    -84(%rbp), %ecx
    movq    -64(%rbp), %rdx
    movq    -72(%rbp), %rsi
    movq    -80(%rbp), %rax
    movq    %rax, %rdi
    call    matrixMul
    call    clock@PLT
    subq    -32(%rbp), %rax
    movq    %rax, -32(%rbp)
    pxor    %xmm0, %xmm0
    cvtsi2sdq   -32(%rbp), %xmm0
    movsd   .LC2(%rip), %xmm1
    divsd   %xmm1, %xmm0
    movq    %xmm0, %rax
    movq    %rax, %xmm0
    leaq    .LC3(%rip), %rdi
    movl    $1, %eax
    call    printf@PLT
    call    clock@PLT
    movq    %rax, -24(%rbp)
    movl    -84(%rbp), %ecx
    movq    -40(%rbp), %rdx
    movq    -48(%rbp), %rsi
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    matrixlin
    call    clock@PLT
    subq    -24(%rbp), %rax
    movq    %rax, -24(%rbp)
    pxor    %xmm0, %xmm0
    cvtsi2sdq   -24(%rbp), %xmm0
    movsd   .LC2(%rip), %xmm1
    divsd   %xmm1, %xmm0
    movq    %xmm0, %rax
    movq    %rax, %xmm0
    leaq    .LC4(%rip), %rdi
    movl    $1, %eax
    call    printf@PLT
    movq    -80(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movq    -72(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movq    -64(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movl    $0, -92(%rbp)
    jmp .L22
.L23:
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -56(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rax
    movq    %rax, %rdi
    call    free@PLT
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -48(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rax
    movq    %rax, %rdi
    call    free@PLT
    movl    -92(%rbp), %eax
    cltq
    leaq    0(,%rax,8), %rdx
    movq    -40(%rbp), %rax
    addq    %rdx, %rax
    movq    (%rax), %rax
    movq    %rax, %rdi
    call    free@PLT
    addl    $1, -92(%rbp)
.L22:
    movl    -92(%rbp), %eax
    cmpl    -84(%rbp), %eax
    jl  .L23
    movq    -56(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movq    -48(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movq    -40(%rbp), %rax
    movq    %rax, %rdi
    call    free@PLT
    movl    $0, %eax
    movq    -8(%rbp), %rbx
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE8:
    .size   main, .-main
    .section    .rodata
    .align 8
.LC0:
    .long   -1717986918
    .long   1072798105
    .align 8
.LC1:
    .long   -858993459
    .long   1073794252
    .align 8
.LC2:
    .long   0
    .long   1093567616
    .ident  "GCC: (Ubuntu 10.3.0-1ubuntu1) 10.3.0"
    .section    .note.GNU-stack,"",@progbits
    .section    .note.gnu.property,"a"
    .align 8
    .long    1f - 0f
    .long    4f - 1f
    .long    5
0:
    .string  "GNU"
1:
    .align 8
    .long    0xc0000002
    .long    3f - 2f
2:
    .long    0x3
3:
    .align 8
4:
