I have been running various tests to investigate this surprising result.
I wrote the test program below that combines various operations in the init phase and the loop:
#include <stdio.h>
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/stat.h>
int alloc_size = 500 * 0x100000;  // 500 MB
char copy_buf[0x200000];    // 2MB
double getTS() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec/1000000.0;
}
// set a word on each page of a memory area
void memtouch(void *buf, size_t size) {
    uint64_t *p = buf;
    size /= sizeof(*p);
    for (size_t i = 0; i < size; i += 4096 / sizeof(*p))
        p[i] = 0;
}
// compute the sum of words on a memory area
uint64_t sum64(const void *buf, size_t size) {
    uint64_t sum = 0;
    const uint64_t *p = buf;
    size /= sizeof(*p);
    for (size_t i = 0; i < size; i++)
        sum += p[i];
    return sum;
}
void test1(int n, int init, int run) {
    int size = alloc_size;
    char msg[80];
    int pos = 0;
    double times[n+1];
    uint64_t sum = 0;
    double initTS = getTS();
    char *p = malloc(size);
    pos = snprintf(msg + pos, sizeof msg - pos, "malloc");
    if (init > 0) {
        memset(p, init - 1, size);
        pos += snprintf(msg + pos, sizeof msg - pos, "+memset%.0d", init - 1);
    } else
    if (init == -1) {
        memtouch(p, size);
        pos += snprintf(msg + pos, sizeof msg - pos, "+memtouch");
    } else
    if (init == -2) {
        sum = sum64(p, size);
        pos += snprintf(msg + pos, sizeof msg - pos, "+sum64");
    } else {
        /* leave p uninitialized */
    }
    pos += snprintf(msg + pos, sizeof msg - pos, "+rep(%d, ", n);
    if (run > 0) {
        pos += snprintf(msg + pos, sizeof msg - pos, "memset%.0d)", run - 1);
    } else
    if (run < 0) {
        pos += snprintf(msg + pos, sizeof msg - pos, "sum64)");
    } else {
        pos += snprintf(msg + pos, sizeof msg - pos, "memcpy)");
    }
    double startTS = getTS();
    for (int i = 0; i < n; i++) {
        if (run > 0) {
            memset(p, run - 1, size);
        } else
        if (run < 0) {
            sum = sum64(p, size);
        } else {
            int offset = 0;
            while (offset < size) {
                memcpy(copy_buf, p + offset, 0x200000);
                offset += 0x200000;
            }
        }
        times[i] = getTS();
    }
    double firstTS = times[0] - startTS;
    printf("%f + %f", startTS - initTS, firstTS);
    if (n > 2) {
        double avgTS = (times[n - 2] - times[0]) / (n - 2);
        printf(" / %f", avgTS);
    }
    if (n > 1) {
        double lastTS = times[n - 1] - times[n - 2];
        printf(" / %f", lastTS);
    }
    printf(" secs  %s", msg);
    if (sum != 0) {
        printf("  sum=%016llx", (unsigned long long)sum);
    }
    printf("\n");
    free(p);
}
int main(int argc, char *argv[]) {
    int n = 4;
    if (argc < 2) {
        test1(n, 0, 0);
        test1(n, 0, 1);
        test1(n, 0, -1);
        test1(n, 1, 0);
        test1(n, 1, 1);
        test1(n, 1, -1);
        test1(n, 2, 0);
        test1(n, 2, 1);
        test1(n, 2, -1);
        test1(n, -1, 0);
        test1(n, -1, 1);
        test1(n, -1, -1);
        test1(n, -2, 0);
        test1(n, -2, 1);
        test1(n, -2, -1);
    } else {
        test1(argc > 1 ? strtol(argv[1], NULL, 0) : n,
              argc > 2 ? strtol(argv[2], NULL, 0) : 0,
              argc > 3 ? strtol(argv[2], NULL, 0) : 0);
    }
    return 0;
}
Running it on an old linux box running Debian Linux 3.16.0-11-amd64, I got these timings:
The columns are
- init phase
- first iteration of the loop
- average of the second to penultimate iterations
- last iteration of the loop
- sequence of operations
0.000071 + 0.242601 / 0.113761 / 0.113711 secs  malloc+rep(4, memcpy)
0.000032 + 0.349896 / 0.125809 / 0.125681 secs  malloc+rep(4, memset)
0.000032 + 0.190461 / 0.049150 / 0.049210 secs  malloc+rep(4, sum64)
0.350089 + 0.186691 / 0.186705 / 0.186548 secs  malloc+memset+rep(4, memcpy)
0.350078 + 0.125603 / 0.125632 / 0.125531 secs  malloc+memset+rep(4, memset)
0.349931 + 0.105991 / 0.105859 / 0.105788 secs  malloc+memset+rep(4, sum64)
0.349860 + 0.186950 / 0.187031 / 0.186494 secs  malloc+memset1+rep(4, memcpy)
0.349584 + 0.125537 / 0.125525 / 0.125535 secs  malloc+memset1+rep(4, memset)
0.349620 + 0.106026 / 0.106114 / 0.105756 secs  malloc+memset1+rep(4, sum64)  sum=ebebebebebe80000
0.339846 + 0.186593 / 0.186686 / 0.186498 secs  malloc+memtouch+rep(4, memcpy)
0.340156 + 0.125663 / 0.125716 / 0.125751 secs  malloc+memtouch+rep(4, memset)
0.340141 + 0.105861 / 0.105806 / 0.105869 secs  malloc+memtouch+rep(4, sum64)
0.190330 + 0.113774 / 0.113730 / 0.113754 secs  malloc+sum64+rep(4, memcpy)
0.190149 + 0.400483 / 0.125638 / 0.125624 secs  malloc+sum64+rep(4, memset)
0.190214 + 0.049136 / 0.049170 / 0.049149 secs  malloc+sum64+rep(4, sum64)
The timings are consistent with the observations of the OP. I found an explanation that is consistent with the observed timings:
if the first access to a page is a read operation, the timings are substantially better than if the first operation is a write access.
Here are some observations consistent with this explanation:
- malloc()for a large 500MB block just makes a system call to map the memory, it does not access this memory and- callocprobably would do exactly the same thing.
- if you do not initialize this memory, it still gets mapped in RAM as zero initialized pages for security reasons.
- when you initialize the memory with memset, the first access to the whole block is a write access, then the timings for the loop are slower
- initializing the memory to all bytes 1produces exactly the same timings
- if instead I use memtouch, only writing the first word to zero, I get the same timings in the loop
- conversely, if instead of initializing the memory, I compute a checksum, it comes out as zero (which is not guarateed, but expected) and the timings in the loop are faster.
- if no access is performed to the block, then the timings in the loop depend on the operation performed: memcpyandsum64are faster because the first access is a read access, whilememsetis slower because the first access is a write access.
This seems specific to the linux kernel, I do not observe the same difference on macOS, but with a different processor. This behavior might be specific to older linux kernels and/or CPU and bridge architecture.
FINAL UPDATE: As commented by Peter Cordes, a read from a never-written anonymous page will get every page copy-on-write mapped to the same physical page of zeros, so you can get TLB misses but L1d cache hits when reading it. (Applies to the .bss, and memory from mmap(MAP_ANONYMOUS), like glibc calloc and malloc use for large allocations.) He wrote up some details with perf results for an experiment on Why is iterating though `std::vector` faster than iterating though `std::array`?
This explains why memcpy or just reading from memory that is only implicitly initialized to zero is faster than from memory explicitly written to. A good reason to use calloc() instead of malloc() + memset() for sparse data.