I just tested this loop on my Mac, and it beats memcmp:
uint64_t *p = (uint64_t *)buffer1;
uint64_t compare;
memset(&compare, 1, sizeof compare);
for (i = 0; i < length/sizeof compare; i++)
{
    if (p[i] != compare)
        break;
}
Complete example code:
#include <stdio.h>
#include <string.h>
#include <sys/resource.h>
#include <time.h>
#include <stdlib.h>
#include <stdint.h>
// from: http://www.gnu.org/software/libc/manual/html_node/Elapsed-Time.html
void timeval_subtract(struct timeval *result, struct timeval *x, struct timeval *y)
{
    /* Perform the carry for the later subtraction by updating y. */
    if (x->tv_usec < y->tv_usec)
    {
        int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
        y->tv_usec -= 1000000 * nsec;
        y->tv_sec += nsec;
    }
    if (x->tv_usec - y->tv_usec > 1000000)
    {
        int nsec = (x->tv_usec - y->tv_usec) / 1000000;
        y->tv_usec += 1000000 * nsec;
        y->tv_sec -= nsec;
    }
    /* Compute the time remaining to wait. tv_usec is certainly positive. */
    result->tv_sec = x->tv_sec - y->tv_sec;
    result->tv_usec = x->tv_usec - y->tv_usec;
}
int main(int argc, char **argv)
{
    struct rusage before;
    struct rusage after;
    struct timeval diff;
    size_t i;
    size_t length = strtoull(argv[1], NULL, 0);
    char *buffer1 = malloc(length);
    char *buffer2 = malloc(length);
    printf("filling...");
    fflush(stdout);
    memset(buffer1, 1, length);
    memset(buffer2, 1, length);
    printf(" done\n");
    getrusage(RUSAGE_SELF, &before);
    uint64_t *p = (uint64_t *)buffer1;
    uint64_t compare;
    memset(&compare, 1, sizeof compare);
    for (i = 0; i < length/sizeof compare; i++)
    {
        if (p[i] != compare)
            break;
    }
    if (i == length/sizeof compare)
        i = 0;
    getrusage(RUSAGE_SELF, &after);
    printf("\nloop (returned %zu):\n", i);
    timeval_subtract(&diff, &after.ru_utime, &before.ru_utime);
    printf("User:   %ld.%06d s\n", diff.tv_sec, diff.tv_usec);
    timeval_subtract(&diff, &after.ru_stime, &before.ru_stime);
    printf("System: %ld.%06d s\n", diff.tv_sec, diff.tv_usec);
    getrusage(RUSAGE_SELF, &before);
    i = memcmp(buffer1, buffer2, length);
    getrusage(RUSAGE_SELF, &after);
    printf("\nmemcmp (returned %zu):\n", i);
    timeval_subtract(&diff, &after.ru_utime, &before.ru_utime);
    printf("User:   %ld.%06d s\n", diff.tv_sec, diff.tv_usec);
    timeval_subtract(&diff, &after.ru_stime, &before.ru_stime);
    printf("System: %ld.%06d s\n", diff.tv_sec, diff.tv_usec);
    return 0;
}
And run results:
$ make
clang -Wall -Wextra -Werror -O3 -g -o example example.c
./example 0x10000000
filling... done
loop (returned 0):
User:   0.024078 s
System: 0.000011 s
memcmp (returned 0):
User:   0.036752 s
System: 0.000017 s
Maybe you can do something similar?
Note: For those concerned about cache warming, I also tried with the memcmp before the loop and got the same results.