Linux perf_event_open system call with type = PERF_TYPE_HW_CACHE
perf is likely what OP wants as shown at https://stackoverflow.com/a/10114325/895245 but just for completeness, I'm going to show how to do this from inside a C program if you control the source code.
This method can allow for more precise measurements of a specific region of interest within the program. It can also get separate cache hit/miss counts for each different cache level. This syscall likely shares the same backend as perf.
This example is basically the same as Quick way to count number of instructions executed in a C program but with PERF_TYPE_HW_CACHE. By doing:
man perf_event_open
you can see that in this example we are counting only:
- L1 data cache (PERF_COUNT_HW_CACHE_L1D)
- reads (PERF_COUNT_HW_CACHE_OP_READ), not writes or prefetches
- misses (PERF_COUNT_HW_CACHE_RESULT_MISS), not hits
perf_event_open.c
#define _GNU_SOURCE
#include <asm/unistd.h>
#include <linux/perf_event.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <inttypes.h>
/*
 * Thin wrapper that issues the perf_event_open system call through
 * syscall().
 *
 * All arguments are forwarded unchanged to the kernel.
 *
 * Returns the raw result of syscall(): per perf_event_open(2) this is the
 * new file descriptor on success, or -1 with errno set on failure. The
 * value is returned as a long without first being narrowed to int.
 */
static long
perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
                int cpu, int group_fd, unsigned long flags)
{
    return syscall(__NR_perf_event_open, hw_event, pid, cpu,
                   group_fd, flags);
}
/*
 * Count L1 data-cache read misses that occur in user space while
 * sequentially reading a heap buffer, and print the count on stdout.
 *
 * Usage: <program> [n]
 *   n  buffer size in bytes, parsed with strtoll() base 0 (decimal, octal
 *      or hexadecimal). Defaults to 10000 when omitted.
 *
 * Returns EXIT_SUCCESS after printing the counter value, or EXIT_FAILURE
 * if the argument is invalid, the allocation fails, or any perf-related
 * call (open, ioctl, read) fails.
 */
int
main(int argc, char **argv)
{
    struct perf_event_attr pe;
    uint64_t count = 0;
    size_t n = 10000;
    char *chars = NULL;
    /* Volatile sink: forces the measured loop to really load every byte. */
    volatile char sink = 0;
    int fd = -1;
    int status = EXIT_FAILURE;

    if (argc > 1) {
        char *end;
        long long parsed = strtoll(argv[1], &end, 0);

        /* Reject empty input, trailing garbage, negatives and sizes that
         * do not fit in size_t. */
        if (end == argv[1] || *end != '\0' || parsed < 0 ||
            (unsigned long long)parsed > SIZE_MAX) {
            fprintf(stderr, "Invalid byte count: %s\n", argv[1]);
            return EXIT_FAILURE;
        }
        n = (size_t)parsed;
    }

    /* Ask for at least one byte so that n == 0 cannot be confused with
     * an allocation failure. */
    chars = malloc(n > 0 ? n : 1);
    if (chars == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }

    memset(&pe, 0, sizeof pe);
    pe.type = PERF_TYPE_HW_CACHE;
    pe.size = sizeof pe;
    /* Cache event encoding: cache id | (operation << 8) | (result << 16). */
    pe.config = PERF_COUNT_HW_CACHE_L1D |
                (PERF_COUNT_HW_CACHE_OP_READ << 8) |
                (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
    /* Start disabled; the counter is switched on right before the
     * measured loop. */
    pe.disabled = 1;
    pe.exclude_kernel = 1;
    /* Don't count hypervisor events. */
    pe.exclude_hv = 1;

    fd = (int)perf_event_open(&pe, 0, -1, -1, 0);
    if (fd < 0) {
        perror("perf_event_open");
        fprintf(stderr, "Error opening leader %llx\n",
                (unsigned long long)pe.config);
        goto out;
    }

    /* Write the memory to ensure misses later. */
    memset(chars, 1, n);

    if (ioctl(fd, PERF_EVENT_IOC_RESET, 0) == -1) {
        perror("ioctl(PERF_EVENT_IOC_RESET)");
        goto out;
    }
    if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) == -1) {
        perror("ioctl(PERF_EVENT_IOC_ENABLE)");
        goto out;
    }

    /* Read from memory: this is the only region being counted. */
    for (size_t i = 0; i < n; i++) {
        sink = chars[i];
    }

    if (ioctl(fd, PERF_EVENT_IOC_DISABLE, 0) == -1) {
        perror("ioctl(PERF_EVENT_IOC_DISABLE)");
        goto out;
    }

    ssize_t got = read(fd, &count, sizeof count);
    if (got != (ssize_t)sizeof count) {
        if (got < 0) {
            perror("read");
        } else {
            fprintf(stderr, "Short read from perf event descriptor\n");
        }
        goto out;
    }

    printf("%" PRIu64 "\n", count);
    status = EXIT_SUCCESS;

out:
    (void)sink;
    if (fd >= 0) {
        close(fd);
    }
    free(chars);
    return status;
}
With this, I get results increasing linearly like:
./main.out 100000
# 1565
./main.out 1000000
# 15632
./main.out 10000000
# 156641
From this we can estimate a cache line size of: 100000/1565 ~ 63.9, which is very close to the actual value of 64 reported by getconf LEVEL1_DCACHE_LINESIZE on my computer, so I guess it is working.