As of 2023, the best way to get compilers to generate good vector code is to iterate over chunks the size of your vector registers. On AVX2 or above, that's 256 bits, or 32 bytes. For example:
#include <omp.h>
#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>
#define ALIGNMENT 16U
size_t countEqualBytes(const size_t n, const uint8_t a[n], const uint8_t b[n]) {
    size_t sum = 0;
    size_t i = 0;
    if (n >= 32U) {
        // One past the last index at which a full 32-byte block still fits.
        const size_t sentinel = n - 31U;
        // Uncomment to parallelize with OpenMP (see the notes below):
        // #pragma omp parallel for reduction(+:sum) schedule(static)
        for (i = 0; i < sentinel; i += 32U) {
            sum += (size_t)((a[i] == b[i]) +
              (a[i + 1] == b[i + 1]) +
              (a[i + 2] == b[i + 2]) +
              (a[i + 3] == b[i + 3]) +
              (a[i + 4] == b[i + 4]) +
              (a[i + 5] == b[i + 5]) +
              (a[i + 6] == b[i + 6]) +
              (a[i + 7] == b[i + 7]) +
              (a[i + 8] == b[i + 8]) +
              (a[i + 9] == b[i + 9]) +
              (a[i + 10] == b[i + 10]) +
              (a[i + 11] == b[i + 11]) +
              (a[i + 12] == b[i + 12]) +
              (a[i + 13] == b[i + 13]) +
              (a[i + 14] == b[i + 14]) +
              (a[i + 15] == b[i + 15]) +
              (a[i + 16] == b[i + 16]) +
              (a[i + 17] == b[i + 17]) +
              (a[i + 18] == b[i + 18]) +
              (a[i + 19] == b[i + 19]) +
              (a[i + 20] == b[i + 20]) +
              (a[i + 21] == b[i + 21]) +
              (a[i + 22] == b[i + 22]) +
              (a[i + 23] == b[i + 23]) +
              (a[i + 24] == b[i + 24]) +
              (a[i + 25] == b[i + 25]) +
              (a[i + 26] == b[i + 26]) +
              (a[i + 27] == b[i + 27]) +
              (a[i + 28] == b[i + 28]) +
              (a[i + 29] == b[i + 29]) +
              (a[i + 30] == b[i + 30]) +
              (a[i + 31] == b[i + 31]));
        }
    }
    // Scalar tail for the last n % 32 bytes.
    for (; i < n; i++) {
        sum += (a[i] == b[i]);
    }
    return sum;
}
Either Clang 16 or ICX 2022 with -std=c17 -O3 -march=x86-64-v4 compiles the critical loop of this to:
.LBB0_5:                                # =>This Inner Loop Header: Depth=1
        vmovdqu         ymm0, ymmword ptr [rsi + r10]
        vmovdqu         ymm1, ymmword ptr [rsi + r10 + 32]
        vmovdqu         ymm2, ymmword ptr [rsi + r10 + 64]
        vmovdqu         ymm3, ymmword ptr [rsi + r10 + 96]
        vpcmpeqb        k0, ymm0, ymmword ptr [rdx + r10]
        kmovd           ebx, k0
        popcnt          ebx, ebx
        add             rbx, rax
        vpcmpeqb        k0, ymm1, ymmword ptr [rdx + r10 + 32]
        kmovd           eax, k0
        popcnt          eax, eax
        add             rax, rbx
        vpcmpeqb        k0, ymm2, ymmword ptr [rdx + r10 + 64]
        kmovd           ebx, k0
        popcnt          ebx, ebx
        add             rbx, rax
        vpcmpeqb        k0, ymm3, ymmword ptr [rdx + r10 + 96]
        kmovd           eax, k0
        popcnt          eax, eax
        add             rax, rbx
        sub             r10, -128
        add             r9, -4
        jne             .LBB0_5
That is the following loop, unrolled four times:
.LBB0_8:                                # =>This Inner Loop Header: Depth=1
        vmovdqu         ymm0, ymmword ptr [r10 + rbx]
        vpcmpeqb        k0, ymm0, ymmword ptr [r9 + rbx]
        kmovd           ecx, k0
        popcnt          ecx, ecx
        add             rax, rcx
        add             rbx, 32
        cmp             r8, rbx
        jne            .LBB0_8
Although this uses an AVX512VL instruction (the byte compare into a mask register), ICX can also vectorize the function for plain AVX or AVX2, for example with -march=x86-64-v3.
If you want to multi-thread the function as well as vectorize it, add -fiopenmp on ICX/ICPX or -fopenmp on Clang/GCC, and uncomment the #pragma omp directive.  Unfortunately, OpenMP only accepts a rigid canonical form for the for statement, which forces the nested if block around the for; otherwise the guard could simply have been an extra clause in the loop condition (n > 31U && i < n - 31U).
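For reference, here is a minimal, simplified sketch of the shape OpenMP wants: the guard stays in its own if, the loop stays in canonical form, and sum is named in a reduction clause.  The function name and the collapsed inner loop are mine for brevity; this is an illustration of where the directive goes, not a drop-in replacement for the unrolled version above.
#include <stddef.h>
#include <stdint.h>
size_t countEqualBytesParallel(const size_t n, const uint8_t a[n], const uint8_t b[n]) {
    size_t sum = 0;
    // First index not covered by a full 32-byte block (0 when n < 32).
    const size_t tail = n - n % 32U;
    if (n >= 32U) {
        const size_t sentinel = n - 31U;
        #pragma omp parallel for reduction(+:sum) schedule(static)
        for (size_t i = 0; i < sentinel; i += 32U) {
            size_t block = 0;
            for (size_t j = 0; j < 32U; ++j) {
                block += (size_t)(a[i + j] == b[i + j]);
            }
            sum += block;
        }
    }
    // The parallel loop's counter is private to that construct, so the
    // scalar tail starts from an independently computed index.
    for (size_t i = tail; i < n; i++) {
        sum += (a[i] == b[i]);
    }
    return sum;
}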
Since x86 CPUs load data into vector registers faster when it’s aligned on 16-byte boundaries, you also want to declare your input arrays alignas(ALIGNMENT).
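A minimal caller sketch, assuming it is compiled and linked together with the function above (the buffer size and fill pattern are arbitrary):
#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGNMENT 16U

size_t countEqualBytes(size_t n, const uint8_t a[n], const uint8_t b[n]);

int main(void) {
    // Align both inputs so the vector loads start on a 16-byte boundary.
    alignas(ALIGNMENT) static uint8_t a[4096];
    alignas(ALIGNMENT) static uint8_t b[4096];
    for (size_t i = 0; i < sizeof a; ++i) {
        a[i] = (uint8_t)i;
        b[i] = (uint8_t)(i & 0xF8U);  // equals a[i] whenever the low 3 bits of i are clear
    }
    printf("%zu bytes match\n", countEqualBytes(sizeof a, a, b));
    return 0;
}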
This is as fast and as portable as I was able to get it.  However, you should see this answer to a very similar question by @harold, which combines a 4,096-byte outer loop with a 32-byte inner loop, then a second loop to perform a vertical add.  The inner loop is also shorter by one instruction.
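For what it’s worth, here is a rough scalar sketch of the two-level accumulation that answer describes, not the actual code from it: the inner loop adds 0/1 results into 32 byte-sized lane counters, which cannot overflow within a 4,096-byte block (4096 / 32 = 128 additions per lane), and a per-block step then widens them into the running total.  The names and the scalar remainder loop are mine.
#include <stddef.h>
#include <stdint.h>
size_t countEqualBytesBlocked(const size_t n, const uint8_t a[n], const uint8_t b[n]) {
    size_t sum = 0;
    size_t i = 0;
    while (n - i >= 4096U) {
        uint8_t lanes[32] = {0};  // one 8-bit counter per byte position of a 32-byte vector
        for (size_t j = i; j < i + 4096U; j += 32U) {
            for (size_t k = 0; k < 32U; ++k) {
                lanes[k] += (a[j + k] == b[j + k]);
            }
        }
        // Widen the 8-bit lane counters into the 64-bit total once per block.
        for (size_t k = 0; k < 32U; ++k) {
            sum += lanes[k];
        }
        i += 4096U;
    }
    // Remainder (a real implementation would vectorize this part, too).
    for (; i < n; i++) {
        sum += (a[i] == b[i]);
    }
    return sum;
}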