Just a little C++ code; I confirmed the same behavior in Java.
This is example code that reproduces the behavior, compiled with Visual Studio 2019 (Release, x64). I got:
611 ms for just incrementing the element.
631 ms for incrementing the element with the cache, so an additional 20 ms of overhead.
But when I add a heavy operation before each increment (I chose random number generation), I get:
2073 ms for just incrementing the element.
1432 ms for incrementing the element using the cache.
I have an Intel 10700K CPU and 3200 RAM, if it matters.
#include <iostream>
#include <random>
#include <chrono>
#include <cstdlib>
// Number of counters in the big array: 256^3 = 16,777,216 ints.
// The body is parenthesised so the macro expands as a single value inside
// larger expressions (e.g. `x % ARR_SIZE`); without the parentheses that
// would parse as `((x % 256) * 256) * 256`.
#define ARR_SIZE (256 * 256 * 256)
// Number of pre-generated random indices replayed on each iteration: 256^2 = 65,536.
#define ACCESS_SIZE (256 * 256)
// Capacity of the pending-index buffer filled by incWithCache().
#define CACHE_SIZE 1024
// How many times the whole access sequence is replayed per measurement.
#define ITERATIONS 1000
// NOTE(review): a file-scope `using namespace std;` pulls every std name into
// the global namespace; it is kept because main() uses unqualified names such
// as cout, random_device and mt19937.
using namespace std;
// Timing helpers used by main() to measure each benchmark variant.
using chrono::high_resolution_clock;
using chrono::duration_cast;
using chrono::milliseconds;
// Counter array that the benchmark increments; allocated in main() with
// ARR_SIZE elements.
int* arr;
// Buffer of indices whose increment has been deferred; allocated in main()
// with CACHE_SIZE elements.
int* cache;
// Number of indices currently stored in `cache`; advanced by incWithCache()
// and reset to 0 by flushCache().
int counter = 0;
// Applies every pending increment recorded in `cache` to `arr`, then marks
// the buffer as empty.
//
// Only the first `counter` slots hold indices that have not been applied yet,
// so only those are replayed. Walking all CACHE_SIZE slots would re-apply
// stale entries (or read slots that were never written) whenever this is
// called on an empty or partially filled buffer, which is what main() does
// after each pass over the access list.
void flushCache() {
    // Snapshot the count: writes through `arr` may alias `counter` as far as
    // the compiler knows, so a local keeps the loop bound stable.
    const int pending = counter;
    for (int j = 0; j < pending; ++j) {
        ++arr[cache[j]];
    }
    counter = 0;
}
// Defers the increment of arr[i]: the index is appended to `cache`, and the
// whole buffer is flushed as soon as it holds CACHE_SIZE entries.
void incWithCache(int i) {
    cache[counter++] = i;
    if (counter != CACHE_SIZE) {
        return;  // still room in the buffer, nothing more to do
    }
    flushCache();
}
// Increments arr[i] immediately, without going through the index buffer.
void incWithoutCache(int i) {
    arr[i] += 1;
}
// Stand-in for "some other work" between increments: draws one value from
// rand() and reduces it modulo 107.
int heavyOp() {
    const int drawn = rand();
    return drawn % 107;
}
void main()
{
    arr = new int[ARR_SIZE];
    cache = new int[CACHE_SIZE];
    int* access = new int[ACCESS_SIZE];
    random_device rd;
    mt19937 gen(rd());
    for (int i = 0; i < ACCESS_SIZE; ++i) {
        access[i] = gen() % (ARR_SIZE);
    }
    for (int i = 0; i < ARR_SIZE; ++i) {
        arr[i] = 0;
    }
    auto t1 = high_resolution_clock::now();
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (int i = 0; i < ACCESS_SIZE; ++i) {
            incWithoutCache(access[i]);
        }
    }
    auto t2 = high_resolution_clock::now();
    auto ms_int = duration_cast<milliseconds>(t2 - t1);
    cout << "Time without cache " << ms_int.count() << "ms\n";
    t1 = high_resolution_clock::now();
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (int i = 0; i < ACCESS_SIZE; ++i) {
            incWithCache(access[i]);
        }
        flushCache();
    }
    t2 = high_resolution_clock::now();
    ms_int = duration_cast<milliseconds>(t2 - t1);
    cout << "Time with cache " << ms_int.count() << "ms\n";
    t1 = high_resolution_clock::now();
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (int i = 0; i < ACCESS_SIZE; ++i) {
            heavyOp();
            incWithoutCache(access[i]);
        }
    }
    t2 = high_resolution_clock::now();
    ms_int = duration_cast<milliseconds>(t2 - t1);
    cout << "Time without cache and time between " << ms_int.count() << "ms\n";
    t1 = high_resolution_clock::now();
    for (int iter = 0; iter < ITERATIONS; ++iter) {
        for (int i = 0; i < ACCESS_SIZE; ++i) {
            heavyOp();
            incWithCache(access[i]);
        }
        flushCache();
    }
    t2 = high_resolution_clock::now();
    ms_int = duration_cast<milliseconds>(t2 - t1);
    cout << "Time with cache and time between " << ms_int.count() << "ms\n";
}
 
     
    