I have two versions of a kernel that perform the same task: filling a linked cell list. The difference between the two kernels is the datatype used to store the particle positions. The first uses a plain float array (4 floats per particle, padded so each particle spans 128 bits for 128-bit reads/writes), and the second uses an array of vec3f structures (a structure that holds 3 floats).
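To make the two layouts concrete, this is what each version stores per particle (host-side sketch; the pointer names are mine):

float *pos_padded;   /* version 1: x0, y0, z0, pad, x1, y1, z1, pad, ...  -> 16 bytes per particle */
vec3f *pos_struct;   /* version 2: {x0,y0,z0}, {x1,y1,z1}, ...            -> 12 bytes per particle */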
Running some tests with nvprof, I found that the second kernel (the one using vec3f) ran faster than the first:
 Time(%)      Time   Calls       Avg       Min       Max  Name
   42.88    37.26s       2    18.63s   23.97us    37.26s  adentu_grid_cuda_filling_kernel(int*, int*, int*, float*, int, _vec3f, _vec3f, _vec3i)
   11.00     3.93s       2     1.97s   25.00us     3.93s  adentu_grid_cuda_filling_kernel(int*, int*, int*, _vec3f*, int, _vec3f, _vec3f, _vec3i)
The tests fill the linked cell list with 256 and then 512,000 particles (hence the two calls per kernel in the profile).
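Both kernels are launched with one thread per particle; the launch looks something like this (my block size, anything works because of the idx >= nAtoms guard):

int threads = 256;
int blocks  = (nAtoms + threads - 1) / threads;
adentu_grid_cuda_filling_kernel<<<blocks, threads>>> (head, linked, cellNAtoms,
                                                      pos, nAtoms, origin, h, nCell);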
My question is: what happened here? I expected the float array to give better memory access due to coalescing, versus the vec3f structure array, which is not aligned for vectorized access. Did I misunderstand something?
These are the kernels. The first kernel:
__global__ void adentu_grid_cuda_filling_kernel (int *head,
                                                 int *linked,
                                                 int *cellnAtoms,
                                                 float *pos, 
                                                 int nAtoms, 
                                                 vec3f origin, 
                                                 vec3f h,
                                                 vec3i nCell)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= nAtoms)
        return;
    vec3i cell;
    /* Load the position: three separate 32-bit reads; the 4th
       (padding) float is skipped. */
    vec3f _pos = (vec3f){pos[idx*4+0], pos[idx*4+1], pos[idx*4+2]};
    /* Cell coordinates of the particle, then the linear cell index. */
    cell.x = floor ((_pos.x - origin.x)/h.x);
    cell.y = floor ((_pos.y - origin.y)/h.y);
    cell.z = floor ((_pos.z - origin.z)/h.z);
    int c = nCell.x * nCell.y * cell.z + nCell.x * cell.y + cell.x;
    int i;
    /* Lock-free insert into cell c's list: claim the head slot if
       empty, otherwise walk the chain until a free link is claimed. */
    if (atomicCAS (&head[c], -1, idx) != -1){
        i = head[c];
        while (atomicCAS (&linked[i], -1, idx) != -1)
                i = linked[i];
    }
    atomicAdd (&cellnAtoms[c], 1);
}
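Note that this first kernel still issues three separate 32-bit loads, so the fourth padding float is read around rather than exploited. As I understand it, a true single 128-bit load would need the built-in float4 type, something like this sketch (the helper name is mine; it assumes pos keeps the same padded layout and is 16-byte aligned, which cudaMalloc guarantees):

__device__ vec3f load_pos_float4 (const float *pos, int idx)
{
    /* Reinterpret the padded array as float4: one 128-bit load. */
    float4 p = ((const float4 *)pos)[idx];
    return (vec3f){p.x, p.y, p.z};   /* p.w is the unused padding */
}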
And this is the second kernel:
__global__ void adentu_grid_cuda_filling_kernel (int *head,
                                                 int *linked,
                                                 int *cellNAtoms,
                                                 vec3f *pos,
                                                 int nAtoms,
                                                 vec3f origin,
                                                 vec3f h,
                                                 vec3i nCell)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= nAtoms)
        return;
    vec3i cell;
    vec3f _pos = pos[idx];   /* one struct copy: three consecutive floats */
    cell.x = floor ((_pos.x - origin.x)/h.x);
    cell.y = floor ((_pos.y - origin.y)/h.y);
    cell.z = floor ((_pos.z - origin.z)/h.z);
    int c = nCell.x * nCell.y * cell.z + nCell.x * cell.y + cell.x;
    int i;
    if (atomicCAS (&head[c], -1, idx) != -1){
        i = head[c];
        while (atomicCAS (&linked[i], -1, idx) != -1)
                i = linked[i];
    }
    atomicAdd (&cellNAtoms[c], 1);
}
This is the vec3f structure:
typedef struct _vec3f {float x, y, z;} vec3f;
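For reference, sizeof(vec3f) is 12 with 4-byte alignment, so even the second kernel's pos[idx] cannot become a single vector load. An explicitly aligned variant would look something like this (sketch; the vec4f name and padding member are mine):

typedef struct __align__(16) _vec4f {
    float x, y, z;
    float pad;   /* pads each element out to a full 128 bits */
} vec4f;

With __align__(16) the compiler is allowed to fetch each element in one 128-bit transaction, the same as the float4 sketch above.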