I'm trying to make a sum using the CUB reduction method.
The big problem is: I'm not sure how to return the values of each block to the Host when using 2-dimensional grids.
#include <iostream>
#include <math.h>
#include <cub/block/block_reduce.cuh>
#include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh>
#include <iomanip>
#define nat 1024
#define BLOCK_SIZE 32
#define GRID_SIZE 32
struct frame
{
   int  natm;
   char  title[100];
   float conf[nat][3];
};
using namespace std;
using namespace cub;
__global__
void add(frame* s, float L, float rc, float* blocksum)
{
int i = blockDim.x*blockIdx.x + threadIdx.x;
int j = blockDim.y*blockIdx.y + threadIdx.y;
float E=0.0, rij, dx, dy, dz;
// Your calculations first so that each thread holds its result
  dx = fabs(s->conf[j][0] - s->conf[i][0]);
  dy = fabs(s->conf[j][1] - s->conf[i][1]);
  dz = fabs(s->conf[j][2] - s->conf[i][2]);
  dx = dx - round(dx/L)*L;
  dy = dy - round(dy/L)*L;
  dz = dz - round(dz/L)*L;
   rij = sqrt(dx*dx + dy*dy + dz*dz);
  if ((rij <= rc) && (rij > 0.0))
    {E =  (4*((1/pow(rij,12))-(1/pow(rij,6))));}
//  E = 1.0;
__syncthreads();
// Block wise reduction so that one thread in each block holds sum of thread results
typedef cub::BlockReduce<float, BLOCK_SIZE, BLOCK_REDUCE_RAKING, BLOCK_SIZE> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
float aggregate = BlockReduce(temp_storage).Sum(E);
if (threadIdx.x == 0 && threadIdx.y == 0)
    blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
}
int main(void)
{
  frame  * state = (frame*)malloc(sizeof(frame));
  float *blocksum = (float*)malloc(GRID_SIZE*GRID_SIZE*sizeof(float));
  state->natm = nat; //inicializando o numero de atomos;
  char name[] = "estado1";
  strcpy(state->title,name);
  for (int i = 0; i < nat; i++) {
    state->conf[i][0] = i;
    state->conf[i][1] = i;
    state->conf[i][2] = i;
  }
  frame * d_state;
  float *d_blocksum;
  cudaMalloc((void**)&d_state, sizeof(frame));
  cudaMalloc((void**)&d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)));
  cudaMemcpy(d_state, state, sizeof(frame),cudaMemcpyHostToDevice);
  dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
  dim3 gridBlock(GRID_SIZE,GRID_SIZE);
  add<<<gridBlock,dimBlock>>>(d_state, 3000, 15, d_blocksum);
  cudaError_t status =  cudaMemcpy(blocksum, d_blocksum, ((GRID_SIZE*GRID_SIZE)*sizeof(float)),cudaMemcpyDeviceToHost);
  float Etotal = 0.0;
  for (int k = 0; k < GRID_SIZE*GRID_SIZE; k++){
       Etotal += blocksum[k];
  }
 cout << endl << "energy: " << Etotal << endl;
  if (cudaSuccess != status)
  {
    cout << cudaGetErrorString(status) << endl;
  }
 // Free memory
  cudaFree(d_state);
  cudaFree(d_blocksum);
  return cudaThreadExit();
}
What is happening is that if the value of GRID_SIZE is the same asBLOCK_SIZE, as written above. The calculation is correct. But if I change the value of GRID_SIZE, the result goes wrong. Which leads me to think that the error is in this code:
blocksum[blockIdx.x*blockDim.y + blockIdx.y] = aggregate;
The idea here is to return a 1D array, which contains the sum of each block.
I do not intend to change the BLOCK_SIZE value, but the value of GRID_SIZE depends on the system I'm looking at, I intend to use values greater than 32 (always multiples of that).
I looked for some example that use 2D grid with CUB, but did not find.
I really new in CUDA program, maybe I'm making a mistake.
edit: I put the complete code. For comparison, when I calculate these exact values for a serial program, it gives me energy: -297,121
 
    