I want to write a kernel to perform computations that depends on all the unique quartets of indices (ij|kl). The code that generate all the unique quartets on the host is as follows:
#include <iostream>
using namespace std;
int main(int argc, char** argv)
{
    unsigned int i,j,k,l,ltop;
    unsigned int nao=7;
    for(i=0;i<nao;i++)
    {
        for(j=0;j<=i;j++)
        {
            for(k=0;k<=i;k++)
            {
                ltop=k;
                if(i==k)
                {
                    ltop=j;
                }
                for(l=0;l<=ltop; l++)
                {
                    printf("computing the ERI (%d,%d|%d,%d) \n",i,j,k,l);
                }
            }    
        }
    }
    int m = nao*(nao+1)/2;
    int eris_size = m*(m+1)/2;
    cout<<"The total size of the sack of ERIs is: "<<eris_size<<endl;
    return 0;
}
output:
computing the ERI (0,0|0,0) 
computing the ERI (1,0|0,0) 
computing the ERI (1,0|1,0) 
computing the ERI (1,1|0,0) 
computing the ERI (1,1|1,0) 
computing the ERI (1,1|1,1) 
computing the ERI (2,0|0,0) 
computing the ERI (2,0|1,0) 
computing the ERI (2,0|1,1) 
computing the ERI (2,0|2,0) 
computing the ERI (2,1|0,0) 
computing the ERI (2,1|1,0) 
computing the ERI (2,1|1,1) 
computing the ERI (2,1|2,0) 
computing the ERI (2,1|2,1) 
computing the ERI (2,2|0,0) 
computing the ERI (2,2|1,0) 
computing the ERI (2,2|1,1) 
computing the ERI (2,2|2,0) 
computing the ERI (2,2|2,1) 
computing the ERI (2,2|2,2) 
computing the ERI (3,0|0,0) 
computing the ERI (3,0|1,0) 
computing the ERI (3,0|1,1) 
computing the ERI (3,0|2,0) 
computing the ERI (3,0|2,1) 
computing the ERI (3,0|2,2) 
computing the ERI (3,0|3,0) 
computing the ERI (3,1|0,0) 
computing the ERI (3,1|1,0) 
computing the ERI (3,1|1,1) 
computing the ERI (3,1|2,0) 
computing the ERI (3,1|2,1) 
computing the ERI (3,1|2,2) 
computing the ERI (3,1|3,0) 
computing the ERI (3,1|3,1) 
computing the ERI (3,2|0,0) 
computing the ERI (3,2|1,0) 
computing the ERI (3,2|1,1) 
computing the ERI (3,2|2,0) 
computing the ERI (3,2|2,1) 
computing the ERI (3,2|2,2) 
computing the ERI (3,2|3,0) 
computing the ERI (3,2|3,1) 
computing the ERI (3,2|3,2) 
computing the ERI (3,3|0,0) 
computing the ERI (3,3|1,0) 
computing the ERI (3,3|1,1) 
computing the ERI (3,3|2,0) 
computing the ERI (3,3|2,1) 
computing the ERI (3,3|2,2) 
computing the ERI (3,3|3,0) 
computing the ERI (3,3|3,1) 
computing the ERI (3,3|3,2) 
computing the ERI (3,3|3,3)
I want to recover the same set of quartets using a CUDA kernel but I cannot get it. The code for CUDA that I have by now is as follows
#include <iostream>
#include <stdio.h>
using namespace std;
#define ABS(x)  (x<0)?-x:x
__global__
void test_kernel(int basis_size)
{
    unsigned int i_idx = threadIdx.x + blockIdx.x * blockDim.x;
    unsigned int j_idx = threadIdx.y + blockIdx.y * blockDim.y;
    // Building the quartets                                                                                                                                                  
    unsigned int i_orbital, j_orbital, k_orbital, l_orbital, i__1,j__1;
    unsigned int i_primitive, j_primitive, k_primitive, l_primitive;
    i_orbital = (i_idx + 1)%basis_size /*+1*/;
    j_orbital = (i__1 = (i_idx) / basis_size, ABS(i__1)) /*+ 1*/;
    k_orbital = (j_idx+1)%basis_size /*+1*/;
    l_orbital = (j__1 = (j_idx) / basis_size, ABS(j__1)) /*+ 1*/;
    unsigned int ltop;
    ltop=k_orbital;
    if(i_orbital==k_orbital)
    {
        ltop=j_orbital;
    }
    if(i_orbital<basis_size && j_orbital<=i_orbital && k_orbital<=i_orbital && l_orbital<=ltop)
        printf("computing the ERI (%d,%d|%d,%d)   \n", i_orbital, j_orbital,k_orbital,l_orbital);
}
int main(int argc, char *argv[])
{
    int nao = 7;
    cudaDeviceReset();
    /* partitioning from blocks to grids */
    int dimx = 8;
    int dimy = 8;
    dim3 block(dimx, dimy);   // we will try blocks of 8x8 threads                                                                                                            
    dim3 grid((nao+block.x-1)/block.x, (nao+block.y-1)/block.y);   // The grids are shaped accordingly                                                                        
    /* Launch the kernel */
    test_kernel<<<grid,block>>>(nao);
    cudaDeviceReset();
    return 0;
}
output:
computing the ERI (2,0|1,1)    
computing the ERI (3,1|3,1)    
computing the ERI (3,0|1,1)    
computing the ERI (1,1|1,1)    
computing the ERI (2,1|1,1)    
computing the ERI (3,1|1,1)    
computing the ERI (3,0|2,1)    
computing the ERI (2,1|2,1)    
computing the ERI (3,1|2,1)    
computing the ERI (1,0|1,0)    
computing the ERI (2,0|1,0)    
computing the ERI (3,0|1,0)    
computing the ERI (1,1|1,0)    
computing the ERI (2,1|1,0)    
computing the ERI (3,1|1,0)    
computing the ERI (2,0|2,0)    
computing the ERI (3,0|3,0)    
computing the ERI (3,0|2,0)    
computing the ERI (2,1|2,0)    
computing the ERI (3,1|3,0)    
computing the ERI (3,1|2,0)    
computing the ERI (1,0|0,0)    
computing the ERI (2,0|0,0)    
computing the ERI (3,0|0,0)    
computing the ERI (0,0|0,0)    
computing the ERI (1,1|0,0)    
computing the ERI (2,1|0,0)    
computing the ERI (3,1|0,0)
The quartets will drive the computation of repulsion integrals whose input parameters are stored in an array of size nao with 3
 
     
    