On the host side I'm reading in a 128 x 128 integer array with random values between 0-31. I have an Occurrences array that stores the values 0-31 and then on the device I am trying to execute a kernel that loops through the values in the 128 x 128 array and then counts the number of times 0-31 appears.
I am having issues with how to split up the blocks/threads in CUDA and how to get the kernel to provide communication back to the host and print out the number of occurrences of every element.This is my first time using CUDA and I would appreciate any constructive advice! Here is my code so far:
 #include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#define MAXR 16
#define MAXC 16
#define N 256
__global__ void count(int *arrayONE_d, int *occurrences_d, int *occurrences_final_d) {
    int count = 0;
    //provide unique thread ID
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int k;
    //for(k=0; k < 32;k++) {
    //  occurrences_d[k]=k;
//  }
    if(idx < N) {
        //for(k=0; k < MAXR*MAXC; k++) {
    for(int j=0; j<32; j++) {
            count =0;
        if(arrayONE_d[idx]==occurrences_d[j]){
            count+=1;
            occurrences_final_d[j] =count;
        }
        else {}
    }
    }
    //occurrences_final_d[0] = 77;
    }
}
int main(void) {
    //const int N = MAXR*MAXC;
    int arr1_h[MAXR][MAXC];
    //int *occurrences_h[0][32];
    //creating arrays for the device (GPU)
    //int *arr1_d;
    int occurrences_h[32];
    int *occurrences_d;
    int *occurrences_final_h[32] = {0};
    int *occurrences_final_d;
    int *arrayONE_h[256] = {0};
    int *arrayONE_d;
    int i, j;
    // allocating memory for the arrays on the device
    cudaMalloc( (void**) &arrayONE_d, MAXR*MAXC*sizeof(int)); // change to 16384 when using 128x128
    cudaMalloc( (void**) &occurrences_d,  32* sizeof(int));
    cudaMalloc( (void**) &occurrences_final_d, 32*sizeof(int));
    /*
    for(i=0; i < 32; i++) {
        occurrences_h[i] = i;
    }
/*
 *
 */
    //Reading in matrix from .txt file and storing it in arr1 on the host (CPU)
    FILE *fp;
    fp =fopen("arrays16.txt","r");
     // this loop takes the information from .txt file and puts it into arr1 matrix
    for(i=0;i<MAXR;i++) {
        for(j=0;j<MAXC;j++)
        {
            fscanf(fp,"%d\t", &arr1_h[i][j]);
        }
    }
    for(i=0;i<MAXR;i++) {
        printf("\n");
        for(j=0;j<MAXC;j++) {
            //printf("d\t", arr1_h[i][j]);
        }
        printf("\n\n");
    }
    int x,y;
    int z=0;
// this loop flattens the 2d array and makes it a 1d array of length MAXR*MAXC
    for(x=0;x<MAXR;x++)
    {
        for(y=0;y<MAXC;y++)
        {
            //  printf("**%d   ",arr1_h[x][y]);
            arrayONE_h[z]= &arr1_h[x][y];
            z++;
        }
    }
    for(x=0; x < 256; x++) {
        printf("%d\n", *arrayONE_h[x]);
        //return 0;
    }
    int length = sizeof(arrayONE_h)/sizeof(arrayONE_h[0]);
    printf("\n\n");
    printf("**LENGTH = %d", length);
    // copying the arrays/memory from the host to the device (GPU)
    cudaMemcpy(arrayONE_d, &arrayONE_h, MAXR*MAXC*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(occurrences_d, &occurrences_h, 32*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(occurrences_final_d, &occurrences_final_h, 32*sizeof(int), cudaMemcpyHostToDevice);
    // how many blocks we will allocate
    //dim3 DimGrid();
    //how many threads per block we will allocate
    dim3 DimBlock(256);
    //kernel launch against the GPU
    count<<<1, DimBlock>>>(arrayONE_d,occurrences_d,occurrences_final_d);
    //copy the arrays post-computation from the device back to the host (CPU)
    cudaMemcpy(&occurrences_final_h, occurrences_final_d, 32*sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&occurrences_h, occurrences_d, 32*sizeof(int), cudaMemcpyDeviceToHost);
    // some error checking - run this with cuda-memcheck when executing your code
    cudaError_t errSync  = cudaGetLastError();
    cudaError_t errAsync = cudaDeviceSynchronize();
    if (errSync != cudaSuccess)
        printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
    if (errAsync != cudaSuccess)
        printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
    //free up the memory of the device arrays
    cudaFree(arrayONE_d);
    cudaFree(occurrences_d);
    cudaFree(occurrences_final_d);
    //print out the number of occurrences of each 0-31 value
    for(i=0;i<32;i++) {
        printf("\n");
        printf("%d\n",occurrences_final_h[i]);
    }
}
 
    