I am new to GPU programming (and rather rusty in C) so this might be a rather basic question with an obvious bug in my code. What I am trying to do is take a 2 dimensional array and find the sum of each column for every row. So If I have a 2D array that contains:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 2 4 6 8 10 12 14 16 18
I want to get an array that contains the following out:
45
45
90
The code I have so far is not returning the correct output and I'm not sure why. I'm guessing it is because I am not handling the indexing in the kernel properly. But it could be that I am not using the memory correctly since I adapted this from an over-simplified 1 dimensional example and the CUDA Programming Guide (section 3.2.2) makes a rather big and not very well described jump for a beginner between 1 and 2 dimensional arrays.
My incorrect attempt:
#include <stdio.h>
#include <stdlib.h>
// start with a small array to test
#define ROW 3
#define COL 10
__global__ void collapse( int *a, int *c){
    /*
       Sum along the columns for each row of the 2D array.
    */
    int total = 0;
    // Loop to get total, seems wrong for GPUs but I dont know a better way
    for (int i=0; i < COL; i++){
        total = total + a[threadIdx.y + i];
    }
    c[threadIdx.x] = total;
}
int main( void ){
    int array[ROW][COL];      // host copies of a, c
    int c[ROW];
    int *dev_a;      // device copies of a, c (just pointers)
    int *dev_c;
    // get the size of the arrays I will need
    int size_2d = ROW * COL * sizeof(int);
    int size_c = ROW * sizeof(int);
    // Allocate the memory
    cudaMalloc( (void**)&dev_a, size_2d);
    cudaMalloc( (void**)&dev_c, size_c);
    // Populate the 2D array on host with something small and known as a test
    for (int i=0; i < ROW; i++){
        if (i == ROW - 1){
            for (int j=0; j < COL; j++){
                array[i][j] = (j*2);
                printf("%i ", array[i][j]);
            }
        } else {
            for (int j=0; j < COL; j++){
                array[i][j] = j;
                printf("%i ", array[i][j]);
            }
        }
        printf("\n");
    }
    // Copy the memory
    cudaMemcpy( dev_a, array, size_2d, cudaMemcpyHostToDevice );
    cudaMemcpy( dev_c, c, size_c, cudaMemcpyHostToDevice );
    // Run the kernal function
    collapse<<< ROW, COL >>>(dev_a, dev_c);
    // copy the output back to the host
    cudaMemcpy( c, dev_c, size_c, cudaMemcpyDeviceToHost );
    // Print the output
    printf("\n");
    for (int i = 0; i < ROW; i++){
        printf("%i\n", c[i]);
    }
    // Releasae the memory
    cudaFree( dev_a );
    cudaFree( dev_c );
}
Output:
0 1 2 3 4 5 6 7 8 9
0 1 2 3 4 5 6 7 8 9
0 2 4 6 8 10 12 14 16 18
45
45
45
 
     
    