I'm trying to convert a 3x3 rgb "image" to grayscale. The image is just a 3d array. You can think of it being a 2d image, with each pixel having a 1x3 array of rgb values.
To do this, I've basically converted the 3d array into a 1D array. There is no issue with the conversion. I basically send this vector array to the cuda device and it returns me a 1x9 (one grayscale value for each pixel) processed vector array. I've tried to do this by first creating 3 blocks, each for RGB. Each block has 3x3 threads where each thread targets either the red, blue, or green value (determined by the block index) of a pixel. I then multiply each red, blue, or green value by a corresponding factor and add that value to the corresponding pixel of the output array.
However, what I've been getting is an array that is comprised of all zeroes.
#include <stdio.h>
//
__global__ void RGBToGrayScale(float *d_out, float *d_in, int rowCount, int colCount) {
    float grayScaleAddition;
    int temp = d_in[blockIdx.x * rowCount * colCount + threadIdx.x * colCount + threadIdx.y];
    switch(blockIdx.x) {
        case 0:
            grayScaleAddition = 0.299 * temp;
            break;
        case 1:
            grayScaleAddition = 0.587 * temp;
            break;
        case 2:
            grayScaleAddition = 0.114 * temp;
            break;
    }
    d_out[threadIdx.x * colCount + threadIdx.y] += grayScale;
}
int main() {
    int image[3][3][3] = { //3 rows, columns, and 3 rgb values for each pixel
        {{1, 3, 2}, {4, 5, 6}, {7, 8, 9}},
        {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
        {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}
    };
    const int IMAGE_ROW_COUNT = 3;
    const int IMAGE_COLUMN_COUNT = 3;
    const int ARRAY_BYTES = IMAGE_ROW_COUNT * IMAGE_COLUMN_COUNT * 3 * sizeof(float);
    //converting image to a 1D array
    float* h_in = (float*)malloc(ARRAY_BYTES);
    float* h_out = (float*)malloc(ARRAY_BYTES / 3);
    for (int i = 0; i < IMAGE_ROW_COUNT; ++i) {//no issue with conversion, checked manually
        for (int j = 0; j < IMAGE_COLUMN_COUNT; ++j) {
            for (int k = 0; k < 3; ++k) {
                h_in[k * IMAGE_ROW_COUNT * IMAGE_COLUMN_COUNT + i * IMAGE_COLUMN_COUNT + j] = float(image[i][j][k]);
            }
        }
    }
    //declare GPU memory pointers
    float* d_in;
    float* d_out;
    cudaMalloc((void **) &d_in, ARRAY_BYTES);
    cudaMalloc((void **) &d_out, ARRAY_BYTES / 3);
    //transfer array into GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);
    RGBToGrayScale<<<3, dim3(IMAGE_ROW_COUNT, IMAGE_COLUMN_COUNT)>>>(d_out, d_in, IMAGE_ROW_COUNT, IMAGE_COLUMN_COUNT);
    //copy back the result array to CPU
    cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);
    for (int i = 0; i < IMAGE_ROW_COUNT * IMAGE_COLUMN_COUNT; ++i) {
        printf("%f", h_out[i]);
        printf(((i % 3) != 2) ? "\t" : "\n");
    }
    cudaFree(d_in);
    cudaFree(d_out);
    free(h_in);
    free(h_out);
    return 0;
}
 
     
    