CUDA: Group every n-th point of array passed to GPU

Question

I am trying to implement k-means algorithm on CUDA using Tesla card on external Unix. I read input file and store coordinates of all data points in dataX and dataY arrays. The next step is to select every centreInterval-th point and store it in another array allocated in GPU memory. However, I have no idea how may I even check what's the problem if all I can get is 'Segmentation error' and from obvious reasons can't print any kind of output from kernel.

EDIT 2: I simplified this example to the shortest possible solution. I found my solution during process, but decided to provide the version, which was not solved yet in this question to make more clear what caused the problem.

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#define BLOCK_SIZE 16

// My kernel - Selects some centres at the beginning of algorithm and stores it at appropriate place
__global__ void kMeansSelectInitialCentres(float* d_dataX, float* d_dataY, float* d_centresX, float* d_centresY, int centreInterval) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = i * centreInterval;
    d_centresX[i] = d_dataX[idx];
    d_centresY[i] = d_dataY[idx];
}

// Simplified example
int main(int argn, char ** argc) {

    // My data - let's say it is 32 floats in each
    int dataSize = 32;
    float* dataX = new float[dataSize];
    float* dataY = new float[dataSize]; 

    // Fill arrays with numbers
    for (int i = 0; i < dataSize; i++) {
        dataX[i] = i;
        dataY[i] = i;
    }

    // Interval - we select first number, then 1 + N * centreInterval 
    int centreInterval = 2;

    // There I will store my results in program
    int centreSize = dataSize / centreInterval;
    float* centresX = new float[centreSize];
    float* centresY = new float[centreSize];

    // Pointers to the arrays stored in GPU memory
    float* d_dataX;
    float* d_dataY;
    float* d_centresX;
    float* d_centresY;

// Allocate memory for those arrays
    // Calculate how much space in memory do we need for this
    size_t d_centreSize = sizeof(float) * centreSize;
    size_t d_dataSize = sizeof(float) * dataSize;

    // Memory for raw data
    cudaMalloc((void**)&d_dataX, d_dataSize);   
    cudaMalloc((void**)&d_dataY, d_dataSize);

    // Copy raw data to the device memory so we can operate on it freely
    cudaMemcpy(d_dataY, dataY, d_dataSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dataX, dataX, d_dataSize, cudaMemcpyHostToDevice);

    // Memory for centre results
    cudaMalloc((void**)&d_centresX, d_dataSize);
    cudaMalloc((void**)&d_centresY, d_dataSize);

    // Call kernel
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGridK((centreSize + dimBlock.x) / dimBlock.x);
    kMeansSelectInitialCentres <<<dimGridK, dimBlock>>> (d_dataX, d_dataY, d_centresX, d_centresY, centreInterval);

    // Check results - we get every n-th point
    float* check_x = new float[centreSize];
    float* check_y = new float[centreSize];

    cudaMemcpy(check_x, d_centresX, d_dataSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(check_y, d_centresY, d_dataSize, cudaMemcpyDeviceToHost);

    printf("X: ");  
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_x[i]);
    printf("\nY: ");
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_y[i]);
    printf("\n");

}

Main question: What is wrong with this kernel / check-out of data?

Side question: Is there any fair way to debug program kernels in such situations?

some suggestions: (1) provide a [MCVE](http://stackoverflow.com/help/mcve), otherwise you probably won't get an answer; (2) use [proper error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api/14038590#14038590); (3) use [cuda-memcheck](http://docs.nvidia.com/cuda/cuda-memcheck/) — m.s., May 30 '15 at 15:15
Simplifying gave me a solution - I'll make a function out of this example. +1 for (2) - very useful link for error checking. Thanks a lot! :) — irchris102, May 30 '15 at 21:45

score 2 · Accepted Answer · answered May 30 '15 at 21:40

So, here's the solution I came up with after simplifying my case. There was a problem with memory usage - I tried to store / read different amount of data than I claimed to use when allocating it. I hope it will be helpful for anyone in the future:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <math.h>
#include <time.h>
#include <unistd.h>
#define BLOCK_SIZE 16

// My kernel - Selects some centres at the beginning of algorithm and stores it at appropriate place
__global__ void kMeansSelectInitialCentres(float* d_dataX, float* d_dataY, float* d_centresX, float* d_centresY, int centreInterval) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int idx = i * centreInterval;
    d_centresX[i] = d_dataX[idx];
    d_centresY[i] = d_dataY[idx];
}

// Simplified example
int main(int argn, char ** argc) {

    // My data - let's say it is 32 floats in each
    int dataSize = 32;
    float* dataX = new float[dataSize];
    float* dataY = new float[dataSize]; 

    // Fill arrays with numbers
    for (int i = 0; i < dataSize; i++) {
        dataX[i] = i;
        dataY[i] = i;
    }

    // Interval - we select first number, then 1 + N * centreInterval 
    int centreInterval = 2;

    // There I will store my results in program
    int centreSize = dataSize / centreInterval;
    float* centresX = new float[centreSize];
    float* centresY = new float[centreSize];

    // Pointers to the arrays stored in GPU memory
    float* d_dataX;
    float* d_dataY;
    float* d_centresX;
    float* d_centresY;

// Allocate memory for those arrays
    // Calculate how much space in memory do we need for this
    size_t d_centreSize = sizeof(float) * centreSize;
    size_t d_dataSize = sizeof(float) * dataSize;

    // Memory for raw data
    cudaMalloc((void**)&d_dataX, d_dataSize);   
    cudaMalloc((void**)&d_dataY, d_dataSize);

    // Copy raw data to the device memory so we can operate on it freely
    cudaMemcpy(d_dataY, dataY, d_dataSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dataX, dataX, d_dataSize, cudaMemcpyHostToDevice);

    // Memory for centre results
    cudaMalloc((void**)&d_centresX, d_centreSize);
    cudaMalloc((void**)&d_centresY, d_centreSize);

    // Call kernel
    dim3 dimBlock(BLOCK_SIZE);
    dim3 dimGridK((centreSize + dimBlock.x) / dimBlock.x);
    kMeansSelectInitialCentres <<<dimGridK, dimBlock>>> (d_dataX, d_dataY, d_centresX, d_centresY, centreInterval);

    // Check results - we get every n-th point
    float* check_x = new float[centreSize];
    float* check_y = new float[centreSize];

    cudaMemcpy(check_x, d_centresX, d_centreSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(check_y, d_centresY, d_centreSize, cudaMemcpyDeviceToHost);

    printf("X: ");  
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_x[i]);
    printf("\nY: ");
    for (int i = 0; i < centreSize; i++)
        printf("%.2f ", check_y[i]);
    printf("\n");

}

Think this might be helpful example, hope downvotes won't make people ignore it. — irchris102, May 30 '15 at 22:00

CUDA: Group every n-th point of array passed to GPU

1 Answers1