So, I'm starting to get so frustrated with CUDA that I decided to write the simplest piece of code I could, just to get my bearings. But something seems to be going right over my head. In my code, I'm just adding two arrays, and then storing them in a third array, like this:
#include <stdio.h>
#include <stdlib.h>
__global__ void add(int* these, int* those, int* answers)
{
    int tid = blockIdx.x;
    answers[tid] = these[tid] + those[tid];
}
int main()
{
    int these[50];
    int those[50];
    int answers[50];
    int *devthese;
    int *devthose;
    int *devanswers;
    cudaMalloc((void**)&devthese, 50 * sizeof(int));
    cudaMalloc((void**)&devthose, 50 * sizeof(int));
    cudaMalloc((void**)&devanswers, 50 * sizeof(int));
    int i;
    for(i = 0; i < 50; i++)
    {
        these[i] = i;
        those[i] = 2 * i;
    }
    cudaMemcpy(devthese, these, 50 * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(devthose, those, 50 * sizeof(int), cudaMemcpyHostToDevice);
    add<<<50,1>>>(devthese, devthose, devanswers);
    cudaMemcpy(answers, devanswers, 50 * sizeof(int), cudaMemcpyDeviceToHost);
    for(i = 0; i < 50; i++)
    {
        fprintf(stderr,"%i\n",answers[i]);
    }
    return 0;
}
However, the int values that are being printed out aren't following the sequence of multiples of 3, which is what I was expecting. Can anyone explain what is going wrong?
 
    