I am learning some basic CUDA programming. I am trying to initialize an array on the Host with host_a[i] = i. This array consists of N = 128 integers. I am launching a kernel with 1 block and 128 threads per block, in which I want to square the integer at index i.
My questions are:
- How do I come to know whether the kernel gets launched or not? Can I use - printfwithin the kernel?
- The expected output for my program is a space-separated list of squares of integers - 
1 4 9 16 ... . 
What's wrong with my code, since it outputs 1 2 3 4 5 ...
Code:
#include <iostream>
#include <numeric>
#include <stdlib.h>
#include <cuda.h>
const int N = 128;
__global__ void f(int *dev_a) {
    unsigned int tid = threadIdx.x;
    if(tid < N) {
        dev_a[tid] = tid * tid;
    }
}
int main(void) {
    int host_a[N];
    int *dev_a;
    cudaMalloc((void**)&dev_a, N * sizeof(int));
    for(int i = 0 ; i < N ; i++) {
        host_a[i] = i;
    }
    cudaMemcpy(dev_a, host_a, N * sizeof(int), cudaMemcpyHostToDevice);
    f<<<1, N>>>(dev_a);
    cudaMemcpy(host_a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
    for(int i = 0 ; i < N ; i++) {
        printf("%d ", host_a[i]);
    }
}
 
     
    