I am puzzled by the behaviour of the following snippet:
 #include <stdio.h>
// kernel
__global__ void CheckAddressing(float * d_Result, int numCols, int numRows)
{
    printf("%d\n", threadIdx.x);
    if(threadIdx.x<16)
    {
        d_Result[threadIdx.x]=float(364.66);
    }
}
////////
int main(int argc, char ** argv)
{
    int TotalSize = 16;
    float * d_Result;
    float * h_Result;
        cudaSetDevice(0);
    h_Result = (float *)malloc(TotalSize*sizeof(float));
    cudaMalloc((void **) &d_Result, TotalSize*sizeof(float));
    CheckAddressing<<<dim3(1),dim3(16)>>>(d_Result, 8,8);
    cudaMemcpy(h_Result, d_Result, TotalSize*sizeof(float), cudaMemcpyDeviceToHost);
    for(int n=0; n<16; n++)
    {
        printf("%f\t", h_Result[n]);
    }
        printf("\n");
// free GPU memory
        cudaFree(d_Result);
        free(h_Result);
    return 0;
}
It works on one machine (I compile with nvcc -arch=sm_30) and returns 364.66 (16 times). However on another machine running Cuda 5.5 it returns all zeros. Any idea what can be happening?
UPDATE:
cuda-memcheck ./test
========= CUDA-MEMCHECK
0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    0.000000    
========= ERROR SUMMARY: 0 errors
nvidia-smi
Fri Apr 18 14:45:05 2014       
+------------------------------------------------------+                       
| NVIDIA-SMI 331.44     Driver Version: 331.44         |                       
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla K20Xm         Off  | 0000:02:00.0     Off |                    0 |
| N/A   20C    P0    50W / 235W |     11MiB /  5759MiB |     99%      Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Compute processes:                                               GPU Memory |
|  GPU       PID  Process name                                     Usage      |
|=============================================================================|
|  No running compute processes found                                         |
+-----------------------------------------------------------------------------+
 
     
    