This is my sequential code:
float foo(float* in1, float* in2, float in3, unsigned int size) {
    float tmp = 0.f;
        for (int i = 0; i<size; i++)
          if(in2[i]>0)tmp += (in1[i]/in3 - (in2[i] /in3)*(in2[i] /in3));
    return tmp;
}
This is my effort to port it to CUDA:
__global__ void kernel_foo(float* tmp, const float* in1, const float* 
                           in2, float in3,  unsigned int size) {
    unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size) {
        if(in2[i]>0){
        atomicAdd(tmp, in1[i]/in3 - (in2[i] /in3)*(in2[i] /in3));
        }
    }
}
void launch_kernel_foo(float* tmp, const float* in1, const float* in2,
                       float in3,  unsigned int size) {
  kernel_foo<<<(size+255)/256,256>>>(tmp, in1, in2, in3, size);
}
but it does't work to generate correct results. Could anyone tell me where is the mistake?
 
     
     
    