Hi all,
I am learning how shared memory can speed up GPU code. I am using the code below to compute, for each element, its square plus the square of the average of its left and right neighbors. The code runs, but the result is not what I expect.
The first 10 results printed are 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, while I expect 25, 2, 8, 18, 32, 50, 72, 98, 128, 162.
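To make the intended computation concrete, here is the same formula written as a plain CPU loop (purely an illustrative sketch; the helper name reference() exists only for this example, and like the kernel it wraps the neighbors around at the array ends):

// Illustrative host-side version of the intended computation:
// out[i] = avg(left, right)^2 + in[i]^2, with wraparound at the ends.
void reference(const float *in, float *out, int n)
{
    for (int i = 0; i < n; i++) {
        float left  = in[i > 0 ? i - 1 : n - 1];
        float right = in[i < (n - 1) ? i + 1 : 0];
        float avg   = 0.5f * (left + right);
        out[i] = avg * avg + in[i] * in[i];
    }
}

For example, element 1 should come out as ((0 + 2) / 2)^2 + 1^2 = 2, which matches the second value in my expected list.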
The full CUDA code is as follows (based on a reference example):
Could you please tell me which part is wrong? Your help is very much appreciated.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>
const int N=1024;
 __global__ void compute_it(float *data)
 {
 int tid = threadIdx.x;
 __shared__ float myblock[N];
 float tmp;
 // load the thread's data element into shared memory
 myblock[tid] = data[tid];
 // ensure that all threads have loaded their values into
 // shared memory; otherwise, one thread might be computing
 // on uninitialized data.
 __syncthreads();
 // compute the average of this thread's left and right neighbors
 tmp = (myblock[tid>0?tid-1:(N-1)] + myblock[tid<(N-1)?tid+1:0]) * 0.5f;
 // square the previous result and add my value, squared
 tmp = tmp*tmp + myblock[tid]*myblock[tid];
 // write the result back to global memory
 data[tid] = myblock[tid];
 __syncthreads();
  }
int main (){
char key;
float *a;
float *dev_a;
a = (float*)malloc(N*sizeof(float));
cudaMalloc((void**)&dev_a,N*sizeof(float));
for (int i=0; i<N; i++){
a[i] = i;
}
cudaMemcpy(dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
compute_it<<<N,1>>>(dev_a);
cudaMemcpy(a, dev_a, N*sizeof(float), cudaMemcpyDeviceToHost);
for (int i=0; i<10; i++){
std::cout << a[i] << ",";
}
std::cin>>key;
free(a);
cudaFree(dev_a);
return 0;
}
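If it is relevant: the code above does not check any of the CUDA calls for errors. This is how I would add a basic check right after the kernel launch (a sketch based on my understanding of cudaGetLastError and cudaDeviceSynchronize; please correct me if this is not the right pattern):

// Sketch of basic error checking after the launch (not in the code above):
cudaError_t err = cudaGetLastError();      // errors from the launch itself
if (err != cudaSuccess)
    std::cout << "launch failed: " << cudaGetErrorString(err) << std::endl;
err = cudaDeviceSynchronize();             // errors during kernel execution
if (err != cudaSuccess)
    std::cout << "kernel failed: " << cudaGetErrorString(err) << std::endl;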
 
     
     
    