I am trying to learn the usage of shared memory to increase the performance. Here I am trying to copy data from global memory to shared memory. But when I have a single block (256 threads) it gives the right result while with more than one block it gives a random result.
#include <cuda.h>
#include <stdio.h>
__global__ void staticReverse(int *d, int n)
{
  __shared__ int s[400];
  int t = blockIdx.x * blockDim.x + threadIdx.x;
  d[t] = d[t]*d[t];
  s[t] =d[t];
  __syncthreads();
  d[t] = s[t];  
}
__global__ void dynamicReverse(int *d, int n)
{
  extern __shared__ int s[];
  int t = threadIdx.x;
  
  s[t] = d[t]*d[t];
  __syncthreads();
  d[t] = s[t];
}
int main(void)
{
  const int n = 400;
  int a[n], d[n];
  for (int i = 0; i < n; i++)
  {
    a[i] = i; 
  }
  int *d_d;
  cudaMalloc(&d_d, n * sizeof(int)); 
  // run version with static shared memory
  int block_size = 256;
  int n_blocks = n/block_size + (n%block_size == 0 ? 0:1);
  cudaMemcpy(d_d, a, n*sizeof(int), cudaMemcpyHostToDevice);
  staticReverse<<<n_blocks,block_size>>>(d_d, n);
  cudaMemcpy(d, d_d, n*sizeof(int), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; i++) 
  {
    printf("%d\n",d[i]);
  }
}
- What does the third argument in the - dynamicReverse<<<n_blocks,block_size,n*sizeof(int)>>>(d_d, n);- kernel launch do? Does it allocat shared memory for the entire block or per thread? 
- If I require more than 64kb of shared memory per multiprocessor with compute capability 5.0, what do I need to do? 
 
     
    