I am trying to implement a tridiagonal system solver based on the Cyclic Reduction method on my GTS450. 
Cyclic Reduction is illustrated in this paper
Y. Zhang, J. Cohen, J.D. Owens, "Fast Tridiagonal Solvers on GPU"
However, whatever I do, my CUDA code is far slower than the sequential counterpart. My result for a total of 512 x 512 points is 7ms, however on my i7 3.4GHz it is 5ms. The GPU is not accelerating!
Which could be the problem?
#include "cutrid.cuh"
 __global__ void cutrid_RC_1b(double *a,double *b,double *c,double *d,double *x)
{
 int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
 int idx=threadIdx.x;
 __shared__ double asub[512];
 __shared__ double bsub[512];
 __shared__ double csub[512];
 __shared__ double dsub[512];
 double at=0;
 double bt=0;
 double ct=0;
 double dt=0;
 asub[idx]=a[idx_global];
 bsub[idx]=b[idx_global];
 csub[idx]=c[idx_global];
 dsub[idx]=d[idx_global];
 for(int stride=1;stride<N;stride*=2)
  {
    int margin_left,margin_right;
    margin_left=idx-stride;
    margin_right=idx+stride;
    at=(margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f; 
    bt=bsub[idx]+((margin_left>=0)?(-csub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
    -((margin_right<512)?asub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f); 
    ct=(margin_right<512)?(-csub[idx+stride]*asub[idx]/bsub[idx+stride]):0.f; 
    dt=dsub[idx]+((margin_left>=0)?(-dsub[idx-stride]*asub[idx]/bsub[idx-stride]):0.f)
    -((margin_right<512)?dsub[idx+stride]*csub[idx]/bsub[idx+stride]:0.f); 
    __syncthreads();
    asub[idx]=at;
    bsub[idx]=bt;
    csub[idx]=ct;
    dsub[idx]=dt;
    __syncthreads();
  }
x[idx_global]=dsub[idx]/bsub[idx];
}/*}}}*/
I launched this kernel by cutrid_RC_1b<<<512,512>>>(d_a,d_b,d_c,d_d,d_x), and reached 100% device occupancy. This result has puzzled me for days. 
There is an improved version of my code:
    #include "cutrid.cuh"
    __global__ void cutrid_RC_1b(float *a,float *b,float *c,float *d,float *x)
    {/*{{{*/
     int idx_global=blockIdx.x*blockDim.x+threadIdx.x;
     int idx=threadIdx.x;
     __shared__ float asub[512];
     __shared__ float bsub[512];
     __shared__ float csub[512];
     __shared__ float dsub[512];
    asub[idx]=a[idx_global];
    bsub[idx]=b[idx_global];
    csub[idx]=c[idx_global];
    dsub[idx]=d[idx_global];
 __syncthreads();
   //Reduction  
    for(int stride=1;stride<512;stride*=2)
    {
        int margin_left=(idx-stride);
        int margin_right=(idx+stride);
        if(margin_left<0) margin_left=0;
        if(margin_right>=512) margin_right=511;
        float tmp1 = asub[idx] / bsub[margin_left];
        float tmp2 = csub[idx] / bsub[margin_right];
        float tmp3 = dsub[margin_right];
        float tmp4 = dsub[margin_left];
        __syncthreads();
        dsub[idx] = dsub[idx] - tmp4*tmp1-tmp3*tmp2;
        bsub[idx] = bsub[idx]-csub[margin_left]*tmp1-asub[margin_right]*tmp2;
        tmp3 = -csub[margin_right]; 
        tmp4 = -asub[margin_left];
        __syncthreads();
        asub[idx] = tmp3*tmp1;
        csub[idx] = tmp4*tmp2;
        __syncthreads();
     }
        x[idx_global]=dsub[idx]/bsub[idx];
    }/*}}}*/
The speed is improved to 0.73ms on a Quadro k4000 for 512 x 512 system, however the code in the mentioned paper runs in 0.5ms on a GTX280.
 
     
    