I am new to CUDA programming. I am performing a simple power iteration method to get the dominant eigen vector, but whenever I increase my matrix size to a value greater than the number of threads per block, I get a wrong answers. I am unable to understand why this is happening.
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#define N 100
#define THREADS_PER_BLOCK 256
__global__ void power_iteration(float *d_A, float *d_x , float *d_temp ) 
{
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    int iterations = 0;
    float old = 0.0f;
    float result = 0.0f;
    float error =0;
    float tolerance = 1e-6;
       
    //for (int i = 0; i < iterations; ++i) 
    
    while(error>tolerance || iterations ==0)
    {
        iterations++;
        printf("%d\n" ,iterations);
            
        // Matrix-vector multiplication
        if (idx < N)
        {
            //printf("%d\n" , idx);
            float sum = 0.0f;
            #pragma unroll
            for (int j = 0; j < N; j++)
            {
                sum += d_A[idx * N + j] * d_x[j];
            }
            d_x[idx] = sum;
            __syncthreads();
        }
        
        // normalize    
        if(idx<N)          
            d_x[idx] = d_x[idx] / d_x[N-1];
        __syncthreads();
        
        //calculate eigenvalue
        if (idx == 0)
        {
            float numerator = 0.0f;
            float denominator = 0.0f;
            #pragma unroll
            for (int i = 0; i < N; i++)
            {
                numerator += d_temp[i] * d_x[i];
                denominator += d_x[i] * d_x[i];
            }
            result = numerator / denominator;
        }
        
        error = (float) abs(result - old);
        
        old = result;
        if(idx<N)
            d_temp[idx] = d_x[idx];
    }
}
int main() {
    //declare host and device variable
    float *h_A , *d_A ;
    
    // allocate memory
    cudaMalloc((void **)&d_A, N * N * sizeof(float));
    h_A = (float *)malloc(N*N*sizeof(float));
    
    //intialize host matrix
    for(int i = 0 ; i<N ; i++)
    {
        for(int j =0 ; j< N ; j++)
            //scanf("%f",&h_A[i*N+j]);
            h_A[i*N+j] = (float) i/N;
    }
        
    //declare host and device variable
    float *h_x , *d_x , *d_temp;
   
    // allocate memory
    cudaMalloc((void **)&d_x, N * sizeof(float));
    cudaMalloc((void **)&d_temp, N * sizeof(float));
    h_x = (float *)malloc(N*sizeof(float));
   
    //storing intial guess
    for (int i = 0; i < N; ++i) {
        h_x[i] = 1.0;
    }
        
    // copy to device memory    
    cudaMemcpy(d_A, h_A, N*N*sizeof(float) , cudaMemcpyHostToDevice); 
   
    cudaMemcpy(d_x, h_x, N*sizeof(float) , cudaMemcpyHostToDevice); 
    cudaMemcpy(d_temp, h_x, N*sizeof(float) , cudaMemcpyHostToDevice); 
          
    // Launch the kernel
    
    int numBlocks = (N+THREADS_PER_BLOCK-1) / (THREADS_PER_BLOCK);
    int numThreads=(THREADS_PER_BLOCK);
   
    power_iteration<<<numBlocks, numThreads>>>(d_A,d_x,d_temp);
    // Synchronize and handle the result
    cudaDeviceSynchronize();
    
    cudaMemcpy(h_x, d_x, N*sizeof(float), cudaMemcpyDeviceToHost); 
    
    for(int i = 0 ; i<N ; i++)
    {
        printf("%f ", h_x[i]);
    }
    printf("\n");
    
    cudaFree(d_x);
    cudaFree(d_temp);
    free(h_x); 
    cudaFree(d_A);
    free(h_A);
    
    return 0;
}
Here I have performed matrix-vector multiplication and am running through the loop until the error is less than the tolerance. I get good results for N < THREADS_PER_BLOCK but when I exceed it, I get wrong results.
 
    