CUDA : program which doesn't work with every size

Question

I'm working on a 3D Laplacian. My code is successful with the size N=32 but with N=64 or N=128 I've some incorrect results:

#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include"res3dcb.cuh"
#include <math.h>
using namespace std;

// Let's start the main program.
int main(void) {

// Choice of N.
int N;
cout<<"Choose matrix dimension (32, 64 or 128)"<<endl;
cin>>N;
int size=(N+2)*(N+2)*(N+2)*sizeof(float);

// Variable statement.
struct timeval t1, t2;
float *x_d, *y_d; 
float *x,*y; 
float gflops;
float NumOps;


//Init x and y.
x = new float[size];
y = new float[size];

for (int i=1;i<N+1;i++)
    for (int j=1;j<N+1;j++) 
        for (int k=1;k<N+1;k++) { 
            x[i*(N+2)*(N+2)+j*(N+2)+k]=1;
        }

// Shadow cases.
for (int i=1;i<N+1;i++) {
    for (int j=1;j<N+1;j++) { 
        x[i*(N+2)*(N+2)+j*(N+2)]=x[i*(N+2)*(N+2)+j*(N+2)+1]; 
        x[i*(N+2)*(N+2)+j*(N+2)+N+1]=x[i*(N+2)*(N+2)+j*(N+2)+N];
    }

for (int k=0;k<N+2;k++) { 
    x[i*(N+2)*(N+2)+k]=x[i*(N+2)*(N+2)+(N+2)+k]; 
    x[i*(N+2)*(N+2)+(N+1)*(N+2)+k]=x[i*(N+2)*(N+2)+N*(N+2)+k];}
}

for (int j=0;j<N+2;j++) 
    for (int k=0;k<N+2;k++) {
        x[(N+2)*j+k]=x[(N+2)*(N+2)+(N+2)*j+k];
        x[(N+1)*(N+2)*(N+2)+(N+2)*j+k]=x[(N+2)*(N+2)*N+(N+2)*j+k];
    }

// Display of initial matrix.
int id_stage=-2;
while (id_stage!=-1) {
    cout<<"Which initial matrix's stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
cin>>id_stage;
cout<<endl;

if (id_stage != -1) {
    cout<<"Etage "<<id_stage<<" du cube :"<<endl;
    for (int j=0;j<N+2;j++) {
        cout<<"| ";
        for (int k=0;k<N+2;k++) {cout<<x[id_stage*(N+2)*(N+2)+j*(N+2)+k]<<" ";}
        cout<<"|"<<endl;
    }
    cout<<endl;
    }
}


// CPU to GPU.
cudaMalloc( (void**) & x_d, size);
cudaMalloc( (void**) & y_d, size);

cudaMemcpy(x_d, x, size, cudaMemcpyHostToDevice) ;
cudaMemcpy(y_d, y, size, cudaMemcpyHostToDevice) ;

// Solver parameters.
dim3 dimGrid(N/32, N/8, N/8);
dim3 dimBlock(16, 8, 8);


// Solver loop.
gettimeofday(&t1, 0);
res3d<<<dimGrid, dimBlock>>>(x_d, y_d, N); 
cudaDeviceSynchronize();
gettimeofday(&t2, 0);
double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0;


// Power calculation.
NumOps=(1.0e-9)*N*N*N*7;
gflops = ( NumOps / (time));

// GPU to CPU.
cudaMemcpy(y, y_d, size, cudaMemcpyDeviceToHost);
cudaFree(x_d);
cudaFree(y_d);

// Display of final matrix.
id_stage=-2;
while (id_stage!=-1) {
    cout<<"Which output's stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
    cin>>id_stage;
    cout<<endl;

if (id_stage != -1) {
    cout<<"Etage "<<id_stage<<" du cube :"<<endl;
    for (int j=0;j<N+2;j++) {
        cout<<"| ";
        for (int k=0;k<N+2;k++) {cout<<y[id_stage*(N+2)*(N+2)+j*(N+2)+k]<<" ";}
        cout<<"|"<<endl;
    }
    cout<<endl;
}
}



cout<<"Time : "<<time<<endl;
cout<<"Gflops/s : "<<gflops<<endl;
}

Where :

#ifndef RES2D_MAT_GPU_HPP
#define RES2D_GPU_HPP
#include <iostream>
#include <sys/time.h>
#include <cuda.h>

__global__ void res3d(volatile float* x, float* y, int N) 
{
// Variable statement.
__shared__ float sdata[18][10][10];
__shared__ float idata[18][10][10];

int tid = threadIdx.x+1;
int tjd = threadIdx.y+1;
int tkd = threadIdx.z+1;
int i = threadIdx.x + blockIdx.x*(blockDim.x)+1;
int j = threadIdx.y + blockIdx.y*(blockDim.y)+1;
int k = threadIdx.z + blockIdx.z*(blockDim.z)+1;

// Overloading of shared variable's outlines.
float data=0,data1=0;

if (threadIdx.x==0) {
    data += x[(N+2)*(N+2)*(i-1)+(N+2)*j+k];
    data1 += x[(N+2)*(N+2)*(i-1)+(N+2)*j+k+N*(N+2)*(N+2)/2];
}
if (threadIdx.x==15) {
    data += x[(N+2)*(N+2)*(i+1)+(N+2)*j+k];
    data1 += x[(N+2)*(N+2)*(i+1)+(N+2)*j+k+N*(N+2)*(N+2)/2];
}
if (threadIdx.y==0) {
    data += x[(N+2)*(N+2)*i+(N+2)*(j-1)+k];
    data1 += x[(N+2)*(N+2)*i+(N+2)*(j-1)+k+N*(N+2)*(N+2)/2];
}
if (threadIdx.y==7) {
    data += x[(N+2)*(N+2)*i+(N+2)*(j+1)+k]; 
    data1 += x[(N+2)*(N+2)*i+(N+2)*(j+1)+k+N*(N+2)*(N+2)/2];    
}   
if (threadIdx.z==0) {
    data += x[(N+2)*(N+2)*i+(N+2)*j+k-1];
    data1 += x[(N+2)*(N+2)*i+(N+2)*j+k-1+N*(N+2)*(N+2)/2];  
}   
if (threadIdx.z==7) {
     data += x[(N+2)*(N+2)*i+(N+2)*j+k+1];
     data1 += x[(N+2)*(N+2)*i+(N+2)*j+k+1+N*(N+2)*(N+2)/2]; 
}

// Init shared variable.
sdata[tid][tjd][tkd] = x[(N+2)*(N+2)*i+(N+2)*j+k];
idata[tid][tjd][tkd]=x[(N+2)*(N+2)*i+(N+2)*j+k+N*(N+2)*(N+2)/2];

__syncthreads();

// (small) tiling.
y[(N+2)*(N+2)*i+(N+2)*j+k] = sdata[tid][tjd+1][tkd] 
               + sdata[tid][tjd-1][tkd] 
               + sdata[tid][tjd][tkd+1] 
               + sdata[tid][tjd][tkd-1] 
               + sdata[tid+1][tjd][tkd] 
               + sdata[tid-1][tjd][tkd] 
               - 6*sdata[tid][tjd][tkd]+data; 

y[(N+2)*(N+2)*i+(N+2)*j+k+N*(N+2)*(N+2)/2] = idata[tid][tjd+1][tkd] 
               + idata[tid][tjd-1][tkd] 
               + idata[tid][tjd][tkd+1] 
               + idata[tid][tjd][tkd-1] 
               + idata[tid+1][tjd][tkd] 
               + idata[tid-1][tjd][tkd] 
               - 6*idata[tid][tjd][tkd]+data1;

}
#endif

Questions :

Is my code erroneous? Or is it a problem from GPU's architecure if results are false with N=64 and N=128?
Does "if" is the good way to overloading shared variable's outlines?

Thanks in advance for your help.

What so you mean with 'the results are false'? A data structure contains unexpected data? which one? At which stage? — Lorenzo Belli, Sep 30 '15 at 11:26
In fact, I'm supposed to have only 0 as result. When N=32 it's the case, but if N=64 or 128 some 1 and 2 appears, like that : [link](http://img11.hostingpics.net/pics/358254Capturedu20150930133216.jpg). An exemple for N=128 — Loïc Madiès, Sep 30 '15 at 11:34
[Proper CUDA error checking?](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) — Jez, Sep 30 '15 at 11:52
When you call `new` for `x` and `y`, why do you give it `size` that only makes sense for `malloc`? — downhillFromHere, Sep 30 '15 at 11:58

Paul R · Answer 1 · 2015-09-30T12:08:15.853

2

You have a mistake here:

dim3 dimGrid(N/32, N/8, N/8);
dim3 dimBlock(16, 8, 8);

This should be:

dim3 dimGrid(N/16, N/8, N/8);
dim3 dimBlock(16, 8, 8);

Also, as noted in the comments, you are over-allocating memory here:

x = new float[size];
y = new float[size];

since size has been calculated in bytes, not elements.

edited Sep 30 '15 at 12:08

answered Sep 30 '15 at 12:01

Paul R

208,748
37
389
560

score 0 · Answer 2 · answered Oct 09 '15 at 12:19

Well, I found the mistake. DimGrid and DimBlock wasn't wrong because I was tilling on the x axis.

The mistake is my "if" inside the global kernel. Here is an algorithm with better performance and right results :

#include <assert.h>
#include <stdio.h>
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include <math.h>
#include"reslap3D.cu"
using namespace std;

// Let's start the main program.
int main(void) {

// Variable statement.
struct timeval t1, t2;
float gflops;
float NumOps;
double time;
long int N=128;
int size=(N+2);
int size3=size*size*size*sizeof(float);
float *x = new float[size3];
float *y = new float[size3];
float *d_x;
float *d_y;

//Init x.
for (int i=1;i<N+1;i++) 
    for (int j=1;j<N+1;j++) 
        for (int k=1;k<N+1;k++)
            x[size*size*i+size*j+k]=cos(k);

// Shadow cells.
for (int i=1;i<N+1;i++) {
    for (int j=1;j<N+1;j++) { x[i*(N+2)*(N+2)+j*(N+2)]=x[i*(N+2)*(N+2)+j*(N+2)+1]; x[i*(N+2)*(N+2)+j*(N+2)+N+1]=x[i*(N+2)*(N+2)+j*(N+2)+N];}

    for (int k=0;k<N+2;k++) { x[i*(N+2)*(N+2)+k]=x[i*(N+2)*(N+2)+(N+2)+k]; x[i*(N+2)*(N+2)+(N+1)*(N+2)+k]=x[i*(N+2)*(N+2)+N*(N+2)+k];}
}

// CPU to GPU.
cudaMalloc((void **) &d_x, size3);
cudaMalloc((void **) &d_y, size3);
cudaMemcpy(d_x, x, size3, cudaMemcpyHostToDevice);
cudaMemcpy(d_y, y, size3, cudaMemcpyHostToDevice);

// Solver parameters.
dim3 dimBlock(2, 2, 64);
dim3 dimGrid(64, 64);

// Solver loop.
gettimeofday(&t1, 0);
kernel1 <<<dimGrid, dimBlock>>> (d_x, d_y, size, N);
cudaDeviceSynchronize();
gettimeofday(&t2, 0);
time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0;

// GPU to CPU.  
cudaMemcpy(y, d_y, size3, cudaMemcpyDeviceToHost);
cudaFree(d_x);
cudaFree(d_y);

// Power calculation.
NumOps=(1.0e-9)*N*N*N*7;
gflops = ( NumOps / (time));

// Display of final matrix.
int id_stage=-2;
while (id_stage!=-1) {
    cout<<"Which output's stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
    cin>>id_stage;
    cout<<endl;

    if (id_stage != -1) {
        cout<<"Stage "<<id_stage<<" of cube :"<<endl;
        for (int j=0;j<N+2;j++) {
            cout<<"| ";
            for (int k=0;k<N+2;k++) {cout<<y[id_stage*(N+2)*(N+2)+j*(N+2)+k]<<" ";}
            cout<<"|"<<endl;
        }
        cout<<endl;
    }
}

// Display of performances.
cout<<"Time : "<<time<<endl;
cout<<"Gflops/s : "<<gflops<<endl;      
}

With reslap3D.cu :

#define D(x,y,z) size*size*(x)+size*(y)+z
__global__ void kernel1(float *x, float *y, int size, int N)
{
__shared__ float sdata0[4][4][66];
__shared__ float sdata64[4][4][66];

int c0 = blockIdx.x*blockDim.x + threadIdx.x+1;
int c1 = blockIdx.y*blockDim.y + threadIdx.y+1;
int c2 = threadIdx.z+1;
int i = threadIdx.x+1, j = threadIdx.y+1, k = threadIdx.z+1;

if (threadIdx.x == 0) 
{ sdata0[i-1][j][k] = x[D(c0-1,c1,c2)];     
  sdata64[i-1][j][k] = x[D(c0-1,c1,c2+64)]; 
}
if (threadIdx.x == 1) 
{ sdata0[i+1][j][k] = x[D(c0+1,c1,c2)];     
  sdata64[i+1][j][k] = x[D(c0+1,c1,c2+64)]; 
}

if (threadIdx.y == 0) 
{ sdata0[i][j-1][k] = x[D(c0,c1-1,c2)]; 
  sdata64[i][j-1][k] = x[D(c0,c1-1,c2+64)]; 
}
if (threadIdx.y == 1) 
{ sdata0[i][j+1][k] = x[D(c0,c1+1,c2)]; 
  sdata64[i][j+1][k] = x[D(c0,c1+1,c2+64)]; 
}

if (threadIdx.z == 0) 
{ sdata0[i][j][k-1] = x[D(c0,c1,c2-1)]; 
  sdata64[i][j][k-1] = x[D(c0,c1,c2+63)]; 
}
if (threadIdx.z == 63) 
{ sdata0[i][j][k+1] = x[D(c0,c1,c2+1)]; 
  sdata64[i][j][k+1] = x[D(c0,c1,c2+65)]; 
}


sdata0[i][j][k] = x[D(c0,c1,c2)];

sdata64[i][j][k] = x[D(c0,c1,c2+64)];

__syncthreads();

             y[D(c0, c1, c2)] = sdata0[i+1][j][k]
                      + sdata0[i-1][j][k]
                          + sdata0[i][j+1][k] 
                      + sdata0[i][j-1][k]
                      + sdata0[i][j][k+1] 
                      + sdata0[i][j][k-1]

                      - 6 * sdata0[i][j][k];

             y[D(c0, c1, c2+64)] = sdata64[i+1][j][k]
                      + sdata64[i-1][j][k]
                          + sdata64[i][j+1][k] 
                      + sdata64[i][j-1][k]
                      + sdata64[i][j][k+1] 
                      + sdata64[i][j][k-1]

                      - 6 * sdata64[i][j][k];

}

CUDA : program which doesn't work with every size

2 Answers2