Incorrect addition of Prime numbers in CUDA

Question

I use reduction logic in code by referring How to find the sum of array in CUDA by reduction.

But It is giving some errors. I am not getting my mistake, could you please help me out??

required specification: 1.Cuda toolkit v6.5 2. graphics: GTX 210 (compute capability 1.2) 3. visual studio 2013

#include<stdio.h>
#include<cuda.h>
#include<malloc.h>
#include<conio.h>
#include<time.h>
#include<windows.h>

#define SIZE 10
#define N 100

__global__ void vectoreAdd(int *d_a, int *d_b, int *d_c)
{
    __shared__ int sdata[256];

    int i = threadIdx.x + (blockIdx.x*blockDim.x);

    sdata[threadIdx.x] = d_a[i];
    __syncthreads();

    if (i<SIZE)

    for (i = 2; i<SIZE; i++)
    {
        int counter = 0;
        for (int j = 2; j<d_a[i]; j++)
        {
            if (d_a[i] % j == 0)
            {
            counter = 1; break;
            }
        }
if (counter == 0)
        {
            d_b[i] = d_a[i];
        }

    }
    // do reduction in shared mem
    for (int s = 1; s < blockDim.x; s *= 2)
    {
        int index = 2 * s * threadIdx.x;;

        if (index < blockDim.x)
        {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }

    // write result for this block to global mem
    if (threadIdx.x == 0)
        atomicAdd(d_c, sdata[0]);
}

}
int main()
{
    clock_t tic = clock();
    int *a, *b, *summation=0, sum = 0,count=-1;       //declare summation as double/long if needed
    int *d_a, *d_b, *d_c;

    //int blocks, block_size = 512;

    int size = N * sizeof(int); 

    a = (int *)malloc(SIZE*sizeof(int));
    b = (int *)malloc(SIZE*sizeof(int));
    summation = (int *)malloc(SIZE*sizeof(int));


    cudaMalloc((void**)&d_a, SIZE * sizeof(int));
    cudaMalloc((void**)&d_b, SIZE * sizeof(int));
    cudaMalloc((void**)&d_c, SIZE * sizeof(int));


    for (int i = 1; i<SIZE; i++)
    {
        a[i] = i;
        b[i] = 0;

    }

    cudaMemcpy(d_a, a, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_c, c, SIZE*sizeof(int), cudaMemcpyHostToDevice);
    /*blocks = SIZE / block_size;
    if (SIZE% block_size != 0)
        blocks++;   */

    dim3 blocksize(256); // create 1D threadblock
    dim3 gridsize(N / blocksize.x);  //create 1D grid

    vectoreAdd << < gridsize, blocksize >> >(d_a, d_b, d_c);


    //cudaThreadSynchronize();

    cudaMemcpy(b, d_b, SIZE*sizeof(int), cudaMemcpyDeviceToHost); 
    cudaMemcpy(summation, d_c, SIZE*sizeof(int), cudaMemcpyDeviceToHost);

    for (int m = 0; m < SIZE; m++)
    {
        if (b[m] != 0)
        {
            printf("\n prime no is:%d", b[m]);
            count = count + 1;
        }
    }
    printf("\n\n Total prime no. are: %d", count);
/*      for (int j = 1; j<SIZE; j++)
    {
        sum = sum + b[j];
    }*/

    printf("\n \nsum of all prime no upto %d is:%d", SIZE, summation);

    clock_t toc = clock();
    printf("\n\nElapsed: %f seconds\n", (double)(toc - tic) / CLOCKS_PER_SEC);

    free(a);    free(b);    free(summation);
    cudaFree(d_a);      cudaFree(d_b);      cudaFree(d_c);

    getchar();  return 0;
}

What is the error? Most probably the problem is due to the fact that the value of `N` is too small (smaller than `blocksize.x`) which eventually results in creating a grid of size `0` due to integer division ( `100 / 256` is equal to `0` ). — sgarizvi, Oct 19 '16 at 11:22
You should always use [proper CUDA error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) any time you are having trouble with a CUDA code, and run your code with `cuda-memcheck`, *before* asking for help. Even if you don't understand the error output, it will be useful for others trying to help you. — Robert Crovella, Oct 19 '16 at 11:47
in above program I used only one kernel so it was confusing for that whether to do reduction addition or find prime no..so I create two different kernels in modified code — Rupali, Oct 20 '16 at 06:40
I modify my code using two kernels http://stackoverflow.com/questions/40147773/prime-no-addition-program-in-cuda — Rupali, Oct 20 '16 at 07:14

score -1 · Answer 1 · edited Jun 20 '20 at 09:12

-1

There are lots of mistakes in your code :

cudaMalloc((void**)&d_a, SIZE * sizeof(int));

should be :

cudaMalloc((void**)&d_a, N * sizeof(int)); //OR

cudaMalloc((void**)&d_a, size);

as you already calculated but didnt passed it. same in case of malloc() //Host code

edited Jun 20 '20 at 09:12

Community

1
1

answered Oct 20 '16 at 07:22

Akshay Mahajan

1
4

Incorrect addition of Prime numbers in CUDA

1 Answers1