I am new to C++/CUDA. I tried implementing the parallel algorithm "reduce" so that it can handle any input size and any number of threads without increasing the asymptotic parallel runtime, by recursing over the output of the kernel (in the kernel wrapper).
For example, in the top answer to Implementing Max Reduce in Cuda, the implementation essentially becomes sequential when the number of threads is small enough relative to the input size.
However, I keep getting a segmentation fault when I compile and run it:
>> nvcc -o mycode mycode.cu
>> ./mycode
Segmentation fault.
This was compiled and run on a K40 with CUDA 6.5.
Here is the kernel. It is basically the same as in the SO post I linked, except that the check for "out of bounds" is different:
#include <stdio.h>
/* -------- KERNEL -------- */
__global__ void reduce_kernel(float * d_out, float * d_in, const int size)
{
  // position and threadId
  int pos = blockIdx.x * blockDim.x + threadIdx.x;
  int tid = threadIdx.x;
  // do reduction in global memory
  for (unsigned int s = blockDim.x / 2; s>0; s>>=1)
  {
    if (tid < s)
    {
      if (pos+s < size) // Handling out of bounds
      {
        d_in[pos] = d_in[pos] + d_in[pos+s];
      }
    }
  }
  // only thread 0 writes this block's result
  if (tid==0)
  {
    d_out[blockIdx.x] = d_in[pos];
  }
}
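To trace the intended tree reduction: with blockDim.x = 8, s takes the values 4, 2, 1. First, threads 0-3 each add the element 4 positions ahead of them, then threads 0-1 add the element 2 positions ahead, and finally thread 0 adds its immediate neighbour, so thread 0 should end up holding the sum of its block's 8 elements (when everything is in bounds).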
Here is the kernel wrapper I mentioned, which handles the case where one block cannot contain all of the data:
/* -------- KERNEL WRAPPER -------- */
void reduce(float * d_out, float * d_in, const int size, int num_threads)
{
  // setting up blocks and intermediate result holder
  int num_blocks = ((size) / num_threads) + 1;
  float * d_intermediate;
  cudaMalloc(&d_intermediate, sizeof(float)*num_blocks);
  // reducing recursively; this runs approximately log base num_threads of size times
  do
  {
    reduce_kernel<<<num_blocks, num_threads>>>(d_intermediate, d_in, size);
    // updating input to intermediate
    cudaMemcpy(d_in, d_intermediate, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice);
    // Updating num_blocks to reflect how many blocks we now want to compute on
    num_blocks = num_blocks / num_threads + 1;
    // updating intermediate
    cudaMalloc(&d_intermediate, sizeof(float)*num_blocks);
  }
  while(num_blocks > num_threads); // if it is too small, compute rest.
  // computing rest
  reduce_kernel<<<1, num_blocks>>>(d_out, d_in, size);
}
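Side note: this version has no error checking at all. For reference, the usual pattern I have seen for checking CUDA API calls and kernel launches looks roughly like the sketch below; the macro name gpuErrchk is just a placeholder and is not part of my code.
/* -------- ERROR CHECKING (sketch, not in my code) -------- */
#include <stdio.h>
#include <stdlib.h>
// Wraps any call returning cudaError_t and aborts with file/line info on failure.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char * file, int line)
{
  if (code != cudaSuccess)
  {
    fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    exit(code);
  }
}
// usage: gpuErrchk(cudaMalloc(&d_intermediate, sizeof(float)*num_blocks));
// after a kernel launch: gpuErrchk(cudaGetLastError());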
Finally, the main program initializes the in/out buffers and creates some bogus data for testing:
/* -------- MAIN -------- */
int main(int argc, char **argv)
{
  // Setting num_threads
  int num_threads = 512;
  // Making bogus data and setting it on the GPU
  const int size = 1024;
  const int size_out = 1;
  float * d_in;
  float * d_out;
  cudaMalloc(&d_in, sizeof(float)*size);
  cudaMalloc(&d_out, sizeof(float)*size_out);
  const int value = 5;
  cudaMemset(d_in, value, sizeof(float)*size);
  // Running kernel wrapper
  reduce(d_out, d_in, size, num_threads);
  printf("sum of elements is: %.f\n", d_out[0]);
}
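To trace the launch configuration for these values: size = 1024 and num_threads = 512 give num_blocks = 1024/512 + 1 = 3, so the do-loop launches reduce_kernel<<<3, 512>>> once, copies 3 floats back into d_in, and recomputes num_blocks = 3/512 + 1 = 1; since 1 > 512 is false, the loop exits and the final call is reduce_kernel<<<1, 1>>>(d_out, d_in, size). As far as I can tell, that should write the final sum into d_out[0].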