So I just started writing CUDA, following the An Even Easier Introduction to CUDA guide, and so far so good. Then I wanted to implement a neural network, which leads to quite a few calls to cudaMallocManaged(). To keep things readable, I moved these calls into a separate function called allocateStuff() (see the code below). When I then run the program under nvprof, it does not show the GPU time of layerInit() and instead prints the following warning:
Warning: 1 records have invalid timestamps due to insufficient device buffer space. You can configure the buffer space using the option --device-buffer-size.
However, when I put the code from allocateStuff() directly in main(), the warning disappears and the GPU time for layerInit() is shown. So my question is: what am I doing wrong in this function, or, failing that, what could be the reason that it (apparently) overflows the device buffer?
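(I assume I could simply enlarge the buffer, presumably with something along the lines of nvprof --device-buffer-size 16 ./executable as the warning suggests, but for a network this small that feels like treating the symptom rather than the cause.)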
Code:
#include <cuda_profiler_api.h>
#include <iostream>
#include <vector>
__global__
void layerInit(const unsigned int firstNodes,
               const unsigned int secondNodes,
               const unsigned int resultNodes,
               float *firstLayer,
               float *secondLayer,
               float *resultLayer) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
   for (unsigned int i = index; i < firstNodes; i += stride) {
      firstLayer[i] = 0.0f;
   }
   for (unsigned int i = index; i < secondNodes; i += stride) {
      secondLayer[i] = 0.0f;
   }
   for (unsigned int i = index; i < resultNodes; i += stride) {
      resultLayer[i] = 0.0f;
   }
}
void allocateStuff(const unsigned int firstNodes,
                   const unsigned int secondNodes,
                   const unsigned int resultNodes,
                   float *firstLayer,
                   float *secondLayer,
                   float *resultLayer,
                   std::vector<float*> &firstWeightLayer,
                   std::vector<float*> &secondWeightLayer) {
   cudaMallocManaged(&firstLayer,  firstNodes  * sizeof(float));
   cudaMallocManaged(&secondLayer, secondNodes * sizeof(float));
   cudaMallocManaged(&resultLayer, resultNodes * sizeof(float));
   for (auto& nodeLayer : firstWeightLayer) {
      cudaMallocManaged(&nodeLayer, secondNodes * sizeof(float));
   }
   for (auto& nodeLayer : secondWeightLayer) {
      cudaMallocManaged(&nodeLayer, resultNodes * sizeof(float));
   }
}
template<typename T, typename... Args>
void freeStuff(T *t) {
   cudaFree(t);
}
template<typename T, typename... Args>
void freeStuff(T *t, Args... args) {
   freeStuff(&t);
   freeStuff(args...);
}
void freeStuff(std::vector<float*> &vec) {
   for (auto& v : vec) {
      freeStuff(&v);
   }
}
int main () {
   unsigned int firstNodes = 5, secondNodes = 3, resultNodes = 1;
   float *firstLayer = new float[firstNodes];
   float *secondLayer = new float[secondNodes];
   float *resultLayer = new float[resultNodes];
   std::vector<float*> firstWeightLayer(firstNodes, new float[secondNodes]);
   std::vector<float*> secondWeightLayer(secondNodes, new float[resultNodes]);
   allocateStuff(firstNodes, secondNodes, resultNodes,
                 firstLayer, secondLayer, resultLayer,
                 firstWeightLayer,secondWeightLayer);
   layerInit<<<1,256>>>(firstNodes,
                        secondNodes,
                        resultNodes,
                        firstLayer,
                        secondLayer,
                        resultLayer);
   cudaDeviceSynchronize();
   freeStuff(firstLayer, secondLayer, resultLayer);
   freeStuff(firstWeightLayer);
   freeStuff(secondWeightLayer);
   cudaProfilerStop();
   return 0;
}
Output of nvprof ./executable with the allocateStuff() function:  
==18608== NVPROF is profiling process 18608, command: ./executable
==18608== Profiling application: ./executable
==18608== Warning: 1 records have invalid timestamps due to insufficient device buffer space. You can configure the buffer space using the option --device-buffer-size.
==18608== Profiling result:
No kernels were profiled.
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
      API calls:   96.20%  105.47ms        11  9.5884ms  5.7630us  105.39ms  cudaMallocManaged
      ...
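For comparison, the variant without allocateStuff() is roughly the following: the cudaMallocManaged() calls are done directly in main(), with the kernel and freeStuff() unchanged (a sketch of what I ran, not the exact file):
int main () {
   unsigned int firstNodes = 5, secondNodes = 3, resultNodes = 1;
   float *firstLayer = new float[firstNodes];
   float *secondLayer = new float[secondNodes];
   float *resultLayer = new float[resultNodes];
   std::vector<float*> firstWeightLayer(firstNodes, new float[secondNodes]);
   std::vector<float*> secondWeightLayer(secondNodes, new float[resultNodes]);
   // the allocations formerly in allocateStuff(), now inlined
   cudaMallocManaged(&firstLayer,  firstNodes  * sizeof(float));
   cudaMallocManaged(&secondLayer, secondNodes * sizeof(float));
   cudaMallocManaged(&resultLayer, resultNodes * sizeof(float));
   for (auto& nodeLayer : firstWeightLayer) {
      cudaMallocManaged(&nodeLayer, secondNodes * sizeof(float));
   }
   for (auto& nodeLayer : secondWeightLayer) {
      cudaMallocManaged(&nodeLayer, resultNodes * sizeof(float));
   }
   layerInit<<<1,256>>>(firstNodes, secondNodes, resultNodes,
                        firstLayer, secondLayer, resultLayer);
   cudaDeviceSynchronize();
   freeStuff(firstLayer, secondLayer, resultLayer);
   freeStuff(firstWeightLayer);
   freeStuff(secondWeightLayer);
   cudaProfilerStop();
   return 0;
}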
Output of nvprof ./executable without said function:  
==18328== NVPROF is profiling process 18328, command: ./executable
==18328== Profiling application: ./executable
==18328== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:  100.00%  2.2080us         1  2.2080us  2.2080us  2.2080us  layerInit(unsigned int, unsigned int, unsigned int, float*, float*, float*)
      API calls:   99.50%  114.01ms        11  10.365ms  4.9390us  113.95ms  cudaMallocManaged
      ...
Compile command: nvcc -std=c++11 -g -o executable main.cu